# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

In [2]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install pymystem3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install gensim==4.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.3.0
  Downloading gensim-4.3.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting FuzzyTM>=0.4.0
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.1/67.1 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting fst-pso
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting simpful
  Downloading simpful-2.10.0-py3-none-any.whl (31 kB)
Collecting miniful
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fst-pso, miniful
  Building wheel for fst-pso (

# 1. Modules and functions

In [5]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
gensim.__version__

'4.3.0'

## 1.1. Helper functions to avoid copypaste

In [7]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [10]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [11]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [12]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [18]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [19]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [20]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [21]:
sample

Unnamed: 0,model_index,id,original_title,overview
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...,...
45461,45461,439050,رگ خواب,Rising and falling between a man and woman.
45462,45462,111109,Siglo ng Pagluluwal,An artist struggles to finish his work while a...
45463,45463,67758,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,45464,227506,Satana likuyushchiy,"In a small town live two brothers, one a minis..."


In [22]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [34]:
movies_inv_mapper

{'toy story': 0,
 'jumanji': 1,
 'grumpier old men': 2,
 'waiting to exhale': 3,
 'father of the bride part ii': 4,
 'heat': 29042,
 'sabrina': 888,
 'tom and huck': 7,
 'sudden death': 8,
 'goldeneye': 9,
 'the american president': 10,
 'dracula: dead and loving it': 11,
 'balto': 12,
 'nixon': 13,
 'cutthroat island': 14,
 'casino': 15,
 'sense and sensibility': 41042,
 'four rooms': 17,
 'ace ventura: when nature calls': 18,
 'money train': 19,
 'get shorty': 20,
 'copycat': 21,
 'assassins': 22,
 'powder': 23,
 'leaving las vegas': 24,
 'othello': 21274,
 'now and then': 26,
 'persuasion': 40837,
 'la cité des enfants perdus': 28,
 '摇啊摇，摇到外婆桥': 29,
 'dangerous minds': 30,
 'twelve monkeys': 31,
 'guillaumet, les ailes du courage': 32,
 'babe': 33,
 'carrington': 34,
 'dead man walking': 35,
 'across the sea of time': 36,
 'it takes two': 29129,
 'clueless': 38,
 'cry, the beloved country': 26667,
 'richard iii': 17719,
 'dead presidents': 41,
 'restoration': 38571,
 'mortal kombat'

In [23]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_corpus[:1]

["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."]

In [24]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [26]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['trapped', 'siblings', 'inside', 'running', 'hope', 'board', 'judy', 'invite', 'adult', 'opens', 'risky', 'years', 'alan', 'freedom', 'proves', 'terrifying', 'door', 'world', 'enchanted', 'monkeys', 'finish', 'creatures', 'discover', 'game', 'living', 'three', 'giant', 'room', 'rhinoceroses', 'magical', 'unwittingly', 'find', 'evil', 'peter'], tags=['1'])

# 2.2. Model Training and Evaluation

In [27]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [28]:
# initialize
model = gensim.models.doc2vec.Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [31]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)



In [32]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [35]:
# get id
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [36]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [37]:
movie_embeddings = movies_vectors[movie_id]

In [38]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,5713,0.963857
2,13835,0.963388
3,18468,0.960964
4,19227,0.955975


In [40]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [41]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,5713,0.963857,rollover
2,13835,0.963388,k2
3,18468,0.960964,the incredible petrified world
4,19227,0.955975,carbon nation
5,7772,0.953099,this island earth
6,43461,0.952637,megafault
7,29872,0.952344,angels die hard
8,43165,0.951495,the zookeeper's wife
9,35181,0.951366,конек-горбунок


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

In [42]:
second_sample = movies_metadata[['id', 'original_title', 'overview', 'tagline']].copy()
second_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
 3   tagline         20412 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


In [43]:
second_sample.loc[second_sample['overview'].isnull(), 'overview'] = second_sample.loc[second_sample['overview'].isnull(), 'original_title']
second_sample.loc[second_sample['tagline'].isnull(), 'tagline'] = second_sample.loc[second_sample['tagline'].isnull(), 'original_title']
sample.isnull().sum()

model_index       0
id                0
original_title    0
overview          0
dtype: int64

In [152]:
second_sample = second_sample.reset_index().rename(columns = {'index': 'model_index'})
second_sample['model_index'] = second_sample['model_index'].astype(str)

In [153]:
second_sample.head()

Unnamed: 0,model_index,model_index.1,id,original_title,overview,tagline,merged
0,0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...
2,2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,A family wedding reignites the ancient feud be...
3,3,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"Cheated on, mistreated and stepped on, the wom..."
4,4,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Just when George Banks has recovered from his ...


# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [154]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [155]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

In [62]:
second_sample['merged'] = second_sample['overview'] + second_sample['tagline']
new_corpus = get_clean_tags_array(second_sample, 'merged')

In [70]:
movies_inv_mapper = dict(zip(second_sample['original_title'].str.lower(), second_sample['model_index'].astype(int)))

In [76]:
new_model = train_embeddings(new_corpus)

In [173]:
def get_recommendations(movies : list,
                        model):
  movies_vectors = model.dv.vectors

  frames = []
  final_out = pd.DataFrame(data = None, columns= ['model_index',  'model_score', 'title_name'])
  for m in movies:
    movie_id = movies_inv_mapper[m]
    movie_embeddings = movies_vectors[movie_id]

    similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
    output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])

    name_mapper = {v: k for k, v in movies_inv_mapper.items()}

    output['title_name'] = output['model_index'].astype(int).map(name_mapper)
    frames.append(output)
    new_df = pd.concat(frames)
    new_df = new_df.sort_values(by=['model_score'], ascending=False, ignore_index=True).reset_index(drop=True)
  
  new_df = new_df.loc[np.r_[len(movies):20 + len(movies)], :].reset_index(drop=True)

  return new_df

In [176]:
# pass film name to the function as a list
recs = get_recommendations(['batman', 'rollover'], new_model)

In [177]:
recs

Unnamed: 0,model_index,model_score,title_name
0,37879,0.955346,let's go navy!
1,41083,0.95526,bon bini holland
2,33414,0.952598,the pie-covered wagon
3,43285,0.951841,win it all
4,19698,0.951358,chasing mavericks
5,19964,0.950943,stand up and fight
6,2411,0.950156,the towering inferno
7,30850,0.949118,aloha scooby-doo!
8,27438,0.949055,ドライブ
9,45136,0.949015,banana
