In [2]:
#pip install gensim -q

In [3]:
# Lets import thr libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
from gensim.models import  Word2Vec
from gensim import downloader as gen_download
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Lets load the data set
data = pd.read_csv(r'movies.csv')

In [5]:
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
data.shape

(9742, 3)

In [7]:
# Check for missing values

data.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [8]:
# Lets convert text to tokens
data['genre_tokens']= data['genres'].str.lower().str.split(pat='|')

In [9]:
data

Unnamed: 0,movieId,title,genres,genre_tokens
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),Comedy,[comedy]
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[action, animation, comedy, fantasy]"
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[animation, comedy, fantasy]"
9739,193585,Flint (2017),Drama,[drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[action, animation]"


In [10]:
embed_model = Word2Vec(sentences=data['genre_tokens'],window=2,sg=1,vector_size=20)

In [11]:
vocabulary  = embed_model.wv.index_to_key

In [12]:
vocabulary

['drama',
 'comedy',
 'thriller',
 'action',
 'romance',
 'adventure',
 'crime',
 'sci-fi',
 'horror',
 'fantasy',
 'children',
 'animation',
 'mystery',
 'documentary',
 'war',
 'musical',
 'western',
 'imax',
 'film-noir',
 '(no genres listed)']

In [13]:
# We need to remove the movies where genre is not provided

data[data['genres']=='(no genres listed)'].shape

(34, 4)

In [14]:
drop_index = data[data['genres']=='(no genres listed)'].index
data.drop(drop_index,inplace=True)
data.reset_index(inplace=True)

In [15]:
# Lets train SkipGram model on our data and genrate word vectors for tokens

embed_model = Word2Vec(sentences=data['genre_tokens'],window=2,sg=1,vector_size=20)

In [16]:
embed_model.wv['children']

array([-0.41108224,  0.29812026,  0.0709483 , -0.05380898,  0.3122989 ,
       -0.00651237,  0.13406953,  0.5435593 , -0.35477185,  0.15928297,
        0.2206962 , -0.25023833, -0.02468278, -0.20791985,  0.28953198,
       -0.04461643,  0.6387501 ,  0.10124215, -0.49341956, -0.08215634],
      dtype=float32)

In [17]:

embed_model.wv.most_similar('children',topn=5)

[('action', 0.9905017614364624),
 ('comedy', 0.9876409769058228),
 ('fantasy', 0.987295925617218),
 ('adventure', 0.9869195818901062),
 ('horror', 0.9868780374526978)]

In [18]:
embed_model.wv.most_similar('romance',topn=5)

[('western', 0.9860139489173889),
 ('musical', 0.9857547283172607),
 ('fantasy', 0.9854267239570618),
 ('adventure', 0.9841161966323853),
 ('thriller', 0.9834862351417542)]

In [19]:
embed_model.wv.most_similar('horror',topn=5)

[('animation', 0.9926390647888184),
 ('mystery', 0.9891402125358582),
 ('fantasy', 0.9888047575950623),
 ('adventure', 0.9887950420379639),
 ('children', 0.9868779182434082)]

In [20]:
embed_model.wv.key_to_index

{'drama': 0,
 'comedy': 1,
 'thriller': 2,
 'action': 3,
 'romance': 4,
 'adventure': 5,
 'crime': 6,
 'sci-fi': 7,
 'horror': 8,
 'fantasy': 9,
 'children': 10,
 'animation': 11,
 'mystery': 12,
 'documentary': 13,
 'war': 14,
 'musical': 15,
 'western': 16,
 'imax': 17,
 'film-noir': 18}

In [None]:
# Lets try some pretrained model (Glove)

#model_glove = gen_download.load('glove-twitter-25')



In [23]:
model_glove['horror']

array([ 0.67846 , -0.26964 ,  0.61851 ,  0.046141,  0.61987 ,  0.051023,
        1.8133  ,  0.8274  ,  0.30019 , -0.5699  , -0.15057 , -0.38452 ,
       -2.1337  , -0.017248,  0.89657 ,  1.1567  ,  0.14458 ,  0.61671 ,
       -0.13564 ,  0.067028, -0.63978 ,  0.69494 , -0.69101 , -0.037431,
        0.98344 ], dtype=float32)

In [24]:
model_glove['documentary']

array([ 0.30368 ,  0.081423,  0.79418 , -0.3667  ,  0.61585 , -0.89648 ,
        1.6116  , -0.74739 ,  0.21872 , -1.4537  ,  0.52975 ,  1.3648  ,
       -2.4574  ,  0.18059 ,  0.35748 ,  0.44751 ,  0.011323,  0.77678 ,
       -0.10042 ,  0.33302 , -0.40845 , -0.24913 , -0.86413 , -1.3932  ,
        0.70769 ], dtype=float32)

In [25]:
# Lets convert these tokens to word embedding

data['genre_embeddings']=data['genre_tokens'].apply(lambda x: [embed_model.wv[w] for w in x])

In [26]:
data

Unnamed: 0,index,movieId,title,genres,genre_tokens,genre_embeddings
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]","[[-0.5143289, 0.33215776, 0.1572167, -0.016035..."
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]","[[-0.5143289, 0.33215776, 0.1572167, -0.016035..."
2,2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]","[[-0.35680535, 0.29324135, 0.1562389, -0.03271..."
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]","[[-0.35680535, 0.29324135, 0.1562389, -0.03271..."
4,4,5,Father of the Bride Part II (1995),Comedy,[comedy],"[[-0.35680535, 0.29324135, 0.1562389, -0.03271..."
...,...,...,...,...,...,...
9703,9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[action, animation, comedy, fantasy]","[[-0.42611626, 0.28422192, 0.101642065, -0.091..."
9704,9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[animation, comedy, fantasy]","[[-0.4753241, 0.26715463, 0.10758995, -0.08804..."
9705,9739,193585,Flint (2017),Drama,[drama],"[[-0.40962023, 0.27601847, 0.14672746, 0.00427..."
9706,9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[action, animation]","[[-0.42611626, 0.28422192, 0.101642065, -0.091..."


In [27]:
data['genre_embeddings'][0]

[array([-0.5143289 ,  0.33215776,  0.1572167 , -0.01603539,  0.34818608,
        -0.09828231,  0.17352735,  0.61839956, -0.34637272,  0.10033538,
         0.26630503, -0.3111355 , -0.13381551, -0.1855704 ,  0.2834204 ,
         0.02408323,  0.6924809 ,  0.08740503, -0.49888048, -0.04660406],
       dtype=float32),
 array([-0.4753241 ,  0.26715463,  0.10758995, -0.08804511,  0.32671514,
        -0.08806099,  0.20822042,  0.5751085 , -0.34729606,  0.14190836,
         0.28311798, -0.29299524, -0.13515098, -0.22451544,  0.20155232,
        -0.01493608,  0.62066513,  0.06214178, -0.44834253, -0.04568424],
       dtype=float32),
 array([-0.41108224,  0.29812026,  0.0709483 , -0.05380898,  0.3122989 ,
        -0.00651237,  0.13406953,  0.5435593 , -0.35477185,  0.15928297,
         0.2206962 , -0.25023833, -0.02468278, -0.20791985,  0.28953198,
        -0.04461643,  0.6387501 ,  0.10124215, -0.49341956, -0.08215634],
       dtype=float32),
 array([-0.35680535,  0.29324135,  0.1562389 , -0.03

In [28]:
data['genre_avg_embeddings'] = data['genre_embeddings'].apply(lambda x: np.mean(x,axis=0))

In [29]:
data

Unnamed: 0,index,movieId,title,genres,genre_tokens,genre_embeddings,genre_avg_embeddings
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]","[[-0.5143289, 0.33215776, 0.1572167, -0.016035...","[-0.44554454, 0.29837385, 0.12681885, -0.05525..."
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]","[[-0.5143289, 0.33215776, 0.1572167, -0.016035...","[-0.46519768, 0.31049106, 0.12342179, -0.05184..."
2,2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]","[[-0.35680535, 0.29324135, 0.1562389, -0.03271...","[-0.34357285, 0.2622468, 0.13655776, -0.014070..."
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]","[[-0.35680535, 0.29324135, 0.1562389, -0.03271...","[-0.36558867, 0.26683736, 0.13994765, -0.00795..."
4,4,5,Father of the Bride Part II (1995),Comedy,[comedy],"[[-0.35680535, 0.29324135, 0.1562389, -0.03271...","[-0.35680535, 0.29324135, 0.1562389, -0.032717..."
...,...,...,...,...,...,...,...
9703,9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[action, animation, comedy, fantasy]","[[-0.42611626, 0.28422192, 0.101642065, -0.091...","[-0.4321069, 0.28645328, 0.12689282, -0.074503..."
9704,9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[animation, comedy, fantasy]","[[-0.4753241, 0.26715463, 0.10758995, -0.08804...","[-0.43410382, 0.28719702, 0.13530974, -0.06881..."
9705,9739,193585,Flint (2017),Drama,[drama],"[[-0.40962023, 0.27601847, 0.14672746, 0.00427...","[-0.40962023, 0.27601847, 0.14672746, 0.004270..."
9706,9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[action, animation]","[[-0.42611626, 0.28422192, 0.101642065, -0.091...","[-0.4507202, 0.2756883, 0.10461601, -0.0898071..."


In [30]:
data['genre_avg_embeddings'][0]

array([-0.44554454,  0.29837385,  0.12681885, -0.0552579 ,  0.31207046,
       -0.06686637,  0.16196528,  0.5520607 , -0.33363104,  0.11867801,
        0.24351458, -0.26678258, -0.08050869, -0.22129646,  0.24361996,
       -0.01075135,  0.6405906 ,  0.05636381, -0.46952447, -0.05180441],
      dtype=float32)

## Recomendation System

In [31]:
# create a similarity matrix

simi_matrix = cosine_similarity(data['genre_avg_embeddings'].to_list())

In [32]:
simi_matrix

array([[0.99999994, 0.9994908 , 0.9958259 , ..., 0.9906257 , 0.9973638 ,
        0.99476284],
       [0.9994908 , 1.        , 0.9946733 , ..., 0.98910356, 0.9958661 ,
        0.99250716],
       [0.9958259 , 0.9946733 , 1.        , ..., 0.9880812 , 0.99160606,
        0.99632967],
       ...,
       [0.9906257 , 0.98910356, 0.9880812 , ..., 0.9999998 , 0.9910584 ,
        0.9863493 ],
       [0.9973638 , 0.9958661 , 0.99160606, ..., 0.9910584 , 0.99999994,
        0.9904575 ],
       [0.99476284, 0.99250716, 0.99632967, ..., 0.9863493 , 0.9904575 ,
        1.0000002 ]], shape=(9708, 9708), dtype=float32)

In [33]:
def recommender(selected_movie,nos=5):
  if selected_movie in data['title'].values:
    idx = data[data['title']==selected_movie].index[0]
    top_n_idx = simi_matrix[idx].argsort()[::-1][1:nos+1]
    for i in top_n_idx:
      print(data['title'].iloc[i])
  else:
      print('Movie not Found')


In [34]:
movie_name = 'Black Butler: Book of the Atlantic (2017)'
data[data['title']==movie_name].index[0]

np.int64(9703)

In [35]:

simi_matrix[9703].argsort()[::-1][1:6]

array([9270, 9431, 8545, 8128, 8722])

In [36]:
recommender('Jumanji (1995)')

Seventh Son (2014)
The Cave of the Golden Rose (1991)
Percy Jackson: Sea of Monsters (2013)
Pan (2015)
Chronicles of Narnia: Prince Caspian, The (2008)
