<a href="https://colab.research.google.com/github/jgroubert14/Movie-Book-Recommendation-System/blob/main/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts

# Train a model with example dataset
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

In [2]:
vector = model.wv['computer']  # get numpy vector of a word
sims = model.wv.most_similar('computer', topn=10)  # get other similar words

print("Word2Vec embedding vector: ", vector)
print("Similar words of 'computer': ", sims)

Word2Vec embedding vector:  [-0.00515774 -0.00667028 -0.0077791   0.00831315 -0.00198292 -0.00685696
 -0.0041556   0.00514562 -0.00286997 -0.00375075  0.0016219  -0.0027771
 -0.00158482  0.0010748  -0.00297881  0.00852176  0.00391207 -0.00996176
  0.00626142 -0.00675622  0.00076966  0.00440552 -0.00510486 -0.00211128
  0.00809783 -0.00424503 -0.00763848  0.00926061 -0.00215612 -0.00472081
  0.00857329  0.00428459  0.0043261   0.00928722 -0.00845554  0.00525685
  0.00203994  0.0041895   0.00169839  0.00446543  0.0044876   0.0061063
 -0.00320303 -0.00457706 -0.00042664  0.00253447 -0.00326412  0.00605948
  0.00415534  0.00776685  0.00257002  0.00811905 -0.00138761  0.00808028
  0.0037181  -0.00804967 -0.00393476 -0.0024726   0.00489447 -0.00087241
 -0.00283173  0.00783599  0.00932561 -0.0016154  -0.00516075 -0.00470313
 -0.00484746 -0.00960562  0.00137242 -0.00422615  0.00252744  0.00561612
 -0.00406709 -0.00959937  0.00154715 -0.00670207  0.0024959  -0.00378173
  0.00708048  0.00064041 

In [3]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys())) # Show a list of available pretrained models

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [4]:
model = gensim.downloader.load("word2vec-google-news-300")



In [5]:
model.most_similar("cat") # Similar words

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

In [6]:
# vector operation on word embeddings
model.most_similar(positive=['woman', 'king'], negative=['man']) # king - man + woman

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

In [7]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')

In [9]:
import pandas as pd

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [10]:
movies = pd.read_csv('ml-100k/u.item', sep='|', usecols=range(2), names=['movie_id', 'title'], encoding='latin-1')
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [11]:
ratings = ratings[ratings.rating >= 3]
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
5,298,474,4,884182806
7,253,465,5,891628467
8,305,451,3,886324817
...,...,...,...,...
99992,721,262,3,877137285
99994,378,78,3,880056976
99995,880,476,3,880175444
99996,716,204,5,879795543


In [12]:
from collections import defaultdict
import networkx as nx

pairs = defaultdict(int)
for group in ratings.groupby("user_id"):
    user_movies = list(group[1]["movie_id"]) # Retrieve the list of movies liked by the current user
    for i in range(len(user_movies)):
            for j in range(i+1, len(user_movies)):
                pairs[(user_movies[i], user_movies[j])] += 1 # increment a counter specific to a pair of movies every time they are seen together in the same list

G = nx.Graph()
for pair in pairs:
    movie1, movie2 = pair
    score = pairs[pair]
    if score >= 20: # if the movie is liked by same users more than 20 times, we consider the connection exist
        G.add_edge(movie1, movie2, weight=score)

In [13]:
from node2vec import Node2Vec

node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
model = node2vec.fit(window=10, min_count=1, batch_words=4) # 5 nodes before, 5 nodes after

ModuleNotFoundError: ignored

In [14]:
!pip install Node2Vec

Collecting Node2Vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from Node2Vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, Node2Vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2
    Uninstalling networkx-3.2:
      Successfully uninstalled networkx-3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed Node2Vec-0.4.6 networkx-2.8.8


In [1]:
!pip install fastapi

Collecting fastapi
  Downloading fastapi-0.104.1-py3-none-any.whl (92 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m61.4/92.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.28.0,>=0.27.0 (from fastapi)
  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-extensions>=4.8.0 (from fastapi)
  Downloading typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Installing collected packages: typing-extensions, starlette, fastapi
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5

In [2]:
!pip install kaleido
!pip install python-multipart
!pip install uvicorn

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed kaleido-0.2.1
Collecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-multipart
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is th

In [3]:
from node2vec import Node2Vec

node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
model = node2vec.fit(window=10, min_count=1, batch_words=4) # 5 nodes before, 5 nodes after

NameError: ignored

In [4]:
from collections import defaultdict
import networkx as nx

pairs = defaultdict(int)
for group in ratings.groupby("user_id"):
    user_movies = list(group[1]["movie_id"]) # Retrieve the list of movies liked by the current user
    for i in range(len(user_movies)):
            for j in range(i+1, len(user_movies)):
                pairs[(user_movies[i], user_movies[j])] += 1 # increment a counter specific to a pair of movies every time they are seen together in the same list

G = nx.Graph()
for pair in pairs:
    movie1, movie2 = pair
    score = pairs[pair]
    if score >= 20: # if the movie is liked by same users more than 20 times, we consider the connection exist
        G.add_edge(movie1, movie2, weight=score)

NameError: ignored

In [5]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')

In [6]:
import pandas as pd

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [7]:
movies = pd.read_csv('ml-100k/u.item', sep='|', usecols=range(2), names=['movie_id', 'title'], encoding='latin-1')
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [8]:
ratings = ratings[ratings.rating >= 3]
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
5,298,474,4,884182806
7,253,465,5,891628467
8,305,451,3,886324817
...,...,...,...,...
99992,721,262,3,877137285
99994,378,78,3,880056976
99995,880,476,3,880175444
99996,716,204,5,879795543


In [9]:
from collections import defaultdict
import networkx as nx

pairs = defaultdict(int)
for group in ratings.groupby("user_id"):
    user_movies = list(group[1]["movie_id"]) # Retrieve the list of movies liked by the current user
    for i in range(len(user_movies)):
            for j in range(i+1, len(user_movies)):
                pairs[(user_movies[i], user_movies[j])] += 1 # increment a counter specific to a pair of movies every time they are seen together in the same list

G = nx.Graph()
for pair in pairs:
    movie1, movie2 = pair
    score = pairs[pair]
    if score >= 20: # if the movie is liked by same users more than 20 times, we consider the connection exist
        G.add_edge(movie1, movie2, weight=score)

In [10]:
from node2vec import Node2Vec

node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)
model = node2vec.fit(window=10, min_count=1, batch_words=4) # 5 nodes before, 5 nodes after

Computing transition probabilities:   0%|          | 0/615 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [01:42<00:00,  1.95it/s]


In [11]:
def recommend(movie):
    movie_id = str(movies[movies.title == movie].movie_id.values[0])
    for id in model.wv.most_similar(movie_id)[:5]:
        title = movies[movies.movie_id == int(id[0])].title.values[0]
        print(f'{title}: {id[1]:.2f}')

In [13]:
recommend("Star Wars (1977)")

Return of the Jedi (1983): 0.81
Empire Strikes Back, The (1980): 0.64
Raiders of the Lost Ark (1981): 0.58
Fugitive, The (1993): 0.56
Indiana Jones and the Last Crusade (1989): 0.56


In [14]:
recommend("Toy Story (1995)")

Star Trek: First Contact (1996): 0.66
Mission: Impossible (1996): 0.64
Independence Day (ID4) (1996): 0.62
Willy Wonka and the Chocolate Factory (1971): 0.58
Rock, The (1996): 0.58
