In [208]:
import warnings
warnings.filterwarnings("ignore")

In [209]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD
import random
random.seed(0)
np.random.seed(0)

## Analyse Dataset


### Movie lens Dataset
Reading dataset (MovieLens 1M movie ratings dataset: downloaded from https://grouplens.org/datasets/movielens/1m/)


In [210]:
import os
import zipfile
from os.path import exists
cwd = os.getcwd()
file_exists = exists('./ml-1m/movies.dat')
if(file_exists==False):
      
        print('downloading....')
        os.system('curl -o ml-1m.zip -SL https://files.grouplens.org/datasets/movielens/ml-1m.zip')
        print('download Complete')
        print('Extracting..')
        savePath=cwd
        savefile="./ml-1m.zip"
        with zipfile.ZipFile(savefile, 'r') as zip_ref:
            zip_ref.extractall(savePath)
        print('Complete')

In [211]:

movie_df = pd.io.parsers.read_csv('ml-1m/movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::',encoding = "ISO-8859-1")


In [212]:
rating_df = pd.io.parsers.read_csv('ml-1m/ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')


In [213]:
movie_ratingCount = (rating_df.
     groupby(by = ['movie_id'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['movie_id', 'totalRatingCount']]
    )
movie_ratingCount.head()

Unnamed: 0,movie_id,totalRatingCount
0,1,2077
1,2,701
2,3,478
3,4,170
4,5,296


In [214]:
rating_df_totalRatingCount = rating_df.merge(movie_ratingCount, left_on = 'movie_id', right_on = 'movie_id', how = 'left')
rating_df_totalRatingCount.head()

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703


In [215]:

data= rating_df_totalRatingCount.query('totalRatingCount >= 200')
data.head()

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703


In [216]:
rating_df=data

## Final dataset

In [217]:
rating_df.head(1)

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount
0,1,1193,5,978300760,1725


In [218]:
graph_df=rating_df[['user_id','movie_id','rating']]
graph_df.columns = ['source', 'target', 'weights']

In [219]:
movie_df.head(1)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy


In [220]:
!pip install networkx
!pip install stellargraph
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Graph Embeddings


###  Step1: Create the rating matrix with rows as movies and columns as users.

In [221]:
import networkx as nx

### Create a weighted graph of user Item using python library networkx, stellargraph
- https://snap.stanford.edu/node2vec/
- https://github.com/aditya-grover/node2vec

- https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147
- https://towardsdatascience.com/node2vec-explained-db86a319e9ab
- https://github.com/stellargraph/stellargraph

In [222]:
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph

In [223]:
G = StellarGraph(edges=graph_df)

In [224]:
rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=80,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 60400


#### compute random walk

In [225]:

from gensim.models import Word2Vec

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=1)

#### Save model and use for recommendation

In [226]:
model.save("word2vec.model")

In [227]:
model = Word2Vec.load("word2vec.model")

In [228]:
node_embeddings= model.wv

In [229]:
import networkx as nx

In [230]:
user_item_edge_list = rating_df[['user_id', 'movie_id', 'rating']]
user_item_edge_list.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


#### Create an user movie dictionary

In [231]:
user2dict = dict()
movie2dict = dict()
cnt = 0
for x in user_item_edge_list.values:
    usr = (x[0], 'user')
    movie = (x[1], 'movie')
    if usr in user2dict:
        pass
    else:
        user2dict[usr] = cnt
        cnt += 1
    if movie in movie2dict:
        pass
    else:
        movie2dict[movie] = cnt
        cnt += 1

Create a user-movie weighted graph using python library networkx. 

In [232]:
movie_df[movie_df['movie_id']==260]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


In [233]:
movie_df[movie_df['movie_id']==1210]

Unnamed: 0,movie_id,title,genre
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War


In [234]:
movie1 = str(movie2dict[(260, 'movie')])
movie2 = str(movie2dict[(1196, 'movie')])
movie3 = str(movie2dict[(1210, 'movie')])

#### Check cosine similarity

In [235]:
from scipy.spatial.distance import cosine
1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])

0.633262038230896

Since we worked with integer ids for nodes, let's create reverse mapping dictionaries that map integer user/movie to their actual ids. 

In [236]:
reverse_movie2dict = {k:v for v,k in movie2dict.items()}
reverse_user2dict = {k:v for v,k in user2dict.items()}

In [237]:
from sklearn.metrics.pairwise import cosine_similarity


In [238]:
df=rating_df['movie_id'].unique()

In [239]:
df=list(df)
df.sort()


#### Make prediction for movie ***start wars***

In [240]:
movie1 = str(movie2dict[(260, 'movie')])


In [241]:
result= {}
for i in df:
  if(i!=260):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos

In [242]:
from collections import Counter
result=dict(Counter(result).most_common(10))

In [243]:

result= list(result.keys())

In [244]:
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
768,778,Trainspotting (1996),Drama
999,1012,Old Yeller (1957),Children's|Drama
1943,2012,Back to the Future Part III (1990),Comedy|Sci-Fi|Western
2045,2114,"Outsiders, The (1983)",Drama
2052,2121,Cujo (1983),Horror|Thriller
2303,2372,Fletch Lives (1989),Comedy
2305,2374,Gung Ho (1986),Comedy|Drama
2650,2719,"Haunting, The (1999)",Horror|Thriller
2849,2918,Ferris Bueller's Day Off (1986),Comedy
3432,3501,Murphy's Romance (1985),Comedy|Romance


### Recommendations are not good

## Combine the user-movie and movie-genre graph

In [245]:
movie_genre_edgelist = movie_df[['movie_id', 'genre']]
movie_genre_edgelist.head()

Unnamed: 0,movie_id,genre
0,1,Animation|Children's|Comedy
1,2,Adventure|Children's|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama
4,5,Comedy


In [246]:
genre2int = dict()
for x in movie_genre_edgelist.values:
    genres = x[1].split('|')
    for genre in genres:
        if genre in genre2int:
            pass
        else:
            genre2int[genre] = cnt
            cnt += 1

In [247]:
genre2int

{'Action': 7473,
 'Adventure': 7469,
 'Animation': 7466,
 "Children's": 7467,
 'Comedy': 7468,
 'Crime': 7474,
 'Documentary': 7478,
 'Drama': 7472,
 'Fantasy': 7470,
 'Film-Noir': 7482,
 'Horror': 7476,
 'Musical': 7480,
 'Mystery': 7481,
 'Romance': 7471,
 'Sci-Fi': 7477,
 'Thriller': 7475,
 'War': 7479,
 'Western': 7483}

In [248]:
movie_genre_graph = nx.Graph()
for x in movie_genre_edgelist.values:
    movie = (x[0], 'movie')
    genres = x[1].split('|')
    if movie in movie2dict:
        for genre in genres:
            if genre in genre2int:
                movie_genre_graph.add_node(movie2dict[movie])
                movie_genre_graph.add_node(genre2int[genre])
                movie_genre_graph.add_edge(movie2dict[movie], genre2int[genre], weight=1.0)
            else:
                pass

In [249]:
user_movie_genre_graph =  nx.Graph()
user_movie_genre_graph.add_weighted_edges_from([(x,y,user_movie_graph[x][y]['weight']) for x,y in user_movie_graph.edges()])
user_movie_genre_graph.add_weighted_edges_from([(x,y,movie_genre_graph[x][y]['weight']) for x,y in movie_genre_graph.edges()])

In [250]:
G = StellarGraph(user_movie_genre_graph)

***Alert:**** Following  operation is time consuming

In [251]:
rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=80,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 74840


#### compute random walk

In [252]:

from gensim.models import Word2Vec

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=1)

In [253]:
model.save("word2vec1.model")

In [254]:
node_embeddings= model.wv

#### Prediction for ***Star wars***

In [258]:
movie_df[movie_df['movie_id']==260]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


In [266]:
result= {}
movie1 = str(movie2dict[(260, 'movie')])
for i in df:
  if(i!=260):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
537,541,Blade Runner (1982),Film-Noir|Sci-Fi
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure
1196,1214,Alien (1979),Action|Horror|Sci-Fi|Thriller
1220,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1271,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
1366,1387,Jaws (1975),Action|Horror
2460,2529,Planet of the Apes (1968),Action|Sci-Fi
2559,2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi


In [264]:
movie_df[movie_df['movie_id']==2409] 

Unnamed: 0,movie_id,title,genre
2340,2409,Rocky II (1979),Action|Drama


In [269]:

result= {}
movie1 = str(movie2dict[(2409, 'movie')])
for i in df:
  if(i!=2409):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
2307,2376,"View to a Kill, A (1985)",Action
2333,2402,Rambo: First Blood Part II (1985),Action|War
2334,2403,First Blood (1982),Action
2335,2404,Rambo III (1988),Action|War
2341,2410,Rocky III (1982),Action|Drama
2342,2411,Rocky IV (1985),Action|Drama
2746,2815,Iron Eagle (1986),Action|War
3128,3197,"Presidio, The (1988)",Action
3372,3441,Red Dawn (1984),Action|War
3375,3444,Bloodsport (1988),Action


In [270]:
movie_df[movie_df['movie_id']==10] 

Unnamed: 0,movie_id,title,genre
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [271]:

result= {}
movie1 = str(movie2dict[(10, 'movie')])
for i in df:
  if(i!=10):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
345,349,Clear and Present Danger (1994),Action|Adventure|Thriller
724,733,"Rock, The (1996)",Action|Adventure|Thriller
1035,1049,"Ghost and the Darkness, The (1996)",Action|Adventure
1349,1370,Die Hard 2 (1990),Action|Thriller
1513,1552,Con Air (1997),Action|Adventure|Thriller
1673,1722,Tomorrow Never Dies (1997),Action|Romance|Thriller
2204,2273,Rush Hour (1998),Action|Thriller
2284,2353,Enemy of the State (1998),Action|Thriller
3013,3082,"World Is Not Enough, The (1999)",Action|Thriller
3187,3256,Patriot Games (1992),Action|Thriller
