# Graph Embeddings for Recommendation System

In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse.linalg import svds, norm
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import operator
from collections import defaultdict

## Data 

We will use the MovieLens data which contains ratings of movies by users. The data which is publically available from [MovieLens Website](https://grouplens.org/datasets/movielens/). We are using 100k data which has 100k ratings. 

In [4]:
data_path = '../data/'

In [5]:
rating_df = pd.read_csv(data_path + 'ratings.csv', sep=',', header=0)

In [6]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
rating_df.shape

(100836, 4)

In [8]:
rating_df.userId.nunique()

610

In [9]:
rating_df.movieId.nunique()

9724

- The data has over 100k ratings by 610 users on 9724 movies

In [10]:
max(rating_df.userId), max(rating_df.movieId)

(610, 193609)

  - The movie ids do not follow an order. 
  - 9724 movies have been selected that users with id 1 to 610 have rated so as to have 100k ratings. 

### Movies Information

We are also provided the titles and genres of the movies in a separate file. 

In [11]:
movie_df = pd.read_csv(data_path + 'movies.csv', sep=',', header=0)

In [12]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
movie_df[movie_df['title'].str.contains('Avengers')]

Unnamed: 0,movieId,title,genres
1611,2153,"Avengers, The (1998)",Action|Adventure
6148,44020,Ultimate Avengers (2006),Action|Animation|Children|Sci-Fi
7693,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
8551,115727,Crippled Avengers (Can que) (Return of the 5 D...,Action|Adventure
8686,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
8693,122912,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi
9153,147657,Masked Avengers (1981),Action
9488,170297,Ultimate Avengers 2 (2006),Action|Animation|Sci-Fi


## Matrix Factorization

For the sake of completion and comparision, we first describe a recommendation system based on matrix factorization. 

A very popular technique for recommendation systems. We factorize the user-item matrix to obtain the user factors and item factors which are the low-dimensional embeddings such that 'similar' user/items are mapped to 'nearby' points. Moreover, the user and the movies are embedded to the same space, which provides a way to compute user-movie similarity.  

Create a matrix of ratings

In [14]:
ratings_mat = np.ndarray(
    shape=(np.max(rating_df.movieId.values), np.max(rating_df.userId.values)),
    dtype=np.uint8)
ratings_mat[rating_df.movieId.values-1, rating_df.userId.values-1] = rating_df.rating.values

In [15]:
ratings_mat.shape

(193609, 610)

Normalize the rating matrix

In [16]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

We will use Singular Value Decomposition (SVD) for factorizing the matrix. Since the user-movie rating matrix is very sparse, it is more efficient to use the implementation from scipy.sparse. 

The number of the latent-factors is chosen to be 50 i.e. top-50 singular values of the SVD are considered. 

In [17]:
n_factors = 50

In [18]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = svds(A, n_factors)

In [19]:
U.shape

(610, 50)

In [20]:
V.shape

(50, 193609)

In [21]:
movie_factors = V.T
user_factors = U

Let's study some examples to have a qualitative understanding. Cosine similarity of the latent factors of two movies signifies how similar the movies are.

In [22]:
idx = 260
movie_df[movie_df.movieId == idx].title.values[0],  movie_df[movie_df.movieId == 1196].title.values[0]

('Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)')

In [23]:
1.0 - cosine(movie_factors[259], movie_factors[1195])

0.8777832979913568

In [24]:
movie_df[movie_df.movieId == 1210].title.values[0], 1.0 - cosine(movie_factors[259], movie_factors[1209])

('Star Wars: Episode VI - Return of the Jedi (1983)', 0.8518636866885215)

In [25]:
movie_df[movie_df.movieId == 1].title.values[0], 1.0 - cosine(movie_factors[259], movie_factors[0])

('Toy Story (1995)', 0.20152844265131886)

The similarity of the 'Star Wars: Episode IV - A New Hope' is higher for the movies 'Star Wars: Episode V - The Empire Strikes Back' and 'Star Wars: Episode VI - Return of the Jedi' and is much lower for 'Toy Story'. Moreover, the 'Star Wars: Episode VI' is closer to 'Star Wars: Episode IV' than the 'Star Wars: Episode V'.  

Function to get top-n movies similar to a given movie. 

In [26]:
def get_similar_movies_matrix_factorization(data, movieid, top_n=10):
    index = movieid - 1 # Movie id starts from 1
    movie = movie_df[movie_df.movieId == movieid].title.values[0]
    movie_row = data[index].reshape(1,-1)
    similarity = cosine_similarity(movie_row, data)
    sort_indexes = np.argsort(-similarity)[0]
    return {'movie': movie, 'sim_movies': [movie_df[movie_df.movieId == id].title.values[0] for id in sort_indexes[:top_n] + 1]}

In [27]:
movie_id = 260
get_similar_movies_matrix_factorization(movie_factors, movie_id)

{'movie': 'Star Wars: Episode IV - A New Hope (1977)',
 'sim_movies': ['Star Wars: Episode IV - A New Hope (1977)',
  'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'Star Wars: Episode VI - Return of the Jedi (1983)',
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  'Lesson Faust (1994)',
  'Touch (1997)',
  'Inferno (2016)',
  'Beverly Hills Chihuahua (2008)',
  'Matrix, The (1999)',
  'Star Wars: Episode III - Revenge of the Sith (2005)']}

In [28]:
movie_id = 1
get_similar_movies_matrix_factorization(movie_factors, movie_id)

{'movie': 'Toy Story (1995)',
 'sim_movies': ['Toy Story (1995)',
  'Toy Story 2 (1999)',
  'Adventures of Pinocchio, The (1996)',
  'Eddie (1996)',
  'Children of the Corn IV: The Gathering (1996)',
  'Twister (1996)',
  'Sudden Death (1995)',
  'Dear God (1996)',
  'Kazaam (1996)',
  'Sunset Park (1996)']}

In [29]:
user_factors.shape, movie_factors.shape

((610, 50), (193609, 50))

Since the user and movies are in the same space, we can also compute movies similar to a user. A recommendation model can be defined as showing movies similar to the given user.  

In [30]:
def get_recommendations_matrix_factorization(userid, user_factors, movie_factors, top_n=5):
    user_vec = user_factors[userid - 1].reshape(1,-1)
    similarity = cosine_similarity(user_vec, movie_factors)
    sort_indexes = np.argsort(-similarity)[0]
    return [movie_df[movie_df.movieId == id].title.values[0] for id in sort_indexes[:top_n] + 1]   

In [31]:
top_recos = get_recommendations_matrix_factorization(1, user_factors, movie_factors)
top_recos

['Best Men (1997)',
 "Gulliver's Travels (1939)",
 'Newton Boys, The (1998)',
 'Teenage Mutant Ninja Turtles III (1993)',
 'Welcome to Woop-Woop (1997)']

## Graph Embeddings

Create a user-movie graph with edge weights as the ratings. We will use [DeepWalk](https://arxiv.org/abs/1403.6652) to embed every node of the graph to a low-dimensional space. 

In [32]:
import networkx as nx

In [33]:
user_item_edgelist = rating_df[['userId', 'movieId', 'rating']]
user_item_edgelist.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [34]:
user_item_edgelist.shape

(100836, 3)

Since userids and movieids both start from 1, and thus same id can correspond to a user and a movie. We will map the ids to unique integers. 

In [35]:
user2dict = dict()
movie2dict = dict()
cnt = 0
for x in user_item_edgelist.values:
    usr = (x[0], 'user')
    movie = (x[1], 'movie')
    if usr in user2dict:
        pass
    else:
        user2dict[usr] = cnt
        cnt += 1
    if movie in movie2dict:
        pass
    else:
        movie2dict[movie] = cnt
        cnt += 1

In [36]:
len(user2dict), len(movie2dict)

(610, 9724)

In [37]:
len(user2dict) + len(movie2dict)

10334

Create a user-movie weighted graph using python library networkx. 

In [38]:
user_movie_graph = nx.Graph()

In [39]:
for x in user_item_edgelist.values:
    usr = (x[0], 'user')
    movie = (x[1], 'movie')
    user_movie_graph.add_node(user2dict[usr])
    user_movie_graph.add_node(movie2dict[movie])
    user_movie_graph.add_edge(user2dict[usr], movie2dict[movie], weight=float(x[2]))

In [40]:
user_movie_graph.number_of_edges()

100836

In [41]:
user_movie_graph.number_of_nodes()

10334

### DeepWalk

We will use the implementation of DeepWalk provided in node2vec which is a bit different from original DeepWalk e.g. it uses negative sampling whereas the original DeepWalk paper used hierarchical sampling for the skip-gram model. 

To create embeddings from the context and non-context pairs, we are using Gensim python library. One can easily use Google word2vec or Facebook fasttext for this task. 

In [52]:
import node2vec 
from gensim.models import Word2Vec

In [134]:
G = node2vec.Graph(user_movie_graph, is_directed=False, p=1, q=1)
# p,q = 1 for DeeWalk as the random walks are completely unbiased. 

In [47]:
# Compute the transition probabilities based on the edge weights. 
G.preprocess_transition_probs()

Compute the random walks. 
  - 10 walks for every node.
  - Each walk of length 80. 

In [48]:
walks = G.simulate_walks(num_walks=10, walk_length=80)

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [132]:
len(walks)

103340

In [133]:
walks[0]

[9954,
 9950,
 1736,
 6390,
 181,
 9294,
 21,
 9458,
 510,
 486,
 711,
 5797,
 362,
 8105,
 8107,
 8105,
 6915,
 7634,
 6538,
 7504,
 7511,
 7504,
 84,
 9229,
 232,
 8614,
 163,
 6651,
 823,
 5370,
 3764,
 10031,
 2547,
 4361,
 4363,
 6794,
 258,
 8115,
 206,
 8519,
 404,
 9509,
 405,
 9950,
 2104,
 9950,
 3284,
 9950,
 2717,
 2707,
 425,
 9940,
 116,
 4142,
 466,
 9070,
 2704,
 10169,
 5773,
 9588,
 1066,
 9175,
 5,
 7024,
 7026,
 8148,
 819,
 7378,
 236,
 9335,
 204,
 5031,
 646,
 5031,
 825,
 6395,
 86,
 5460,
 781,
 8538]

Learn Embeddings via Gensim, which creates context/non-context pairs and then Skip-gram. 

In [131]:
def learn_embeddings(walks):
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    Uses Gensim Word2Vec.
    '''
    walks = [list(map(str, walk)) for walk in walks]
    model = Word2Vec(walks, size=50, window=10, min_count=0, sg=1, workers=8, iter=1)
    return model.wv


In [62]:
node_embeddings = learn_embeddings(walks)

The output of gensim is a specific type of key-value pair with keys as the string-ed node ids and the values are numpy array of embeddings, each of shape (50,)

In [63]:
node_embeddings['0']

array([-0.17863706,  0.35657546,  0.39951935, -0.24555558, -0.14919154,
       -0.40887597,  0.35577917,  0.16313212,  0.00252161,  0.16966167,
       -0.3765413 , -0.21337669, -0.0646629 ,  0.19614454,  0.01291097,
       -0.3814006 , -0.2582114 ,  0.16001315,  0.29271924, -0.5413573 ,
       -0.6119701 , -0.11067525, -0.2872824 , -0.14502008, -0.0231966 ,
        0.3855987 ,  0.21319212,  0.35930753,  0.5704927 , -0.533855  ,
       -0.3821231 , -0.11361189, -0.447679  ,  0.26765013,  0.18888068,
        0.6884152 , -0.16910248,  0.22091031,  0.04694226, -0.48486325,
       -0.29224885,  0.4899701 ,  0.3893704 , -0.19709189,  0.12075771,
        0.45358083,  0.53285736,  0.29594794, -0.03613487, -0.34110054],
      dtype=float32)

In [68]:
node_embeddings['0'].shape

(50,)

Let's look at the same examples that we used to qualitatively investigate the matrix factorization based embeddings.
- 260 = Star Wars IV
- 1196 = Star Wars V
- 1210 = Star Wars VI
- 1 = Toy Stroy

In [70]:
movie1 = str(movie2dict[(260, 'movie')])
movie2 = str(movie2dict[(1196, 'movie')])
1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])

0.9207762479782104

In [71]:
movie3 = str(movie2dict[(1210, 'movie')])
1.0 - cosine(node_embeddings[movie1], node_embeddings[movie3])

0.9118576049804688

In [72]:
movie4 = str(movie2dict[(1, 'movie')])
1.0 - cosine(node_embeddings[movie1], node_embeddings[movie4])

0.713432788848877

Since we worked with integer ids for nodes, let's create reverse mapping dictionaries that map integer user/movie to their actual ids. 

In [76]:
reverse_movie2dict = {k:v for v,k in movie2dict.items()}
reverse_user2dict = {k:v for v,k in user2dict.items()}

In [113]:
node_vecs = [node_embeddings[str(i)] for i in range(cnt)]
node_vecs = np.array(node_vecs)
node_vecs.shape

(10334, 50)

Movies similar to a given movie as an evaluation of the system. 

In [114]:
def get_similar_movies_graph_embeddings(movieid, movie_embed, top_n=10):
    movie_idx = movie2dict[movieid]
    query = movie_embed[movie_idx].reshape(1,-1)
    ranking = cosine_similarity(query, movie_embed)
    top_ids = np.argsort(-ranking)[0]
    top_movie_ids = [reverse_movie2dict[j] for j in top_ids if j in reverse_movie2dict][:top_n]
    sim_movies = [movie_df[movie_df.movieId == id[0]].title.values[0] for id in top_movie_ids]
    return sim_movies

In [117]:
get_similar_movies_graph_embeddings((260, 'movie'), node_vecs)[:10]

['Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Frankie and Johnny (1966)',
 'Matrix, The (1999)',
 'Indiana Jones and the Last Crusade (1989)',
 'Princess Bride, The (1987)',
 'Venom (1982)',
 'Back to the Future (1985)']

In [118]:
get_similar_movies_graph_embeddings((122912, 'movie'), node_vecs)[:10]

['Avengers: Infinity War - Part I (2018)',
 'Thor: Ragnarok (2017)',
 'Deadpool 2 (2018)',
 'Guardians of the Galaxy 2 (2017)',
 'Mission: Impossible - Fallout (2018)',
 'Justice League (2017)',
 'Untitled Spider-Man Reboot (2017)',
 'Arrival (2016)',
 'A Silent Voice (2016)',
 'Logan (2017)']

In [119]:
get_similar_movies_graph_embeddings((1, 'movie'), node_vecs)[:10]

['Toy Story (1995)',
 'Nutty Professor, The (1996)',
 'Twister (1996)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Adventures of Pinocchio, The (1996)',
 'Broken Arrow (1996)',
 'Once Upon a Time... When We Were Colored (1995)',
 'Mission: Impossible (1996)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Babe (1995)']

Can also define the recommendation model based on the cosine similarity i.e the movies are ranked for a given user in terms of the cosine similarities of their corresponding embeddings with the embedding of the user. 

In [120]:
def get_recommended_movies_graph_embeddings(userid, node_embed, top_n=10):
    user_idx = user2dict[userid]
    query = node_embed[user_idx].reshape(1,-1)
    ranking = cosine_similarity(query, node_embed)
    top_ids = np.argsort(-ranking)[0]
    top_movie_ids = [reverse_movie2dict[j] for j in top_ids if j in reverse_movie2dict][:top_n]
    reco_movies = [movie_df[movie_df.movieId == id[0]].title.values[0] for id in top_movie_ids]
    return reco_movies

In [122]:
get_recommended_movies_graph_embeddings((1, 'user'), node_vecs, top_n=10)

['Best Men (1997)',
 'Newton Boys, The (1998)',
 "Gulliver's Travels (1939)",
 'Shaft (1971)',
 'Welcome to Woop-Woop (1997)',
 'Teenage Mutant Ninja Turtles III (1993)',
 'Quiet Man, The (1952)',
 "McHale's Navy (1997)",
 'Song of the South (1946)',
 'Howard the Duck (1986)']

As another evalution, let's compare the generated recommendation for a user to the movies tnat the user has actually rated highly. We will get top 10 recommendations for a user, ranked by the cosine similarity, and compute how many of these movies comes from the set of the movies that the user has rated >= 4.5

This tantamounts to Precision@10 metric. 

In [135]:
idx = 1
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
recos.intersection(true_pos)

{"Gulliver's Travels (1939)",
 'Newton Boys, The (1998)',
 'Quiet Man, The (1952)',
 'Shaft (1971)'}

For comparison, we will also compute the Precision for the recommendations produced by the matrix factorization model. 

In [138]:
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
mf_recos.intersection(true_pos)

{'The Jinx: The Life and Deaths of Robert Durst (2015)'}

In [139]:
idx = 2
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
print(recos.intersection(true_pos))
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(mf_recos.intersection(true_pos))

{'Town, The (2010)', 'The Jinx: The Life and Deaths of Robert Durst (2015)', 'Inglourious Basterds (2009)', 'Wolf of Wall Street, The (2013)', 'Warrior (2011)'}
{'The Jinx: The Life and Deaths of Robert Durst (2015)'}


In [140]:
idx = 3
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
print(recos.intersection(true_pos))
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(mf_recos.intersection(true_pos))

{'Galaxy of Terror (Quest) (1981)', 'Saturn 3 (1980)', 'Death Race 2000 (1975)', 'The Lair of the White Worm (1988)', 'Hangar 18 (1980)', 'Looker (1981)', 'Android (1982)', 'Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)', 'Alien Contamination (1980)', 'Clonus Horror, The (1979)'}
{'Galaxy of Terror (Quest) (1981)', 'Looker (1981)', 'Alien Contamination (1980)', 'Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)'}


In [141]:
idx = 4
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
print(recos.intersection(true_pos))
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(mf_recos.intersection(true_pos))

{'L.I.E. (2001)', "I'm the One That I Want (2000)"}
{'Beautiful Thing (1996)', "I'm the One That I Want (2000)"}


In [142]:
idx = 12
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
print(recos.intersection(true_pos))
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(mf_recos.intersection(true_pos))

{'Little Women (1994)', 'Circle of Friends (1995)', "She's All That (1999)", 'Emma (1996)', '10 Things I Hate About You (1999)'}
set()


In [143]:
idx = 100
recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])
print(recos.intersection(true_pos))
mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(mf_recos.intersection(true_pos))

{'Importance of Being Earnest, The (2002)', 'Pillow Talk (1959)', 'Seven Brides for Seven Brothers (1954)', 'Far and Away (1992)', 'Officer and a Gentleman, An (1982)', 'Sweet Home Alabama (2002)', 'Working Girl (1988)', 'Father of the Bride (1950)'}
{'Dead Poets Society (1989)', 'Father of the Bride (1950)'}


## Enriched network with additional information : Genres

In [144]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Genres of the movies can be used as additional signal for better recommendations

In [145]:
movie_genre_edgelist = movie_df[['movieId', 'genres']]
movie_genre_edgelist.head()

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


In [146]:
cnt

10334

In [147]:
genre2int = dict()
for x in movie_genre_edgelist.values:
    genres = x[1].split('|')
    for genre in genres:
        if genre in genre2int:
            pass
        else:
            genre2int[genre] = cnt
            cnt += 1

In [148]:
cnt

10354

In [149]:
genre2int

{'Adventure': 10334,
 'Animation': 10335,
 'Children': 10336,
 'Comedy': 10337,
 'Fantasy': 10338,
 'Romance': 10339,
 'Drama': 10340,
 'Action': 10341,
 'Crime': 10342,
 'Thriller': 10343,
 'Horror': 10344,
 'Mystery': 10345,
 'Sci-Fi': 10346,
 'War': 10347,
 'Musical': 10348,
 'Documentary': 10349,
 'IMAX': 10350,
 'Western': 10351,
 'Film-Noir': 10352,
 '(no genres listed)': 10353}

In [150]:
movie_genre_graph = nx.Graph()
for x in movie_genre_edgelist.values:
    movie = (x[0], 'movie')
    genres = x[1].split('|')
    if movie in movie2dict:
        for genre in genres:
            if genre in genre2int:
                movie_genre_graph.add_node(movie2dict[movie])
                movie_genre_graph.add_node(genre2int[genre])
                movie_genre_graph.add_edge(movie2dict[movie], genre2int[genre], weight=1.0)
            else:
                pass

In [151]:
movie_genre_graph.number_of_nodes()

9744

In [152]:
rating_df.movieId.nunique()

9724

In [153]:
movie_genre_graph.number_of_edges()

22046

In [154]:
list(movie_genre_graph.edges())[:5]

[(1, 10334), (1, 10335), (1, 10336), (1, 10337), (1, 10338)]

In [155]:
movie_genre_graph[1][10334]

{'weight': 1.0}

#### Combine the user-movie and movie-genre graph

In [156]:
user_movie_genre_graph =  nx.Graph()
user_movie_genre_graph.add_weighted_edges_from([(x,y,user_movie_graph[x][y]['weight']) for x,y in user_movie_graph.edges()])
user_movie_genre_graph.add_weighted_edges_from([(x,y,movie_genre_graph[x][y]['weight']) for x,y in movie_genre_graph.edges()])

In [157]:
user_movie_genre_graph.number_of_edges()

122882

In [158]:
list(user_movie_genre_graph.edges())[0]

(0, 1)

In [159]:
G_enriched = node2vec.Graph(user_movie_genre_graph, is_directed=False, p=1, q=1)

In [161]:
G_enriched.preprocess_transition_probs()

In [162]:
walks_enriched = G_enriched.simulate_walks(num_walks=10, walk_length=80)

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [163]:
node_embeddings_enriched = learn_embeddings(walks_enriched)

In [164]:
node_vecs_enriched = [node_embeddings_enriched[str(i)] for i in range(cnt)]
node_vecs_enriched = np.array(node_vecs_enriched)
node_vecs_enriched.shape

(10354, 50)

In [165]:
get_similar_movies_graph_embeddings((260, 'movie'), node_vecs_enriched)[:10]

['Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Back to the Future (1985)',
 'Matrix, The (1999)',
 'Die Hard (1988)',
 'Terminator, The (1984)',
 'Groundhog Day (1993)']

In [166]:
get_similar_movies_graph_embeddings((260, 'movie'), node_vecs)[:10]

['Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Frankie and Johnny (1966)',
 'Matrix, The (1999)',
 'Indiana Jones and the Last Crusade (1989)',
 'Princess Bride, The (1987)',
 'Venom (1982)',
 'Back to the Future (1985)']

In [170]:
1.0 - cosine(node_embeddings_enriched[movie1], node_embeddings_enriched[movie2])

0.9116637110710144

In [171]:
1.0 - cosine(node_embeddings_enriched[movie1], node_embeddings_enriched[movie3])

0.9081186652183533

In [172]:
1.0 - cosine(node_embeddings_enriched[movie1], node_embeddings_enriched[movie4])

0.7123555541038513

Recommendations

In [173]:
get_recommended_movies_graph_embeddings((1, 'user'), node_vecs_enriched, top_n=10)

['Newton Boys, The (1998)',
 'Best Men (1997)',
 "Gulliver's Travels (1939)",
 'Shaft (1971)',
 'Welcome to Woop-Woop (1997)',
 "Pete's Dragon (1977)",
 'Teenage Mutant Ninja Turtles III (1993)',
 'Quiet Man, The (1952)',
 'Lord of the Rings, The (1978)',
 'Rescuers, The (1977)']

In [174]:
get_recommended_movies_graph_embeddings((1, 'user'), node_vecs, top_n=10)

['Best Men (1997)',
 'Newton Boys, The (1998)',
 "Gulliver's Travels (1939)",
 'Shaft (1971)',
 'Welcome to Woop-Woop (1997)',
 'Teenage Mutant Ninja Turtles III (1993)',
 'Quiet Man, The (1952)',
 "McHale's Navy (1997)",
 'Song of the South (1946)',
 'Howard the Duck (1986)']

In [175]:
idx = 1
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])

mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(len(mf_recos.intersection(true_pos)))

ge_recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
print(len(ge_recos.intersection(true_pos)))

ge_enriched_reso = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs_enriched, top_n=10))
print(len(ge_enriched_reso.intersection(true_pos)))

2
4
6


In [176]:
idx = 2
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])

mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(len(mf_recos.intersection(true_pos)))

ge_recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
print(len(ge_recos.intersection(true_pos)))

ge_enriched_reso = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs_enriched, top_n=10))
print(len(ge_enriched_reso.intersection(true_pos)))

1
5
5


In [177]:
idx = 3
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])

mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(len(mf_recos.intersection(true_pos)))

ge_recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
print(len(ge_recos.intersection(true_pos)))

ge_enriched_reso = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs_enriched, top_n=10))
print(len(ge_enriched_reso.intersection(true_pos)))

4
10
10


In [178]:
idx = 4
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])

mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(len(mf_recos.intersection(true_pos)))

ge_recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
print(len(ge_recos.intersection(true_pos)))

ge_enriched_reso = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs_enriched, top_n=10))
print(len(ge_enriched_reso.intersection(true_pos)))

2
2
3


In [182]:
idx = 8
true_pos = set([movie_df[movie_df.movieId == id].title.values[0] for id in rating_df[(rating_df['userId'] == idx) & (rating_df['rating'] >= 4.5)].movieId.values])

mf_recos = set(get_recommendations_matrix_factorization(idx, user_factors, movie_factors))
print(len(mf_recos.intersection(true_pos)))

ge_recos = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs, top_n=10))
print(len(ge_recos.intersection(true_pos)))

ge_enriched_reso = set(get_recommended_movies_graph_embeddings((idx, 'user'), node_vecs_enriched, top_n=10))
print(len(ge_enriched_reso.intersection(true_pos)))

2
2
4
