In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD
import random
random.seed(0)
np.random.seed(0)

## Analyse Dataset


### Movie lens Dataset
Reading dataset (MovieLens 1M movie ratings dataset: downloaded from https://grouplens.org/datasets/movielens/1m/)


In [3]:
import os
import zipfile
from os.path import exists
cwd = os.getcwd()
file_exists = exists('./ml-1m/movies.dat')
if(file_exists==False):
      
        print('downloading....')
        os.system('curl -o ml-1m.zip -SL https://files.grouplens.org/datasets/movielens/ml-1m.zip')
        print('download Complete')
        print('Extracting..')
        savePath=cwd
        savefile="./ml-1m.zip"
        with zipfile.ZipFile(savefile, 'r') as zip_ref:
            zip_ref.extractall(savePath)
        print('Complete')

downloading....
download Complete
Extracting..
Complete


In [4]:

movie_df = pd.io.parsers.read_csv('ml-1m/movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::',encoding = "ISO-8859-1")


In [5]:
rating_df = pd.io.parsers.read_csv('ml-1m/ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')


#### Consider movies rated by more than 200 users

In [6]:
movie_ratingCount = (rating_df.
     groupby(by = ['movie_id'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['movie_id', 'totalRatingCount']]
    )
movie_ratingCount.head()

Unnamed: 0,movie_id,totalRatingCount
0,1,2077
1,2,701
2,3,478
3,4,170
4,5,296


In [7]:
rating_df_totalRatingCount = rating_df.merge(movie_ratingCount, left_on = 'movie_id', right_on = 'movie_id', how = 'left')
rating_df_totalRatingCount.head()

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703


In [8]:

data= rating_df_totalRatingCount.query('totalRatingCount >= 200')
data.head()

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703


In [9]:
rating_df=data

### Consider User rated more than 50 tumes

In [10]:
user_ratingCount = (rating_df.
     groupby(by = ['user_id'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalratingbyuser'})
     [['user_id', 'totalratingbyuser']]
    )
user_ratingCount.head(2)

Unnamed: 0,user_id,totalratingbyuser
0,1,50
1,2,122


In [11]:
data= user_ratingCount.query('totalratingbyuser >= 50')
data.head()

Unnamed: 0,user_id,totalratingbyuser
0,1,50
1,2,122
2,3,50
4,5,150
5,6,62


In [12]:
user_ratingCount = rating_df.merge( user_ratingCount, left_on = 'user_id', right_on = 'user_id', how = 'left')
user_ratingCount.head()

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount,totalratingbyuser
0,1,1193,5,978300760,1725,50
1,1,661,3,978302109,525,50
2,1,914,3,978301968,636,50
3,1,3408,4,978300275,1315,50
4,1,2355,5,978824291,1703,50


In [13]:
rating_df=user_ratingCount


### Prepare train and test set
- We will take 5 most recent movies watched by user as test set .And will consider other data for training our metrics

In [14]:
rating_df=rating_df.sort_values(by=['user_id','time'], ascending=[False,False]) 
rating_df.shape

(855730, 6)

In [15]:
test_df=rating_df.groupby('user_id').head(10)
test_df.shape

(60400, 6)

In [16]:
rating_df=rating_df[~rating_df.isin(test_df).all(1)]
rating_df.shape

(795330, 6)

In [17]:
graph_df=rating_df[['user_id','movie_id','rating','time']]
graph_df.columns = ['source', 'target', 'weights','time']

In [18]:
graph_df=graph_df[['source', 'target', 'weights']]

In [19]:
graph_df.shape

(795330, 3)

### Install packages

In [None]:
!pip install networkx
!pip install stellargraph
!pip install gensim

### Final data set

In [21]:
graph_df.head(1)

Unnamed: 0,source,target,weights
855652,6040,1333,4


In [22]:
rating_df.head(1)

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount,totalratingbyuser
855652,6040,1333,4,997454140,733,288


In [23]:
test_df.head(1)

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount,totalratingbyuser
855583,6040,1221,4,998315055,1692,288


In [24]:
movie_df.head(1)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy


## Graph Embeddings


###  Step1: Create the rating matrix with rows as movies and columns as users.

In [25]:
import networkx as nx

### Create a weighted graph of user Item using python library networkx, stellargraph
- https://snap.stanford.edu/node2vec/
- https://github.com/aditya-grover/node2vec

- https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147
- https://towardsdatascience.com/node2vec-explained-db86a319e9ab
- https://github.com/stellargraph/stellargraph

In [26]:
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph

In [27]:
G = StellarGraph(edges=graph_df)

In [28]:
rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=80,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 60390


#### compute random walk

In [29]:

from gensim.models import Word2Vec

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=1)

#### Save model and use for recommendation

In [30]:
model.save("word2vec.model")

In [31]:
model = Word2Vec.load("word2vec.model")

In [32]:
node_embeddings= model.wv

In [33]:
import networkx as nx

In [34]:
user_item_edge_list = graph_df[['source', 'target', 'weights']]
user_item_edge_list.head()

Unnamed: 0,source,target,weights
855652,6040,1333,4
855534,6040,2571,4
855576,6040,1211,5
855544,6040,1947,4
855614,6040,1270,3


#### Create an user movie dictionary

In [35]:
user2dict = dict()
movie2dict = dict()
cnt = 0
for x in user_item_edge_list.values:
    usr = (x[0], 'user')
    movie = (x[1], 'movie')
    if usr in user2dict:
        pass
    else:
        user2dict[usr] = cnt
        cnt += 1
    if movie in movie2dict:
        pass
    else:
        movie2dict[movie] = cnt
        cnt += 1

Create a user-movie weighted graph using python library networkx. 

In [36]:
movie_df[movie_df['movie_id']==260]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


In [37]:
movie_df[movie_df['movie_id']==1210]

Unnamed: 0,movie_id,title,genre
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War


In [38]:
movie1 = str(movie2dict[(260, 'movie')])
movie2 = str(movie2dict[(1196, 'movie')])
movie3 = str(movie2dict[(1210, 'movie')])

#### Check cosine similarity

In [39]:
from scipy.spatial.distance import cosine
1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])

0.5796376466751099

Since we worked with integer ids for nodes, let's create reverse mapping dictionaries that map integer user/movie to their actual ids. 

In [40]:
reverse_movie2dict = {k:v for v,k in movie2dict.items()}
reverse_user2dict = {k:v for v,k in user2dict.items()}

In [41]:
from sklearn.metrics.pairwise import cosine_similarity


In [42]:
df=user_item_edge_list['target'].unique()

In [43]:
df=list(df)
df.sort()


#### Make prediction for movie ***start wars***

In [44]:
movie1 = str(movie2dict[(260, 'movie')])


In [45]:
result= {}
for i in df:
  if(i!=260):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=0
    try:
      cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    except:
      pass
    result[i]=cos

In [46]:
from collections import Counter
result=dict(Counter(result).most_common(10))

In [47]:

result= list(result.keys())

In [48]:
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
1337,1358,Sling Blade (1996),Drama|Thriller
1716,1772,Blues Brothers 2000 (1998),Action|Comedy|Musical
2051,2120,Needful Things (1993),Drama|Horror
2052,2121,Cujo (1983),Horror|Thriller
2269,2338,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller
2746,2815,Iron Eagle (1986),Action|War
3179,3248,Sister Act 2: Back in the Habit (1993),Comedy
3297,3366,Where Eagles Dare (1969),Action|Adventure|War
3509,3578,Gladiator (2000),Action|Drama
3660,3729,Shaft (1971),Action|Crime


### Recommendations are not good

## Combine the user-movie and movie-genre graph

In [49]:
movie_genre_edgelist = movie_df[['movie_id', 'genre']]
movie_genre_edgelist.head()

Unnamed: 0,movie_id,genre
0,1,Animation|Children's|Comedy
1,2,Adventure|Children's|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama
4,5,Comedy


In [50]:
genre2int = dict()
for x in movie_genre_edgelist.values:
    genres = x[1].split('|')
    for genre in genres:
        if genre in genre2int:
            pass
        else:
            genre2int[genre] = cnt
            cnt += 1

In [51]:
genre2int

{'Action': 7471,
 'Adventure': 7467,
 'Animation': 7464,
 "Children's": 7465,
 'Comedy': 7466,
 'Crime': 7472,
 'Documentary': 7476,
 'Drama': 7470,
 'Fantasy': 7468,
 'Film-Noir': 7480,
 'Horror': 7474,
 'Musical': 7478,
 'Mystery': 7479,
 'Romance': 7469,
 'Sci-Fi': 7475,
 'Thriller': 7473,
 'War': 7477,
 'Western': 7481}

In [52]:
movie_genre_graph = nx.Graph()
for x in movie_genre_edgelist.values:
    movie = (x[0], 'movie')
    genres = x[1].split('|')
    if movie in movie2dict:
        for genre in genres:
            if genre in genre2int:
                movie_genre_graph.add_node(movie2dict[movie])
                movie_genre_graph.add_node(genre2int[genre])
                movie_genre_graph.add_edge(movie2dict[movie], genre2int[genre], weight=1.0)
            else:
                pass

In [53]:
user_movie_graph = nx.Graph()
for x in user_item_edge_list.values:

    usr = (x[0], 'user')
    movie = (x[1], 'movie')
    user_movie_graph.add_node(user2dict[usr])
    user_movie_graph.add_node(movie2dict[movie])
    user_movie_graph.add_edge(user2dict[usr], movie2dict[movie], weight=float(x[2]))

In [54]:
user_movie_graph

<networkx.classes.graph.Graph at 0x7fbcb72e6b50>

In [55]:
user_movie_genre_graph =  nx.Graph()
user_movie_genre_graph.add_weighted_edges_from([(x,y,user_movie_graph[x][y]['weight']) for x,y in user_movie_graph.edges()])
user_movie_genre_graph.add_weighted_edges_from([(x,y,movie_genre_graph[x][y]['weight']) for x,y in movie_genre_graph.edges()])

In [56]:
G = StellarGraph(user_movie_genre_graph)

 Following  operation is time consuming

In [57]:
rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=80,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 74820


#### compute random walk

In [58]:

from gensim.models import Word2Vec

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=1)

In [59]:
model.save("word2vec1.model")

In [60]:
node_embeddings= model.wv

#### Prediction for ***Star wars***

In [61]:
movie_df[movie_df['movie_id']==260]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


In [62]:
result= {}
movie1 = str(movie2dict[(260, 'movie')])
for i in df:
  if(i!=260):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=0
    try:
      cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    except:
      pass
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
740,750,Dr. Strangelove or: How I Learned to Stop Worr...,Sci-Fi|War
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure
1182,1200,Aliens (1986),Action|Sci-Fi|Thriller|War
1188,1206,"Clockwork Orange, A (1971)",Sci-Fi
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1196,1214,Alien (1979),Action|Horror|Sci-Fi|Thriller
1220,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
3402,3471,Close Encounters of the Third Kind (1977),Drama|Sci-Fi


In [63]:
movie_df[movie_df['movie_id']==2409] 

Unnamed: 0,movie_id,title,genre
2340,2409,Rocky II (1979),Action|Drama


In [64]:

result= {}
movie1 = str(movie2dict[(2409, 'movie')])
for i in df:
  if(i!=2409):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
2307,2376,"View to a Kill, A (1985)",Action
2333,2402,Rambo: First Blood Part II (1985),Action|War
2334,2403,First Blood (1982),Action
2335,2404,Rambo III (1988),Action|War
2341,2410,Rocky III (1982),Action|Drama
2342,2411,Rocky IV (1985),Action|Drama
2746,2815,Iron Eagle (1986),Action|War
2920,2989,For Your Eyes Only (1981),Action
2921,2990,Licence to Kill (1989),Action
3128,3197,"Presidio, The (1988)",Action


In [65]:
movie_df[movie_df['movie_id']==10] 

Unnamed: 0,movie_id,title,genre
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [66]:

result= {}
movie1 = str(movie2dict[(10, 'movie')])
for i in df:
  if(i!=10):
    movie2 = str(movie2dict[(i, 'movie')])
    cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
    result[i]=cos
from collections import Counter
result=dict(Counter(result).most_common(10))
result= list(result.keys())
movie_df[movie_df['movie_id'].isin(result)]

Unnamed: 0,movie_id,title,genre
93,95,Broken Arrow (1996),Action|Thriller
163,165,Die Hard: With a Vengeance (1995),Action|Thriller
206,208,Waterworld (1995),Action|Adventure
345,349,Clear and Present Danger (1994),Action|Adventure|Thriller
776,786,Eraser (1996),Action|Thriller
1035,1049,"Ghost and the Darkness, The (1996)",Action|Adventure
1405,1429,Jackie Chan's First Strike (1996),Action
1513,1552,Con Air (1997),Action|Adventure|Thriller
1673,1722,Tomorrow Never Dies (1997),Action|Romance|Thriller
2204,2273,Rush Hour (1998),Action|Thriller


## Evaluate our result
- We will generate top 10 recommendations for 100 user and calculate the hit rate in test data set.

In [89]:
userlist=list(graph_df.source.unique())[:100]

In [90]:
data=rating_df[rating_df['user_id'].isin(userlist)]

In [91]:
test_df=test_df[test_df['user_id'].isin(userlist)]

In [99]:
result=data.sort_values(by=['user_id','time'],ascending=[True,False])
result=result.groupby('user_id').head(15)

In [100]:
result.head(1)

Unnamed: 0,user_id,movie_id,rating,time,totalRatingCount,totalratingbyuser
841253,5941,908,3,957215161,1315,25


In [101]:
from collections import Counter
top_n = 10
k = 50
top_n = 10

recommend_dict = { }
for index, row in result.iterrows():
    user=row['user_id']
    if user not in recommend_dict.keys():
        recommend_dict[user]={}
    
    movieid=row['movie_id'] 
    movie1 = str(movie2dict[(movieid, 'movie')])
    res_dict= {}
    for i in df:
      if(i!=movieid):
        movie2 = str(movie2dict[(i, 'movie')])
        cos=1.0 - cosine(node_embeddings[movie1], node_embeddings[movie2])
        res_dict[i]=cos
    
    res=dict(Counter(res_dict).most_common(10))
    indexes= list(res.keys())    
    
  
    for ids in indexes:
        if(ids==movieid):
            continue
        if(~((result['movie_id'] == ids) & (result['user_id'] ==user)).any()):
            if ids not in recommend_dict[user].keys():
                recommend_dict[user][ids] =1
            else:
                recommend_dict[user][ids]+=1
                    
        
    
        
        

In [102]:
# Iterating over values
recommendations=[]
for userid, movies in recommend_dict.items():
    total=0
    mlist=[]
    for i in movies:
        total+=1
        mlist.append(i)
        if(total>15):
            break
    recommendations.append([userid,mlist])

In [103]:
recommendations=pd.DataFrame(recommendations,columns=['user_id','movies'])

In [104]:
recommend_6040=recommendations[recommendations.user_id==6040]
recommend_6040=list(recommend_6040.movies)[0]

### Calculate Hit Ratio
- See how often we recommended a movie the user actually rated

In [105]:
total=0
hit=0
for i in userlist:
    results=recommendations[recommendations.user_id==i]
    results=list(results.movies)[0]
    total+=1
    for rs in results:
        
        if(((test_df['movie_id'] == rs) & (test_df['user_id'] ==i)).any()):
                hit+=1
                break
        
hit_ratio=hit/total   
print(hit_ratio)

0.55
