In [1]:
import pandas as pd
import numpy as np
import math

Load the datasets

In [2]:
# Read the three CSV files (ratings, movie metadata, user tags) from the
# current working directory, one DataFrame per file.
ratings, movies, tags = map(
    pd.read_csv,
    ('ratings.csv', 'movies.csv', 'tags.csv'),
)

Calculating each movie's mean rating and subtracting it from every rating of that movie to get the adjusted rating

In [3]:
# Per-movie mean rating, one row per movieId (in order of first appearance).
# Only the `rating` column is aggregated: averaging userId/timestamp is wasted
# work whose result was thrown away, and would fail on any non-numeric column.
mean = (
    ratings
    .groupby('movieId', as_index=False, sort=False)['rating']
    .mean()
    .rename(columns={'rating': 'rating_mean'})
)

In [4]:
# Attach each movie's mean rating to every rating row and centre the rating on it.
# The derived columns are dropped first so that re-running this cell does not
# produce `rating_mean_x` / `rating_mean_y` (and then a KeyError) on the
# already-merged frame; on a fresh kernel the drop is a no-op.
ratings = (
    ratings
    .drop(columns=['rating_mean', 'rating_adjusted'], errors='ignore')
    .merge(mean, on='movieId', how='left', sort=False)
)
ratings['rating_adjusted'] = ratings['rating'] - ratings['rating_mean']

In [5]:
# Preview the first rows to check the new rating_mean / rating_adjusted columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_mean,rating_adjusted
0,12882,1,4.0,1147195252,3.793347,0.206653
1,12882,32,3.5,1147195307,3.970519,-0.470519
2,12882,47,5.0,1147195343,4.009956,0.990044
3,12882,50,5.0,1147185499,4.180617,0.819383
4,12882,110,4.5,1147195239,3.733753,0.766247


Calculating the similarity between each movie the user has not rated and the movies the user has rated, then selecting the 20 most similar movies. Please note that, for testing purposes, I have calculated the similarity values for only one user (and the cell below is limited to the first 2 movies and the top 3 matches per movie). Add one more loop to calculate it for all the users.

In [6]:
# --- Parameters (kept at the original testing values) ---
TARGET_USER_ID = 320   # the single user the similarities are computed for
MAX_CANDIDATES = 2     # candidate movies processed in this test run; None = all
TOP_N = 3              # most-similar rated movies kept per candidate movie


def _movie_vector(ratings_df, movie_id, suffix):
    """Return one movie's adjusted-rating vector and its Euclidean norm.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating_adjusted' columns.
    movie_id
        Value matched against ``ratings_df['movieId']``.
    suffix : str
        Appended to the 'movieId' and 'rating_adjusted' column names so two
        vectors can be merged on 'userId' without a name clash.

    Returns
    -------
    (pandas.DataFrame, float)
        De-duplicated rows ``[userId, movieId<suffix>, rating_adjusted<suffix>]``
        and the square root of the sum of the squared adjusted ratings.
    """
    rating_col = 'rating_adjusted' + suffix
    vector = (
        ratings_df.loc[ratings_df['movieId'] == movie_id,
                       ['userId', 'movieId', 'rating_adjusted']]
        .drop_duplicates()
        .rename(columns={'movieId': 'movieId' + suffix,
                         'rating_adjusted': rating_col})
    )
    norm = np.sqrt(np.sum(np.square(vector[rating_col]), axis=0))
    return vector, norm


# Movies the target user has rated, and the candidates: every other movie.
# (Filtering on `userId != TARGET_USER_ID` is not enough: movies the target
# user rated are also rated by other users and would stay in the candidate set.)
rated_movie_ids = np.unique(
    ratings.loc[ratings['userId'] == TARGET_USER_ID, 'movieId']
)
candidate_movie_ids = np.setdiff1d(np.unique(ratings['movieId']), rated_movie_ids)
movies_to_score = candidate_movie_ids[:MAX_CANDIDATES]  # [:None] keeps everything

# The rated movies' vectors and norms do not depend on the candidate movie,
# so they are built once instead of once per candidate.
rated_vectors = [_movie_vector(ratings, rated_id, '2') for rated_id in rated_movie_ids]

top_matches = []
for i, movie in enumerate(movies_to_score):
    if i % 10 == 0:
        # The total is the number of movies actually processed in this run.
        print('{i} out of {movie}'.format(i=i, movie=len(movies_to_score)))

    movie_data, movie1_val = _movie_vector(ratings, movie, '1')

    pair_scores = []
    for movie_data1, movie2_val in rated_vectors:
        # Users who rated both movies.
        movie_data_merge = pd.merge(movie_data, movie_data1,
                                    on='userId', how='inner', sort=False)
        if movie_data_merge.empty:
            continue  # no common raters, nothing to score
        movie_data_merge['vector_product'] = (
            movie_data_merge['rating_adjusted1'] * movie_data_merge['rating_adjusted2']
        )
        # Collapses to one row per (movieId1, movieId2). Every other column is
        # summed too, so `userId` / `rating_adjusted*` in the result are plain
        # column sums kept only to preserve the original output columns.
        movie_data_merge = movie_data_merge.groupby(
            ['movieId1', 'movieId2'], as_index=False, sort=False
        ).sum()
        # Summed products over common raters divided by the product of the
        # two movies' full-vector norms.
        movie_data_merge['dot'] = (
            movie_data_merge['vector_product'] / (movie1_val * movie2_val)
        )
        pair_scores.append(movie_data_merge)

    if not pair_scores:
        continue  # candidate shares no rater with any rated movie

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    movie_data_all = pd.concat(pair_scores, ignore_index=True)
    # Kept from the original: drops rows with dot >= 1, and also NaN scores
    # (zero norm), since NaN < 1 evaluates to False.
    movie_data_all = movie_data_all[movie_data_all['dot'] < 1]
    top_matches.append(
        movie_data_all.sort_values('dot', ascending=False).head(TOP_N)
    )

if top_matches:
    movie_data_all_append = pd.concat(top_matches, ignore_index=True)
else:
    # Same columns as a populated result so downstream cells still run.
    movie_data_all_append = pd.DataFrame(columns=[
        'movieId1', 'movieId2', 'userId', 'rating_adjusted1',
        'rating_adjusted2', 'vector_product', 'dot',
    ])

0 out of 2500


In [7]:
# Show the top matches kept for each processed movie (movieId1 -> movieId2, scored by `dot`).
movie_data_all_append

Unnamed: 0,movieId1,movieId2,userId,rating_adjusted1,rating_adjusted2,vector_product,dot
0,1,8961,23972864,36.035282,12.607143,126.944592,0.371755
1,1,1196,25563902,-2.831653,24.672549,115.704544,0.286217
2,1,260,27037802,2.388105,33.280374,113.688244,0.278933
3,2,1917,12702967,-10.569892,14.892966,94.336005,0.331899
4,2,2012,12794049,2.139785,-4.658537,73.456694,0.29142
5,2,2011,13045266,-2.209677,4.47561,63.516457,0.258061


## Graph databases

In [8]:
from py2neo import Graph
# Graph() is called without a URI or credentials, so py2neo's defaults apply.
# NOTE(review): presumably a local Neo4j instance — confirm before running.
graph = Graph()

In [9]:
# Start from an empty graph before the nodes and relationships are created below.
# NOTE(review): per the py2neo docs this removes every node and relationship in
# the connected database and cannot be undone — confirm it holds nothing else.
graph.delete_all()

In [10]:
# Distinct movie ids that appear on either side of a similarity pair.
source_ids = movie_data_all_append['movieId1'].values
target_ids = movie_data_all_append['movieId2'].values
all_movies = set(source_ids) | set(target_ids)

In [11]:
# Inspect the movieId1 column as a plain list (one entry per similarity row).
list(movie_data_all_append['movieId1'].values)

[1, 1, 1, 2, 2, 2]

In [12]:
# Inspect the set of distinct movie ids that will become graph nodes.
all_movies

{1, 2, 260, 1196, 1917, 2011, 2012, 8961}

Creating the nodes

In [13]:
from py2neo import Node
# One "Movie" node per distinct id; the id is stored as a string in `name`,
# which is the property the relationship queries match on.
for movie_id in all_movies:
    graph.create(Node("Movie", name=str(movie_id)))

Now creating the connections

In [14]:
# Link every scored movie (movieId1) to each of its matches (movieId2) with a
# directed SIMILAR_TO relationship, echoing each Cypher statement as it runs.
cypher_template = 'MATCH (m:Movie {name:"%s"}), (n:Movie {name:"%s"}) CREATE (m)-[:SIMILAR_TO]->(n)'
id_pairs = zip(movie_data_all_append['movieId1'], movie_data_all_append['movieId2'])
for source_id, target_id in id_pairs:
    # int() strips any float formatting so the names match the node names.
    query = cypher_template % (str(int(source_id)), str(int(target_id)))
    print(query)
    graph.run(query)

MATCH (m:Movie {name:"1"}), (n:Movie {name:"8961"}) CREATE (m)-[:SIMILAR_TO]->(n)
MATCH (m:Movie {name:"1"}), (n:Movie {name:"1196"}) CREATE (m)-[:SIMILAR_TO]->(n)
MATCH (m:Movie {name:"1"}), (n:Movie {name:"260"}) CREATE (m)-[:SIMILAR_TO]->(n)
MATCH (m:Movie {name:"2"}), (n:Movie {name:"1917"}) CREATE (m)-[:SIMILAR_TO]->(n)
MATCH (m:Movie {name:"2"}), (n:Movie {name:"2012"}) CREATE (m)-[:SIMILAR_TO]->(n)
MATCH (m:Movie {name:"2"}), (n:Movie {name:"2011"}) CREATE (m)-[:SIMILAR_TO]->(n)


All the steps before this are offline steps. Now you do the online step of just querying on the database.

In [15]:
# Load the `cypher` IPython extension, which provides the %cypher magic used in the next cell.
%load_ext cypher

In [16]:
%%time
# Online step: fetch the names of the movies that movie "1" points to via
# SIMILAR_TO; %%time reports how long the whole cell takes.
results = %cypher MATCH (m:Movie {name: "1"})-[:SIMILAR_TO]->(n) RETURN n.name AS name
# Convert the query result to a DataFrame and print it.
df = results.get_dataframe()
print(df)

3 rows affected.
   name
0   260
1  1196
2  8961
CPU times: user 12.2 ms, sys: 4.09 ms, total: 16.2 ms
Wall time: 205 ms


References

* https://medium.com/@tomar.ankur287/item-item-collaborative-filtering-recommender-system-in-python-cf3c945fae1e
* http://nicolewhite.github.io/neo4j-jupyter/hello-world.html