In [1]:
import numpy as np
import pandas as pd

In [11]:
import heapq
from collections import defaultdict
from operator import itemgetter

In [4]:
# Read the ratings table and the movie catalogue from the working directory
# (ratings first, then movies).
ratings, movies = (pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'movies.csv'))

In [5]:
# Keep a random 50% of every user's ratings.
# - A fixed seed makes the sample (and everything computed from it below)
#   reproducible across kernel restarts.
# - DataFrameGroupBy.sample is the built-in equivalent of
#   groupby(...).apply(lambda x: x.sample(...)); it keeps the original row index
#   and all columns, including 'userId'.
# NOTE: this cell overwrites `ratings`, so running it twice halves the data
# again; re-run the loading cell first if you need to repeat it.
SAMPLE_SEED = 42
ratings = ratings.groupby('userId').sample(frac=.5, random_state=SAMPLE_SEED)

In [6]:
# Preview the first two sampled rating rows.
ratings.iloc[:2]

Unnamed: 0,userId,movieId,rating,timestamp
196,1,2993,5.0,964982242
159,1,2470,5.0,964982588


In [7]:
# Preview the first two rows of the movie catalogue.
movies.iloc[:2]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [8]:
# Show (rows, columns) for the sampled ratings and for the movie catalogue.
for frame in (ratings, movies):
    print(frame.shape)

(50420, 4)
(9742, 3)


In [9]:
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

from surprise import Dataset ,Reader 

from surprise.model_selection import train_test_split , LeaveOneOut


In [10]:
# Wrap the ratings frame in a Surprise Dataset; only the user, item and rating
# columns are passed on, with ratings declared to lie on a 0-5 scale.
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[rating_columns], reader)

In [74]:
# testSubject: raw userId we generate recommendations for.
# k: used below both as the number of neighbours and as the number of
#    recommendations printed.
testSubject, k = 60, 5

# Train on every available rating (no hold-out split).
trainSet = data.build_full_trainset()

In [75]:
# User-based KNN with Pearson similarity between users.
sim_options = {'name': 'pearson',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)

# fit() has already computed the user-user similarity matrix and keeps it on
# the model; calling compute_similarities() again would redo the whole
# computation (the "Computing the pearson similarity matrix..." message was
# being printed twice). Reuse the stored matrix instead.
simsMatrix = model.sim

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [76]:
# Translate the raw userId into Surprise's inner id, then pull that user's row
# of similarities out of the similarity matrix.
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

In [78]:
# (inner user id, similarity) pairs for every user except the test user.
similarUsers = [
    (otherInnerID, similarity)
    for otherInnerID, similarity in enumerate(similarityRow)
    if otherInnerID != testUserInnerID
]

In [79]:
# Keep the k users with the highest similarity score (second tuple element).
kNeighbors = heapq.nlargest(k, similarUsers, key=itemgetter(1))

In [85]:
# Score every item rated by a neighbour: each neighbour contributes
# (their rating / 5.0) weighted by their similarity to the test user, and the
# contributions are summed per inner item id.
candidates = defaultdict(float)
for neighborInnerID, neighborSimilarity in kNeighbors:
    for ratedItemID, ratingValue in trainSet.ur[neighborInnerID]:
        candidates[ratedItemID] += (ratingValue / 5.0) * neighborSimilarity

In [87]:
# Inner item ids the test user has already rated, mapped to 1 so that
# membership can be tested with `in`.
watched = {seenItemID: 1 for seenItemID, _ in trainSet.ur[testUserInnerID]}

In [88]:
# Lookup table: movieId -> title.
movie_name = movies.set_index('movieId')['title'].to_dict()

In [91]:
# Print the k best-scoring candidates that the test user has not rated yet.
# - The heading is built from k: it used to say "Top 10" although the loop
#   stopped after k items.
# - The list is cut to k entries before printing, so exactly min(k, available)
#   rows are shown.
ranked = sorted(candidates.items(), key=itemgetter(1), reverse=True)
unseen = [(itemID, ratingSum) for itemID, ratingSum in ranked if itemID not in watched]

print(f'Top {k} based on User Similarity : \n')
for itemID, ratingSum in unseen[:k]:
    # Inner item id -> raw movieId, which is the key used by movie_name.
    movieID = trainSet.to_raw_iid(itemID)
    print(movie_name[movieID], ratingSum)

Top 10 based on User Similarity : 

Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 2.5
Matrix, The (1999) 2.4000000000000004
Indiana Jones and the Last Crusade (1989) 2.4
Dave (1993) 2.3
Ghostbusters (a.k.a. Ghost Busters) (1984) 2.3
