In [2]:
import numpy as np
import pandas as pd

In [3]:
import heapq
from collections import defaultdict
from operator import itemgetter

In [4]:
# Read the ratings and the movie metadata from CSV files in the working directory.
ratings, movies = (pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'movies.csv'))

In [5]:
# Keep a random 50% of every user's ratings. A seeded generator makes the
# subsample (and everything derived from it) reproducible on a fresh run.
# NOTE: this cell overwrites `ratings`; re-running it without reloading the CSV
# halves the data again.
sample_rng = np.random.RandomState(42)
ratings = ratings.groupby('userId', group_keys=False).apply(
    lambda x: x.sample(frac=.5, random_state=sample_rng)
)

In [6]:
# Preview the first two sampled ratings.
ratings.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
83,1,1258,3.0,964983414
158,1,2459,5.0,964983414


In [10]:
# Preview the first two rows of the movie metadata.
movies.head(n=2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [12]:
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

from surprise import Dataset ,Reader 

from surprise.model_selection import train_test_split , LeaveOneOut


In [13]:
# Wrap the (user, item, rating) columns in a Surprise dataset; the reader
# declares ratings to lie on a 0-5 scale.
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[rating_columns], reader)

In [58]:
# Raw userId to recommend for, and k: the number of top-rated seed items
# as well as the number of recommendations printed at the end.
testSubject, k = 60, 5
# Train on every loaded rating (no hold-out split).
trainSet = data.build_full_trainset()

In [59]:
# Item-based KNN: similarities are Pearson correlations between items
# ('user_based': False).
sim_options = {'name': 'pearson',
               'user_based': False
               }

model = KNNBasic(sim_options=sim_options)
# fit() already builds the item-item similarity matrix and stores it on the
# model as `sim`; read it back instead of calling compute_similarities()
# again, which recomputed the whole matrix a second time.
model.fit(trainSet)
simsMatrix = model.sim

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [60]:
# Translate the raw userId into the trainset's inner user id, which is what
# trainSet.ur is indexed by.
testUserInnerID = trainSet.to_inner_uid(
    testSubject
)


In [61]:
# Pick the k (inner item id, rating) pairs this user rated highest.
testUserRatings = trainSet.ur[testUserInnerID]
kNeighbors = heapq.nlargest(k, testUserRatings, key=itemgetter(1))

In [62]:
# Score every item by its similarity to each of the user's top-rated items,
# scaling each contribution by that item's rating as a fraction of 5.
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    weight = rating / 5.0
    for innerID, score in enumerate(simsMatrix[itemID]):
        candidates[innerID] += score * weight
    

In [63]:
# Inner ids of every item the user has already rated, used as a lookup table
# to filter those items out of the recommendations.
watched = {seenItemID: 1 for seenItemID, _ in trainSet.ur[testUserInnerID]}

In [64]:
# Lookup table: raw movieId -> title.
movie_name = movies.set_index('movieId')['title'].to_dict()

In [65]:
# Print the k highest-scoring candidates that the user has not rated yet.
# The header reports k rather than a hard-coded "10", so it always matches
# the number of titles the loop is limited to.
pos = 0
print('Top {} based on Item Similarity : \n'.format(k))
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if itemID in watched:
        # Already rated by this user: not a recommendation.
        continue
    # Candidates are keyed by inner item id; titles are keyed by raw movieId.
    movieID = trainSet.to_raw_iid(itemID)
    print(movie_name[movieID], ratingSum)
    pos += 1
    if pos >= k:
        break

Top 10 based on Item Similarity : 

Tommy Boy (1995) 3.812339192157903
Space Jam (1996) 3.409761120054671
Santa Clause, The (1994) 3.277722727882635
Crocodile Dundee II (1988) 2.946914110485173
Casper (1995) 2.892300177142967
