In [1]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import accuracy

In [2]:
# Load the raw ratings and discard the timestamp column, which none of the
# visible cells reference afterwards.
ratings_df = pd.read_csv('../data/rating.csv').drop(columns='timestamp')
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


In [3]:
# keep=False removes *every* row of a repeated (userId, movieId) pair, not
# just the extra copies. Rows lacking a user or movie id are dropped as well.
ratings_df = (
    ratings_df
    .drop_duplicates(subset=["userId", "movieId"], keep=False)
    .dropna(subset=["userId", "movieId"])
)

In [4]:
# Work on the first 100000 rows only (positional slice).
ratings_df = ratings_df.iloc[:100000]

In [5]:
# Show the truncated frame (pandas abbreviates the middle rows).
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
99995,702,1373,3.0
99996,702,1374,2.5
99997,702,1377,1.5
99998,702,1378,1.5


In [6]:
# Number of distinct users in the truncated ratings frame.
ratings_df['userId'].nunique()

702

In [7]:
# Number of distinct movies in the truncated ratings frame.
ratings_df['movieId'].nunique()

8227

In [8]:
# Surprise clips every estimate to the reader's rating scale. The frame holds
# half-star ratings (3.5, 2.5, 1.5 ... are visible above), so a hard-coded
# (1, 5) scale would be wrong if any rating below 1 is present. Taking the
# bounds from the data keeps the scale consistent with what is actually loaded.
reader = Reader(
    rating_scale=(
        float(ratings_df['rating'].min()),
        float(ratings_df['rating'].max()),
    )
)

# load_from_df expects exactly three columns in the order user, item, rating;
# select them explicitly so the call does not depend on the frame's layout.
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Grid Search

In [9]:
# Hyper-parameter grid for the KNN search.
# min_support is the minimum number of common items needed between users to
# consider them for similarity; for the item-based approach it is the minimum
# number of common users for two items.
param_grid = dict(
    k=[5, 10, 20, 50],
    sim_options=dict(
        name=['pearson', 'cosine', 'pearson_baseline'],
        min_support=[1, 5, 10],
        user_based=[True, False],
    ),
)

In [10]:
# 5-fold cross-validated grid search over param_grid for KNNBasic, scoring
# each combination on RMSE and MAE.
gs = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [11]:
# Run the grid search. Each individual fit prints its own
# "Computing the ... similarity matrix" message, hence the long log below.
gs.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

In [12]:
# Best RMSE reported by the grid search.
print(gs.best_score['rmse'])

0.9423366644700722


In [13]:
# Parameter combination that achieved the best RMSE.
print(gs.best_params['rmse'])

{'k': 50, 'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}


# Precision at k

In [14]:
from collections import defaultdict
def precision_recall_at_k(predictions, k, threshold):
    """Compute precision@k and recall@k separately for every user.

    An item is *relevant* when its true rating is at least ``threshold`` and
    *recommended* when its estimated rating is at least ``threshold``.

    Args:
        predictions: iterable of 5-tuples
            ``(uid, iid, true_rating, estimate, details)``; only the user id,
            the true rating and the estimate are used.
        k: number of highest-estimated items inspected per user.
        threshold: cut-off applied to both true and estimated ratings.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id. A metric that
        would be undefined (zero denominator) is reported as ``0``.
    """
    # Group (estimate, true rating) pairs by user.
    by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, actual, estimate, _ = prediction
        by_user[uid].append((estimate, actual))

    precisions, recalls = {}, {}
    for uid, pairs in by_user.items():
        # Highest estimate first; equal estimates keep their input order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items across all of this user's predictions.
        relevant_total = sum(actual >= threshold for _, actual in ranked)

        # Items in the top k whose estimate clears the threshold.
        recommended_top = sum(estimate >= threshold for estimate, _ in top_k)

        # Items in the top k that are both relevant and recommended.
        hits_top = sum(
            (actual >= threshold) and (estimate >= threshold)
            for estimate, actual in top_k
        )

        # Precision@k: share of recommended items that are relevant.
        if recommended_top != 0:
            precisions[uid] = hits_top / recommended_top
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that were recommended.
        if relevant_total != 0:
            recalls[uid] = hits_top / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [15]:
from surprise.model_selection import KFold

# NOTE(review): no random_state is passed to KFold, so the folds (and the
# numbers printed below) presumably change from run to run -- confirm.
kf = KFold(n_splits=5)

# NOTE(review): the grid search above reported 'pearson_baseline' as the best
# similarity for RMSE, whereas this model uses 'cosine' and adds min_k=15.
# Confirm that the difference is intentional.
knn = KNNBasic(
    k=50,
    sim_options=dict(name='cosine', min_support=1, user_based=False),
    min_k=15,
)

for trainset, testset in kf.split(data):
    knn.fit(trainset)
    predictions = knn.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

    # Per fold: mean precision@10 followed by mean recall@10 over all users.
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

Computing the cosine similarity matrix...
Done computing similarity matrix.
0.25846975975363995
0.16490884081205864
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.2472788306953785
0.16207797914207445
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.2508504212366874
0.16454933740970004
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.2333758503401361
0.16064070892108379
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.24924422285533407
0.16199916152207375


In [16]:
# Distinct user ids present in the ratings frame, as a plain list.
user_id = list(set(ratings_df['userId']))

In [17]:
# Identity mapping user id -> user id; used further down as a membership
# check for known users.
user_dict = {uid: uid for uid in user_id}

In [18]:
# Movie metadata; its movieId and title columns feed the id -> title lookup
# built in the next cell.
movie_df = pd.read_csv("../data/movie.csv")

In [19]:
# Lookup table movieId -> title.
# Built column-wise with zip: the previous row-by-row loop called
# movie_df.iloc[i] twice per row, constructing a fresh Series each time, which
# is needlessly slow. As before, a repeated movieId keeps the title of its last
# occurrence.
movie_dict = dict(zip(movie_df['movieId'], movie_df['title']))

In [20]:
# Use every available rating for the final model.
trainset = data.build_full_trainset()

# Item-based KNN with the same settings as the cross-validated model above.
# NOTE(review): the grid search reported 'pearson_baseline' as the best
# similarity for RMSE; 'cosine' with min_k=15 is used here -- confirm intent.
algo = KNNBasic(
    k=50,
    sim_options=dict(name='cosine', min_support=1, user_based=False),
    min_k=15,
)
algo.fit(trainset)

# Estimate a rating for each (user, item) pair returned by the anti-testset.
anti_test_set = trainset.build_anti_testset()
predictions = algo.test(anti_test_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [21]:
from collections import defaultdict

def getMovieRecommendations(topN=3, preds=None):
    """Return the ``topN`` highest-estimated items for every user.

    Parameters
    ----------
    topN : int, optional
        Maximum number of recommendations kept per user (default 3).
    preds : iterable of 5-tuples, optional
        Predictions shaped ``(uid, iid, true_rating, estimate, details)``.
        When omitted, the notebook-level ``predictions`` variable is used,
        which is what this function always did before. Passing the
        predictions explicitly avoids depending on whichever cell last
        rebound that global name.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of ``(iid, estimate)`` tuples ordered by
        estimate, highest first. Unknown user ids yield an empty list.
    """
    if preds is None:
        # Backward-compatible fallback to the notebook-global predictions.
        preds = predictions

    top_recs = defaultdict(list)
    for uid, iid, _, est, _ in preds:
        top_recs[uid].append((iid, est))

    # Replacing the value of an existing key while iterating is safe because
    # the set of keys does not change.
    for uid, user_ratings in top_recs.items():
        ranked = sorted(user_ratings, key=lambda rec: rec[1], reverse=True)
        top_recs[uid] = ranked[:topN]

    return top_recs

In [22]:
# Keep the three best-scored items per user.
recommendations = getMovieRecommendations(topN=3)

In [23]:
def getMovieName(movie_id):
    """Return the title stored for ``movie_id`` in ``movie_dict``.

    Ids that are not in the lookup are returned unchanged.
    """
    return movie_dict.get(movie_id, movie_id)

In [24]:
def getMovieRecommendationsForUser(userId, recommendations):
    """Return ``(movie name, estimate)`` pairs recommended to ``userId``.

    ``recommendations`` maps a user id to a sequence of entries whose first
    element is a movie id and whose second element is the estimated rating.
    If ``userId`` is not a key of ``user_dict`` a message is printed and
    ``None`` is returned.
    """
    if userId in user_dict:
        picks = recommendations[user_dict[userId]]
        return [(getMovieName(entry[0]), entry[1]) for entry in picks]

    print("User id is not present")
    return None

In [25]:
# Top recommendations for user 4 as (title, estimated rating) pairs.
getMovieRecommendationsForUser(4,recommendations)

[('Alaska (1996)', 4.0),
 ('Stir Crazy (1980)', 3.934799902966015),
 ('Breach (2007)', 3.932476794052086)]

In [29]:
# Top recommendations for user 44 as (title, estimated rating) pairs.
getMovieRecommendationsForUser(44,recommendations)

[('Man from Laramie, The (1955)', 4.534160049553868),
 ('Animal Farm (1954)', 4.533690924267822),
 ('Cowboys, The (1972)', 4.5332997335923295)]

In [35]:
# Ids of the items whose estimated rating for user 44 is strictly above 4,
# followed by how many such items there are.
list_ = [iid for uid, iid, _, est, _ in predictions if uid == 44 and est > 4]
len(list_)

6447