In [83]:
import pandas as pd
import numpy as np
import surprise
from surprise import Reader, Dataset, SVD, evaluate, SlopeOne, KNNBasic, KNNWithMeans
from surprise import accuracy
from sklearn.model_selection import train_test_split

# from surprise.model_selection import train_test_split
from collections import defaultdict

In [84]:
ratings = pd.read_csv('/Users/yashnisar/Downloads/ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating','timestamp'])

## Splitting data such that for each user, randomly = 80% of his/her ratings as the training ratings, and use the remaining 20% ratings as testing ratings

In [85]:
train_set = pd.DataFrame(columns = ['userId', 'movieId', 'rating','timestamp'])
test_set = pd.DataFrame(columns = ['userId', 'movieId', 'rating','timestamp'])
for user in ratings['userId'].unique():
    temp = ratings.loc[ratings['userId'] == user]
    train, test = train_test_split(temp, test_size=0.2)
    train_set = train_set.append(train, ignore_index=True)
    test_set = test_set.append(test, ignore_index=True)

In [86]:
reader = Reader()
data = Dataset.load_from_df(train_set[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [87]:
def recommend_items(predictions):
    results = {}
    for uid, iid, true_r, est, details in predictions:
        if uid in results:
            results[uid].append((iid, est))
        else:
            results[uid] = [(iid, est)]

    for uid in results:
        results[uid].sort(key=lambda x: x[1], reverse=True)
        results[uid] = results[uid][:10]

    return results

In [88]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
   
    user_dict = {}
    for uid, iid, true_r, est, details in predictions:
        if uid in user_dict:
            user_dict[uid].append((est, true_r))
        else:
            user_dict[uid] = [(est, true_r)]

    precisions = {}
    recalls = {}
    for uid in user_dict:
        user_dict[uid].sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_dict[uid])
        n_rec_k = sum((est >= threshold) for (est, _) in user_dict[uid][:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_dict[uid][:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [89]:
def norm_dcg_score(predictions, k=10, gains="exponential"):
    
    user_dict = {}
    for uid, iid, true_r, est, details in predictions:
        if uid in user_dict:
            user_dict[uid].append((est, true_r))
        else:
            user_dict[uid] = [(est, true_r)]

    dcg_score = {}
    norm_dcg_score = {}
    for uid in user_dict:
        est = []
        true_r = []
        user_dict[uid].sort(key=lambda x: x[0], reverse=True)
        user_dict[uid] = user_dict[uid][:k]
        
        for x, y in user_dict[uid]:
            est.append(x)
            true_r.append(y)

        est = np.array(est)
        true_r = np.array(true_r)
        
        order = np.argsort(est)[::-1]
        y_true = np.take(true_r, order[:k])
        
        if gains == "exponential":
            gain_val = 2 ** y_true - 1
        elif gains == "linear":
            gain_val = y_true
        else:
            raise ValueError("Invalid gains option.")
        
        discounts = np.log2(np.arange(len(y_true)) + 2)
        dcg_score[uid] = np.sum(gain_val / discounts)
        
        
        #-------code for norm dcg score---------------------------------
        order = np.argsort(true_r)[::-1]
        y_true = np.take(true_r, order[:k])
        
        if gains == "exponential":
            gain_val = 2 ** y_true - 1
        elif gains == "linear":
            gain_val = y_true
        else:
            raise ValueError("Invalid gains option.")
        
        discounts = np.log2(np.arange(len(y_true)) + 2)
        norm_dcg_score[uid] = dcg_score[uid]/np.sum(gain_val / discounts)
    
    return norm_dcg_score

In [90]:
algo_names = ['SVD', 'SlopeOne', 'KNN Basic','KNN with Means']

In [91]:
for index, algo in enumerate([SVD(), SlopeOne(), KNNBasic(), KNNWithMeans()]):
    print("\n Algorithm:", algo_names[index])
    algo.fit(trainset)
    predictions = []
    for _, row in test_set.iterrows():
        predictions.append(algo.predict(row.userId, row.movieId, row.rating))
    print("RMSE", accuracy.rmse(predictions, verbose=False))
    print("MAE", accuracy.mae(predictions, verbose=False))
    # print(predictions)
    top_recommendations = recommend_items(predictions)
    norm_dcg = norm_dcg_score(predictions, k=10, gains="exponential")
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_norm_dcg = sum(ndcg for ndcg in norm_dcg.values()) / len(norm_dcg)
    avg_prec = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    fscore = 2 * (avg_prec * avg_recall)/ (avg_prec + avg_recall)
    print("Fscore:", fscore)
    print("Average Precision:", avg_prec)
    print("Average Recall:", avg_recall)
    print("Average Normalized Discounted Cumulative Gain:", avg_norm_dcg)
    print("Top recommendations: ", top_recommendations)


 Algorithm: SVD
RMSE 0.8705225005589106
MAE 0.6686508032264264
Fscore: 0.4542160346016836
Average Precision: 0.8451353109549831
Average Recall: 0.3105639618067222
Average Normalized Discounted Cumulative Gain: 0.8880193894772914
Top recommendations:  {1: [(608, 4.9011004992140395), (1967, 4.86977091687895), (1089, 4.860467371798719), (1265, 4.812860549110109), (1222, 4.766872539430366), (3578, 4.711305493358913), (1219, 4.697322302699699), (1206, 4.66059182562638), (2078, 4.660263792843132), (2947, 4.560454122988887)], 2: [(48516, 4.470894974121505), (58559, 4.302013414008627), (79132, 4.188958078905497), (89774, 3.6407958960283655), (114060, 3.6026737843102827), (86345, 3.4667376333018645)], 3: [(527, 3.6888356640239435), (3949, 3.329555287954452), (1302, 2.9357392285798523), (2090, 2.8087239442558354), (2424, 2.768872443691882), (4518, 2.691929202920465), (72378, 2.5983363547372766), (6238, 2.59637002303937)], 4: [(1197, 4.388452472646577), (1968, 4.08859870490677), (908, 4.02471649

RMSE 0.8976484273389327
MAE 0.6837873599772266
Fscore: 0.47653287776710457
Average Precision: 0.7990892531876137
Average Recall: 0.33949430346088394
Average Normalized Discounted Cumulative Gain: 0.8824223311938523
Top recommendations:  {1: [(3729, 5), (1089, 5), (608, 5), (1219, 5), (1206, 5), (47, 4.914987868807944), (1222, 4.911085891003294), (1265, 4.856050072704564), (1967, 4.782600353733835), (2947, 4.659310906442049)], 2: [(86345, 4.61441647597254), (58559, 4.320537870443847), (48516, 4.255108005466877), (79132, 4.252962970454847), (89774, 3.8898550724637677), (114060, 3.4252717391304346)], 3: [(527, 3.8059544355627306), (3949, 3.6613813829539636), (2090, 3.1494892473118283), (1302, 3.1353580651947124), (2424, 2.641841011709047), (72378, 2.4496975806451613), (6238, 1.7020872865275145), (4518, 1.0760368663594473)], 4: [(4273, 4.58515731874145), (898, 4.183320121344118), (2203, 4.150230057514379), (1136, 4.064903304087121), (1197, 3.9872515162537017), (1213, 3.9111874127055364), (

Done computing similarity matrix.
RMSE 0.9394712949545727
MAE 0.7207942410488231
Fscore: 0.47198049045085433
Average Precision: 0.7627914389799633
Average Recall: 0.3417063986848416
Average Normalized Discounted Cumulative Gain: 0.8882902581244204
Top recommendations:  {1: [(3578, 4.352691877366649), (1222, 4.332165679479434), (1089, 4.325141768018866), (1219, 4.302451877793645), (608, 4.280566766651385), (47, 4.2803492911791405), (1206, 4.269968425002696), (1927, 4.234519459576677), (1265, 4.2212488941703725), (480, 4.068172578230759)], 2: [(48516, 4.318508502622569), (58559, 4.222127188436101), (79132, 4.133146709500859), (89774, 3.480232949498817), (86345, 3.3864060320401306), (114060, 3.2583892617449663)], 3: [(527, 4.286708951236668), (3949, 3.761813206259339), (2090, 3.6145142009836406), (1302, 3.579548549309936), (2424, 3.1635174507918737), (4518, 2.9686648501362396), (6238, 2.9077280741625926), (72378, 2.608417429023452)], 4: [(4273, 5), (1733, 4.473307649972481), (898, 4.44796

Done computing similarity matrix.
RMSE 0.8931567491483258
MAE 0.6817166252395642
Fscore: 0.4874880873166524
Average Precision: 0.8283300806661458
Average Recall: 0.3453734960534017
Average Normalized Discounted Cumulative Gain: 0.8786941238041117
Top recommendations:  {1: [(3740, 4.897572059024423), (3729, 4.859957781084542), (1219, 4.8407707845900205), (1222, 4.838449572513172), (940, 4.814152921110558), (1206, 4.794724985691668), (1967, 4.776595588092809), (2596, 4.75295072937716), (1089, 4.694541236431954), (2078, 4.694331889004188)], 2: [(48516, 4.526310254852732), (58559, 4.455904845462433), (79132, 4.386155562336086), (89774, 4.049857707622501), (86345, 4.02905587940208), (114060, 3.914753649126285)], 3: [(527, 3.4374801507488906), (3949, 3.215123192480142), (2090, 3.0953015992012736), (1302, 2.8657148910335977), (2424, 2.494756077296455), (6238, 2.3768678227435385), (4518, 2.2498497608884382), (72378, 2.036439434481681)], 4: [(4273, 4.790848686197523), (2203, 4.3114320994162085)