In [2]:
import pickle
import os
import pandas as pd
from surprise import SVD, SVDpp
from surprise import KNNBasic, KNNBaseline, BaselineOnly
from surprise import Dataset                                                     
from surprise import Reader                                                      
from surprise import dump
from surprise.accuracy import rmse

In [3]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [4]:
file_path_train = './data/train.csv'
file_path_test = './data/test.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [5]:
print(traindf.head())

   userId  movieId  rating  timestamp  \
0       1        1     4.0  964982703   
1       1        3     4.0  964981247   
2       1        6     4.0  964982224   
3       1       70     3.0  964982400   
4       1      101     5.0  964980868   

                                              genres tag  
0  ['Adventure', 'Animation', 'Children', 'Comedy...  []  
1                              ['Comedy', 'Romance']  []  
2                    ['Action', 'Crime', 'Thriller']  []  
3         ['Action', 'Comedy', 'Horror', 'Thriller']  []  
4        ['Adventure', 'Comedy', 'Crime', 'Romance']  []  


In [6]:
algo_svd = SVD()     
algo_svdpp = SVDpp()                                    
algo_knn = KNNBasic()

algo_svd.fit(trainset)                             
predictions_svd = algo_svd.test(testset)

algo_svdpp.fit(trainset)                             
predictions_svdpp = algo_svdpp.test(testset)

algo_knn.fit(trainset)
predictions_knn = algo_knn.test(testset)

# rmse(predictions_svd)
# rmse(predictions_knn)                                                                           

dump.dump('./dump_SVD', predictions_svd, algo_svd)
dump.dump('./dump_SVDpp', predictions_svdpp, algo_svdpp)
dump.dump('./dump_KNN', predictions_knn, algo_knn)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details']) 
df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details'])        
df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details']) 

In [8]:
sim_options = {'name': 'pearson_baseline',
               'user_based': False  # compute  similarities between items
               }
# algo = KNNBaseline(sim_options=sim_options)
algo_knnbaseline = KNNBaseline(sim_options=sim_options)
algo_knnbaseline.fit(trainset)
predictions_knnbaseline = algo_knnbaseline.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [9]:
df_knnbaseline = pd.DataFrame(predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details']) 
df_knnbaseline['err'] = abs(df_knnbaseline.est - df_knnbaseline.rui)
df_knnbaseline['sqr_err'] = (df_knnbaseline.est - df_knnbaseline.rui)**2

In [10]:
df_svd['err'] = abs(df_svd.est - df_svd.rui)
df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)
df_knn['err'] = abs(df_knn.est - df_knn.rui)     

In [11]:
df_svd['sqr_err'] = (df_svd.est - df_svd.rui)**2
df_svdpp['sqr_err'] = (df_svdpp.est - df_svdpp.rui)**2
df_knn['sqr_err'] = (df_knn.est - df_knn.rui)**2  

In [12]:
algo_baselineonly = BaselineOnly()
algo_baselineonly.fit(trainset)
predictions_baselineonly = algo_baselineonly.test(testset)

Estimating biases using als...


In [13]:
def get_Iu(uid):
    """Return the number of items rated by given user
    
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    
    try:
        return traindf[traindf['userId'] == uid].shape[0]
    except ValueError:  # user was not part of the trainset
        return 0

In [14]:
df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details']) 
df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)
df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2
df_baselineonly['Iu'] = df_baselineonly.uid.apply(get_Iu)

In [15]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True  # compute  similarities between items
               }
algo_knnbaseline_user = KNNBaseline(sim_options=sim_options)
algo_knnbaseline_user.fit(trainset)
predictions_knnbaseline_user = algo_knnbaseline_user.test(testset)

df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details']) 
df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)
df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2
df_knn_user['Iu'] = df_knn_user.uid.apply(get_Iu)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [16]:
#df_svd.head()
content = pd.read_csv('./data/movie_era_based_genre_ratings.csv')

In [17]:
content['Iu'] = content.userId.apply(get_Iu)

In [18]:
content['err'] = abs(content.pred_rating - content.og_rating)
content['sqr_err'] = (content.pred_rating - content.og_rating)**2
# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5
# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()

In [19]:
print("Movie Era based                 ",content[content.Iu < 18].err.mean())
print("Movie Era based                ",content[content.Iu < 18].sqr_err.mean()** .5)

Movie Era based                  7.981078721545614
Movie Era based                 8.371531888942888


In [20]:
content = pd.read_csv('./data/movie_year_based_genre_ratings.csv')
content['Iu'] = content.userId.apply(get_Iu)
content['err'] = abs(content.pred_rating - content.og_rating)
content['sqr_err'] = (content.pred_rating - content.og_rating)**2
# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5
# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()

In [21]:
print("Movie Year based                 ",content[content.Iu < 18].err.mean())
print("Movie Year based                ",content[content.Iu < 18].sqr_err.mean()** .5)

Movie Year based                  0.9339830889246196
Movie Year based                 1.2688832561653924


In [22]:
content = pd.read_csv('./data/content_based_genre_ratings.csv')
content['Iu'] = content.userId.apply(get_Iu)
content['err'] = abs(content.pred_rating - content.og_rating)
content['sqr_err'] = (content.pred_rating - content.og_rating)**2
# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5
# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()

In [23]:
print("Content based                 ",content[content.Iu < 18].err.mean())
print("Content based                ",content[content.Iu < 18].sqr_err.mean()** .5)

Content based                  0.827501672245648
Content based                 1.100062700607275


In [24]:
df_knn['Iu'] = df_knn.uid.apply(get_Iu)
df_svd['Iu'] = df_svd.uid.apply(get_Iu)
df_svdpp['Iu'] = df_svdpp.uid.apply(get_Iu)
df_knnbaseline['Iu'] = df_knnbaseline.uid.apply(get_Iu)

In [25]:
print("--------------------------MAE-----------------------")
print("KNN Basic                 ",df_knn[df_knn.Iu < 18].err.mean())
print("SVD                       ", df_svd[df_svd.Iu < 18].err.mean())
print("SVDpp                     ",  df_svdpp[df_svdpp.Iu < 18].err.mean())
print("KNN Baseline (item-item)  ", df_knnbaseline[df_knnbaseline.Iu < 18].err.mean())
print("BaselineOnly              ",df_baselineonly[df_baselineonly.Iu < 18].err.mean() )
print("KNN Baseline (user-user)  ",df_knn_user[df_knn_user.Iu < 18].err.mean() )

--------------------------MAE-----------------------
KNN Basic                  0.860741703497014
SVD                        0.78570893327357
SVDpp                      0.7690157799172334
KNN Baseline (item-item)   0.7737077437833596
BaselineOnly               0.8010726412517467
KNN Baseline (user-user)   0.8012690404978389


In [26]:
print("--------------------------RMSE-----------------------")
print("KNN Basic                ",df_knn[df_knn.Iu < 18].sqr_err.mean()** .5)
print("SVD                      ", df_svd[df_svd.Iu < 18].sqr_err.mean()** .5)
print("SVDpp                    ",  df_svdpp[df_svdpp.Iu < 18].sqr_err.mean()** .5)
print("KNN Baseline (item-item) ", df_knnbaseline[df_knnbaseline.Iu < 18].sqr_err.mean()** .5)
print("BaselineOnly             ",df_baselineonly[df_baselineonly.Iu < 18].sqr_err.mean()** .5 )
print("KNN Baseline (user-user) ",df_knn_user[df_knn_user.Iu < 18].sqr_err.mean()** .5)

--------------------------RMSE-----------------------
KNN Basic                 1.1314625487445116
SVD                       1.014166885413943
SVDpp                     0.9866917408413298
KNN Baseline (item-item)  1.0235535421728008
BaselineOnly              1.0290179273136122
KNN Baseline (user-user)  1.0451593116040523


In [27]:
print("--------------------------MAE-----------------------")
print("KNN Basic                 ",df_knn[df_knn.Iu > 1000].err.mean())
print("SVD                       ", df_svd[df_svd.Iu > 1000].err.mean())
print("SVDpp                     ",  df_svdpp[df_svdpp.Iu > 1000].err.mean())
print("KNN Baseline (item-item)  ", df_knnbaseline[df_knnbaseline.Iu > 1000].err.mean())
print("BaselineOnly              ",df_baselineonly[df_baselineonly.Iu > 1000].err.mean() )
print("KNN Baseline (user-user)  ",df_knn_user[df_knn_user.Iu > 1000].err.mean() )

--------------------------MAE-----------------------
KNN Basic                  0.6770059296849504
SVD                        0.5946544077399245
SVDpp                      0.5600646224501096
KNN Baseline (item-item)   0.5936789032801404
BaselineOnly               0.5912259768198899
KNN Baseline (user-user)   0.5537933404473195


In [28]:
print("--------------------------RMSE-----------------------")
print("KNN Basic                ",df_knn[df_knn.Iu > 1000].sqr_err.mean()** .5)
print("SVD                      ", df_svd[df_svd.Iu > 1000].sqr_err.mean()** .5)
print("SVDpp                    ",  df_svdpp[df_svdpp.Iu > 1000].sqr_err.mean()** .5)
print("KNN Baseline (item-item) ", df_knnbaseline[df_knnbaseline.Iu > 1000].sqr_err.mean()** .5)
print("BaselineOnly             ",df_baselineonly[df_baselineonly.Iu > 1000].sqr_err.mean()** .5 )
print("KNN Baseline (user-user) ",df_knn_user[df_knn_user.Iu > 1000].sqr_err.mean()** .5)

--------------------------RMSE-----------------------
KNN Basic                 0.863920188096423
SVD                       0.7570413461300195
SVDpp                     0.7186015351292833
KNN Baseline (item-item)  0.7592298107731127
BaselineOnly              0.7485422640757043
KNN Baseline (user-user)  0.7138672413993307


In [29]:
iid_df = traindf.groupby(['userId'],as_index=False).movieId.count()
iid_df.movieId.max()

1269