In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')
%matplotlib inline

In [2]:
from data_preparation import *

In [19]:
from surprise import SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.search import GridSearchCV

In [4]:
(user_item_train, n_users_train, n_items_train), (user_item_test, n_users_test, n_items_test), movie_data = data_preparation_pipeline()

In [5]:
movie_data_3_cols = movie_data[['userId', 'movieId', 'rating']]
movie_data_3_cols.head()

Unnamed: 0,userId,movieId,rating
0,1,1,5.0
1,1,2,4.5
2,1,3,4.0
3,1,4,3.5
4,1,5,5.0


In [6]:
reader = Reader()
movie_data_sur = Dataset.load_from_df(movie_data_3_cols, reader=reader)


In [23]:
svd = SVD(n_factors=200, n_epochs=50, lr_all=1e-2, reg_all=1e-1)
# perf_svd = cross_validate(svd, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)
parameters = {'n_factors':[50, 100, 200, 500], 'n_epochs':[5, 10, 20, 30], 'lr_all':[5e-4, 1e-3, 3e-3, 5e-3, 1e-2], 'reg_all':[5e-3, 2e-2, 5e-2, 1e-1]}
svd = GridSearchCV(SVD, parameters, n_jobs=-1, joblib_verbose=1, measures=['rmse', 'mae'])
svd.fit(movie_data_sur)
(svd.best_score['rmse'], svd.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 53.9min
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 108.7min finished


(0.8553407824966703,
 {'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1})

In [32]:
svd = SVD(n_factors=200, n_epochs=50, lr_all=1e-2, reg_all=1e-1)
perf_svd = cross_validate(svd, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8675  0.8592  0.8627  0.8631  0.0034  
MAE (testset)     0.6648  0.6595  0.6604  0.6616  0.0023  
Fit time          16.61   17.69   17.60   17.30   0.49    
Test time         0.22    0.41    0.22    0.28    0.09    


In [9]:
svdpp = SVDpp()
perf_svdpp = cross_validate(svdpp, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8713  0.8748  0.8734  0.8732  0.0014  
MAE (testset)     0.6647  0.6718  0.6689  0.6685  0.0029  
Fit time          296.26  317.16  317.12  310.18  9.84    
Test time         8.73    9.79    9.99    9.50    0.55    


In [10]:
nmf = NMF()
perf_nmf = cross_validate(nmf, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9401  0.9386  0.9401  0.9396  0.0007  
MAE (testset)     0.7228  0.7230  0.7200  0.7219  0.0014  
Fit time          5.58    5.55    5.55    5.56    0.01    
Test time         0.22    0.21    0.29    0.24    0.04    


In [11]:
s_one = SlopeOne()
perf_s_one = cross_validate(s_one, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9093  0.9135  0.8961  0.9063  0.0074  
MAE (testset)     0.6996  0.6989  0.6877  0.6954  0.0055  
Fit time          3.27    3.31    3.20    3.26    0.04    
Test time         6.67    6.35    6.45    6.49    0.14    


In [12]:
knn_basic = KNNBasic()
perf_knn_basic = cross_validate(knn_basic, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9743  0.9730  0.9631  0.9701  0.0050  
MAE (testset)     0.7496  0.7474  0.7426  0.7465  0.0029  
Fit time          0.17    0.26    0.19    0.21    0.04    
Test time         2.31    2.49    2.25    2.35    0.10    


In [13]:
knnm = KNNWithMeans()
perf_knnm = cross_validate(knnm, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9090  0.8967  0.8970  0.9009  0.0057  
MAE (testset)     0.6972  0.6886  0.6885  0.6914  0.0041  
Fit time          0.19    0.21    0.22    0.21    0.01    
Test time         2.50    2.57    2.72    2.60    0.09    


In [14]:
knn_base = KNNBaseline()
perf_knn_base = cross_validate(knn_base, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8850  0.8800  0.8905  0.8852  0.0043  
MAE (testset)     0.6775  0.6747  0.6823  0.6782  0.0031  
Fit time          0.40    0.42    0.40    0.41    0.01    
Test time         2.98    3.14    3.09    3.07    0.07    


In [15]:
ccluster = CoClustering()
perf_ccluster = cross_validate(ccluster, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9508  0.9603  0.9487  0.9533  0.0051  
MAE (testset)     0.7403  0.7475  0.7399  0.7425  0.0035  
Fit time          2.52    2.46    2.46    2.48    0.03    
Test time         0.17    0.29    0.16    0.21    0.06    


In [16]:
baseline = BaselineOnly()
perf_baseline = cross_validate(baseline, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8797  0.8769  0.8723  0.8763  0.0030  
MAE (testset)     0.6770  0.6762  0.6705  0.6746  0.0029  
Fit time          0.23    0.24    0.23    0.23    0.00    
Test time         0.15    0.28    0.15    0.19    0.06    


In [17]:
random = NormalPredictor()
perf_random = cross_validate(random, movie_data_sur, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4133  1.4123  1.4083  1.4113  0.0021  
MAE (testset)     1.1230  1.1260  1.1231  1.1240  0.0014  
Fit time          0.10    0.11    0.11    0.10    0.01    
Test time         0.35    0.38    0.22    0.32    0.07    


In [18]:
print(perf_svd)
print(perf_svdpp)
print(perf_nmf)
print(perf_s_one)
print(perf_knn_basic)
print(perf_knnm)
print(perf_knn_base)
print(perf_ccluster)
print(perf_baseline)
print(perf_random)

{'test_rmse': array([0.88205183, 0.87686378, 0.88822508]), 'test_mae': array([0.67663982, 0.67482192, 0.68094366]), 'fit_time': (4.161578178405762, 4.121702432632446, 3.8688015937805176), 'test_time': (0.2379605770111084, 0.20350933074951172, 0.2650628089904785)}
{'test_rmse': array([0.87129123, 0.87482021, 0.87339535]), 'test_mae': array([0.66474133, 0.67177045, 0.66890854]), 'fit_time': (296.2581639289856, 317.15815806388855, 317.12257146835327), 'test_time': (8.731106281280518, 9.786488056182861, 9.989471435546875)}
{'test_rmse': array([0.94010931, 0.93864227, 0.94010386]), 'test_mae': array([0.7227882 , 0.72295063, 0.72000433]), 'fit_time': (5.578952074050903, 5.553024768829346, 5.5501625537872314), 'test_time': (0.21967554092407227, 0.21067571640014648, 0.29061388969421387)}
{'test_rmse': array([0.90929029, 0.91346789, 0.89612937]), 'test_mae': array([0.6996393 , 0.69890275, 0.68766585]), 'fit_time': (3.272859811782837, 3.3075997829437256, 3.203626871109009), 'test_time': (6.67494

Pretty printing has been turned ON
