# Models Hyperparameters tuning

In [1]:
"""
Created on Tue July 21 8:15:52 2020

@author: Muhammad Imran Shaikh
"""

'\nCreated on Tue July 21 8:15:52 2020\n\n@author: Muhammad Imran Shaikh\n'

## Loading Libraries

In [4]:
import io 
import os
import csv
from surprise import Reader # Reader Function
import sys
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import accuracy

## Loading Data

In [5]:
class MovieLens:

    movieID_to_name = {}
    name_to_movieID = {}
    ratingsPath = '/content/ratings.csv'
    moviesPath = '/content/movies.csv'
    
    def loadMovieLensLatestSmall(self):

        os.chdir(os.path.dirname(sys.argv[0])) # Look for files relative to the directory we are running from

        ratingsDataset = 0
        self.movieID_to_name = {}
        self.name_to_movieID = {}

        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.moviesPath, newline='') as csvfile:
                movieReader = csv.reader(csvfile)
                next(movieReader)  #Skip header line
                for row in movieReader:
                    movieID = int(row[0])
                    movieName = row[1]
                    self.movieID_to_name[movieID] = movieName
                    self.name_to_movieID[movieName] = movieID

        return ratingsDataset
    
    # Fetching Movie name based on movie id
    def getMovieName(self, movieID):
        if movieID in self.movieID_to_name:
            return self.movieID_to_name[movieID]
        else:
            return ""

In [6]:
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()

# Training and testing Data

In [7]:
trainset, testset = train_test_split(data, test_size=.25)

# Using GridSearchCV for algorithms Tuning 

### Content based filtering Using KNNBaseline

In [None]:
# Setting parameters for Content based Filtering algorithm
param_grid = {'k': [10, 20 , 30],
              'bsl_options': {'method': ['sgd'],
                              'n_epochs': [1, 3, 5],
                              'learning_rate': [.00005,.00003]
                              },
              'sim_options': {'name': ['pearson_baseline', 'cosine'],
                              'user_based': [False]}
              }

gs_KBL = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
gs_KBL.fit(data)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done comput

Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix.

In [None]:
# We can now use the algorithm that yields the best rmse:
algo = gs_KBL.best_estimator['rmse']  # pass the best model to algo
print(gs_KBL.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_KBL.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


0.9215554102860016
{'k': 30, 'bsl_options': {'method': 'sgd', 'n_epochs': 5, 'learning_rate': 5e-05}, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9057  0.8962  0.9166  0.9053  0.9208  0.9089  0.0088  
MAE (testset)     0.6816  0.6767  0

{'test_rmse': array([0.90567327, 0.89620608, 0.91657182, 0.90531585, 0.92081002]),
 'test_mae': array([0.68161864, 0.67674076, 0.69248837, 0.68311319, 0.69005613]),
 'fit_time': (12.891018629074097,
  13.149861574172974,
  12.930994272232056,
  12.918000936508179,
  12.915004253387451),
 'test_time': (7.919735908508301,
  7.507350921630859,
  7.482368469238281,
  8.064008951187134,
  7.743207693099976)}

In [None]:
# Use trainset with the new parameters
bsl_options = {'method': 'sgd',
               'n_epochs': 5,
               'learning_rate': .00001
               }

sim_options = {'name': 'pearson_baseline',
               'user_based': False
              }

algo_KBL = KNNBaseline(k=30, bsl_options=bsl_options, sim_options=sim_options)
algo_KBL.fit(trainset)
test_pred_KBL = algo_KBL.test(testset)
print("Content Based Filtering : Test Set")
accuracy.rmse(test_pred_KBL, verbose=True)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Content Based Filtering : Test Set
RMSE: 0.9116


0.9115814248534567

# Collaborative Filtering

## User-based Collaborative Filtering

In [None]:
# Setting parameters for User based collaborative Filtering algorithm
param_grid_KWM_UB = {'k': [10, 20, 50], 'n_epochs': [1, 3, 5],
                              'learning_rate': [.00001,.00003,.00005],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [True]}
              }
gs_KWM_UB = GridSearchCV(KNNWithMeans, param_grid_KWM_UB, measures=['rmse', 'mae'], cv=3)
gs_KWM_UB.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity m

In [None]:
# We can now use the algorithm that yields the best rmse:
algo_KWM_UB = gs_KWM_UB.best_estimator['rmse']  # pass the best model to algo
print(gs_KWM_UB.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_KWM_UB.best_params['rmse'])
cross_validate(algo_KWM_UB, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

0.9269129465163269
{'k': 50, 'n_epochs': 1, 'learning_rate': 1e-05, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': True}}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9190  0.9186  0.9200  0.9203  0.9252  0.9206  0.0024  
MAE (testset)     0.7034  0.7019  0.7036  0.7070  0.7053  0.7042  0.0017  
Fit time          0.27    0.31    0.29    0.30    0.31    0.30    0.01    
Test time         2.20    2.14    2.36    2.05    2.18    2.19    0.10    


{'test_rmse': array([0.91895363, 0.91858846, 0.91997131, 0.92029215, 0.92515162]),
 'test_mae': array([0.70340975, 0.70189836, 0.70359591, 0.70699692, 0.7052642 ]),
 'fit_time': (0.2728304862976074,
  0.3088066577911377,
  0.2908186912536621,
  0.29981303215026855,
  0.30580782890319824),
 'test_time': (2.1966395378112793,
  2.1356797218322754,
  2.3635663986206055,
  2.0537304878234863,
  2.183649778366089)}

In [None]:
sim_options = {'name': 'cosine',
               'user_based': True
              }

algo_KWM_U = KNNWithMeans(k=30, sim_options=sim_options)
algo_KWM_U.fit(trainset)
test_pred_KWM_U = algo_KWM_U.test(testset)
print("User based Collaborative Filtering : Test Set")
accuracy.rmse(test_pred_KWM_U, verbose=True)

## Item-based Collaborative Filtering

In [None]:
# Setting parameters for Item based collaborative Filtering algorithm
param_grid_KWM_IB = {'k': [10, 20, 30], 'n_epochs': [1, 3, 5],
                              'learning_rate': [.00005],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [False]}
              }
gs_KWM_IB = GridSearchCV(KNNWithMeans, param_grid_KWM_IB, measures=['rmse', 'mae'], cv=3)
gs_KWM_IB.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

In [None]:
# We can now use the algorithm that yields the best rmse:
algo_KWM_IB = gs_KWM_IB.best_estimator['rmse']  # pass the best model to algo
print(gs_KWM_IB.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_KWM_IB.best_params['rmse'])
cross_validate(algo_KWM_IB, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

0.9252396456269577
{'k': 30, 'n_epochs': 1, 'learning_rate': 5e-05, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': False}}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9182  0.9213  0.9102  0.9184  0.9200  0.9176  0.0039  
MAE (testset)     0.6998  0.7055  0.7000  0.7055  0.7049  0.7031  0.0027  
Fit time          5.61    5.43    5.11    4.96    5.15    5.25    0.24    
Test time         8.75    8.32    8.75    8.71    8.86    8.68    0.18    


{'test_rmse': array([0.9181764 , 0.92125791, 0.9101605 , 0.91839832, 0.92000576]),
 'test_mae': array([0.69976229, 0.70545575, 0.69997366, 0.70554032, 0.70488343]),
 'fit_time': (5.611497640609741,
  5.425121784210205,
  5.1080076694488525,
  4.957285165786743,
  5.150495529174805),
 'test_time': (8.750431299209595,
  8.321145057678223,
  8.748647212982178,
  8.714674472808838,
  8.857162237167358)}

In [None]:
print(gs_KWM_IB.best_score['mae'])

0.7086727698563725


In [None]:
sim_options = {'name': 'cosine',
               'user_based': True
              }

algo_KWM_I = KNNWithMeans(k=30, sim_options=sim_options)
algo_KWM_I.fit(trainset)
test_pred_KWM_I = algo_KWM_I.test(testset)
print("Item based Collaborative Filtering : TestSet")
accuracy.rmse(test_pred_KWM_I, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Content Based Filtering : Test Set
RMSE: 0.9223


0.9223009280354765

# Matrix Factorization

## SVD

In [12]:
param_grid_svd = {'n_factors': [20, 30, 40], 'n_epochs': [20, 30, 40], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse', 'mae'], cv=3)
gs_svd.fit(data)
algo_svd = gs_svd.best_estimator['rmse']
print(gs_svd.best_score['rmse'])
print(gs_svd.best_params['rmse'])
cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

0.8872329177166806
{'n_factors': 40, 'n_epochs': 40, 'lr_all': 0.008, 'reg_all': 0.1}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8772  0.8853  0.8776  0.8776  0.8805  0.8796  0.0031  
MAE (testset)     0.6734  0.6831  0.6751  0.6747  0.6758  0.6764  0.0034  
Fit time          4.93    4.99    4.75    4.90    4.85    4.88    0.08    
Test time         0.13    0.14    0.19    0.19    0.13    0.15    0.03    


{'fit_time': (4.934303045272827,
  4.989953517913818,
  4.750361442565918,
  4.8974151611328125,
  4.851241827011108),
 'test_mae': array([0.67341237, 0.68306587, 0.67506472, 0.67466213, 0.67579885]),
 'test_rmse': array([0.87715453, 0.88526796, 0.87762364, 0.87755278, 0.88045096]),
 'test_time': (0.1259326934814453,
  0.13767528533935547,
  0.19074273109436035,
  0.19427180290222168,
  0.12610077857971191)}

In [13]:
# Use the new parameters with the train data on SVD algorithm
algo_svd = SVD(n_factors=40, n_epochs=40, lr_all=0.005, reg_all=0.1)
algo_svd.fit(trainset)
test_pred = algo_svd.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)


SVD : Test Set
RMSE: 0.8886


0.88861112602393

## SVD++

In [None]:
# Setting parameters for Matrix factorization SVDpp algorithm
param_grid_svdpp = {'n_factors': [20, 30, 40], 'n_epochs': [20,30,40], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}
gs_svdpp = GridSearchCV(SVDpp, param_grid_svdpp, measures=['rmse', 'mae'], cv=3)
gs_svdpp.fit(data)
algo_svdpp = gs_svdpp.best_estimator['rmse']
print(gs_svdpp.best_score['rmse'])
print(gs_svdpp.best_params['rmse'])
cross_validate(algo_svdpp, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [9]:
# Use the new parameters with the train data on SVD++ algorithm
algo_svdpp = SVDpp(n_factors=40, n_epochs=40, lr_all=0.008, reg_all=0.1)
algo_svdpp = SVDpp()
algo_svdpp.fit(trainset)
test_pred = algo_svdpp.test(testset)
print("SVD++ : Test Set")
accuracy.rmse(test_pred, verbose=True)

SVD++ : Test Set
RMSE: 0.8821


0.8820609964641091