# Hyperparameter tuning

## Loading Libraries

In [24]:
import io 
import os
import csv
from surprise import Reader # Reader Function
import sys
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import accuracy

## Loading Data

In [25]:
class MovieLens:

    movieID_to_name = {}
    name_to_movieID = {}
    ratingsPath = 'C:/Users/Imran/Desktop/Thesis coding/New_Coding/ml-latest-small/ratings.csv'
    moviesPath = 'C:/Users/Imran/Desktop/Thesis coding/New_Coding/ml-latest-small/movies.csv'
    
    def loadMovieLensLatestSmall(self):

        os.chdir(os.path.dirname(sys.argv[0])) # Look for files relative to the directory we are running from

        ratingsDataset = 0
        self.movieID_to_name = {}
        self.name_to_movieID = {}

        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
                movieReader = csv.reader(csvfile)
                next(movieReader)  #Skip header line
                for row in movieReader:
                    movieID = int(row[0])
                    movieName = row[1]
                    self.movieID_to_name[movieID] = movieName
                    self.name_to_movieID[movieName] = movieID

        return ratingsDataset
    
    # Fetching Movie name based on movie id
    def getMovieName(self, movieID):
        if movieID in self.movieID_to_name:
            return self.movieID_to_name[movieID]
        else:
            return ""

In [26]:
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()

In [27]:
trainset, testset = train_test_split(data, test_size=.25)

# Using GridSearchCV for algorithms Tuning 

### Content based filtering Using KNNBaseline

In [28]:
param_grid = {'k': [10, 20 , 30],
              'bsl_options': {'method': ['sgd'],
                              'n_epochs': [1, 3, 5],
                              'learning_rate': [.00005,.00003]
                              },
              'sim_options': {'name': ['pearson_baseline', 'cosine'],
                              'user_based': [False]}
              }

gs_KBL = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
gs_KBL.fit(data)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done comput

Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix.

In [29]:
# We can now use the algorithm that yields the best rmse:
algo = gs_KBL.best_estimator['rmse']  # pass the best model to algo
print(gs_KBL.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_KBL.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


0.9215554102860016
{'k': 30, 'bsl_options': {'method': 'sgd', 'n_epochs': 5, 'learning_rate': 5e-05}, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9057  0.8962  0.9166  0.9053  0.9208  0.9089  0.0088  
MAE (testset)     0.6816  0.6767  0

{'test_rmse': array([0.90567327, 0.89620608, 0.91657182, 0.90531585, 0.92081002]),
 'test_mae': array([0.68161864, 0.67674076, 0.69248837, 0.68311319, 0.69005613]),
 'fit_time': (12.891018629074097,
  13.149861574172974,
  12.930994272232056,
  12.918000936508179,
  12.915004253387451),
 'test_time': (7.919735908508301,
  7.507350921630859,
  7.482368469238281,
  8.064008951187134,
  7.743207693099976)}

In [32]:
# Use trainset with the new parameters
bsl_options = {'method': 'sgd',
               'n_epochs': 5,
               'learning_rate': .00001
               }

sim_options = {'name': 'pearson_baseline',
               'user_based': False
              }

algo_KBL = KNNBaseline(k=30, bsl_options=bsl_options, sim_options=sim_options)
algo_KBL.fit(trainset)
test_pred_KBL = algo_KBL.test(testset)
print("Content Based Filtering : Test Set")
accuracy.rmse(test_pred_KBL, verbose=True)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Content Based Filtering : Test Set
RMSE: 0.9116


0.9115814248534567

# Collaborative Filtering

## User-based Collaborative Filtering

In [None]:
param_grid_KWM_UB = {'k': [10, 20, 50], 'n_epochs': [1, 3, 5],
                              'learning_rate': [.00001,.00003,.00005]
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [True]}
              }
gs_KWM_UB = GridSearchCV(KNNWithMeans, param_grid_KWM_UB, measures=['rmse', 'mae'], cv=3)
gs_KWM_UB.fit(data)

In [None]:
# We can now use the algorithm that yields the best rmse:
algo_KWM_UB = gs_KWM_UB.best_estimator['rmse']  # pass the best model to algo
print(gs_KWM_UB.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_KWM_UB.best_params['rmse'])
cross_validate(algo_KWM_UB, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:

sim_options = {'name': 'pearson_baseline',
               'user_based': False
              }

algo_KWM_U = KNNWithMeans(k=30, sim_options=sim_options)
algo_KBL.fit(trainset)
test_pred_KBL = algo_KBL.test(testset)
print("Content Based Filtering : Test Set")
accuracy.rmse(test_pred_KBL, verbose=True)