In [21]:
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import Reader
from surprise.model_selection import KFold
import random
import math

In [38]:
# c.26m user-movie ratings
ratings_df = pd.read_csv("ratings.csv")

In [39]:
# shuffle dataframe and then split out some data for unbiased testing
ratings_df.drop(['timestamp'], axis = 1, inplace = True)
ratings_df = ratings_df.sample(frac=1).reset_index(drop=True)
test_train_split = math.floor(len(ratings_df)*.8)
train = ratings_df[:test_train_split]
test = ratings_df[test_train_split:]

In [42]:
# using surprise package: initialise reader with 1 - 5 scale, put trainset into Suprise 'wrapper'
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train, reader)
algo = SVD()

In [43]:
# get a baseline RSME score with default parameters
trainset, testset = train_test_split(data, test_size=.25)
algo.fit(trainset)
predictions = algo.test(testset)

In [51]:
accuracy.rmse(predictions, verbose = True)

RMSE: 0.8123


0.8123478193214823

In [52]:
# Attempt to tune parameters to improve RMSE 
# it's computationally expensive so limited ability to finely tune, but
# number of factors is the obvious improvement opporunity
param_grid = {'n_factors': [10,25,50,150]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], n_jobs = -1, cv=3, refit = True)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
gscv_results_df = pd.DataFrame.from_dict(gs.cv_results)



0.8178917027638771
{'n_factors': 50}


In [54]:
# The highest mean test RMSE was with 50 factors - but interestingly this did not improve on the baseline score 
# with default parameters.
# Further parameter tuning - for example, experimenting with the learning rate of number of epochs, 
# may yield improved results.
# However we can note that an RMSE of 0.82 compares well with, for example, the orignal netflix prize of 2009, RMSE 0.86
# The computational expense of GridSearchCV sadly meant that further tuning of parameters was not possible
# For this reason confirmation of generalisation of the results through a test of unseen data was also not completed
gscv_results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors
0,0.822747,0.824188,0.823705,0.823546,0.000599,4,536.813839,66.149059,343.545335,74.102112,{'n_factors': 10},10
1,0.818293,0.819465,0.818463,0.81874,0.000517,2,839.940263,181.062887,891.278536,416.586979,{'n_factors': 25},25
2,0.817391,0.818302,0.817982,0.817892,0.000378,1,1052.24749,124.973571,922.822843,531.629237,{'n_factors': 50},50
3,0.819878,0.820911,0.82006,0.820283,0.00045,3,2269.480941,560.156583,115.386146,30.535429,{'n_factors': 150},150
