In [144]:
import pandas as pd
import numpy as np
import time
from surprise import SVD, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump
     

In [145]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe,
                                            rating_scale=(0, 5)):
    """Wrap train/test rating DataFrames in Surprise data structures.

    Parameters
    ----------
    training_dataframe, testing_dataframe : pandas.DataFrame
        Frames that contain at least the columns 'userId', 'movieId' and
        'rating'. Only those three columns (in that order) are handed to
        Surprise; any other column is ignored.
    rating_scale : tuple, optional
        ``(min, max)`` pair forwarded to ``Reader``. Defaults to ``(0, 5)``,
        the value that used to be hard-coded here.
        NOTE(review): the sample ratings shown in this notebook include
        half-star values -- confirm the true minimum rating of the data.

    Returns
    -------
    tuple
        ``(trainDataset, testDataset, trainset, testset)`` where the two
        ``*Dataset`` objects come from ``Dataset.load_from_df``, ``trainset``
        is ``trainDataset.build_full_trainset()`` and ``testset`` is the
        result of ``build_testset()`` on the full trainset built from the
        test data.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=rating_scale)

    trainDataset = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    testDataset = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    trainset = trainDataset.build_full_trainset()
    # The test ratings take a detour through a full trainset so that
    # build_testset() can turn every one of them into a test entry.
    testset = testDataset.build_full_trainset().build_testset()
    return trainDataset, testDataset, trainset, testset

In [146]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'

# The training CSV was written together with its row index, which pandas reads
# back as a meaningless 'Unnamed: 0' column. Drop it when present;
# errors='ignore' keeps this a no-op for a file saved without that column.
traindf = pd.read_csv(file_path_train).drop(columns='Unnamed: 0', errors='ignore')
testdf = pd.read_csv(file_path_test).drop(columns='Unnamed: 0', errors='ignore')

# Fail early, with a readable message, if the columns needed downstream are missing.
assert {'userId', 'movieId', 'rating'} <= set(traindf.columns), \
    'training_data.csv must contain userId, movieId and rating columns'
assert {'userId', 'movieId', 'rating'} <= set(testdf.columns), \
    'testing_data.csv must contain userId, movieId and rating columns'

trainDataset, testDataset, trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [147]:
# Peek at the first five training rows (five is head()'s default).
traindf.head()

Unnamed: 0,userId,movieId,title,rating
0,1,1371,Rocky III,2.5
1,1,2105,American Pie,4.0
2,1,2193,My Tutor,2.0
3,1,2294,Jay and Silent Bob Strike Back,2.0
4,2,62,2001: A Space Odyssey,3.0


In [148]:
def recommendation(algo, trainset, testset):
    """Fit ``algo`` on ``trainset``, evaluate it on ``testset`` and time both steps.

    Parameters
    ----------
    algo
        Object exposing ``fit(trainset)`` and ``test(testset)``.
    trainset
        Passed unchanged to ``algo.fit``.
    testset
        Passed unchanged to ``algo.test``.

    Returns
    -------
    tuple
        ``(test_rmse, test_mae, test_predictions, fit_time, test_time)``:
        the values returned by ``accuracy.rmse`` and ``accuracy.mae`` for the
        test predictions, the predictions themselves, and the elapsed seconds
        spent in ``algo.fit`` and ``algo.test`` respectively.
    """
    # perf_counter() is monotonic, so the measured durations cannot be skewed
    # (or turn negative) when the system wall clock is adjusted mid-run.
    fit_start = time.perf_counter()
    algo.fit(trainset)
    fit_time = time.perf_counter() - fit_start

    # Predictions on the testing set
    test_start = time.perf_counter()
    test_predictions = algo.test(testset)
    test_time = time.perf_counter() - test_start

    # Called with default arguments; in this notebook both helpers also print
    # the metric (the "RMSE: ..." / "MAE: ..." lines in the cell output).
    test_rmse = accuracy.rmse(test_predictions)
    test_mae = accuracy.mae(test_predictions)

    return test_rmse, test_mae, test_predictions, fit_time, test_time

In [149]:

# Seed NumPy's global RNG before building the model: with no random_state
# given, the randomly initialised SVD fit would otherwise produce slightly
# different RMSE/MAE on every run of this cell.
np.random.seed(42)
svd_algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
test_rmse, test_mae, test_predictions, fit_time, test_time = recommendation(svd_algo, trainset, testset)
print(f'fit time: {fit_time}')
print(f'test time: {test_time}')

RMSE: 0.8977
MAE:  0.6890
fit time: 0.23482656478881836
test time: 0.03498387336730957


Fine-tuning hyperparameters:

In [150]:
# Show the first five training rows again before the hyperparameter search.
traindf.head(n=5)

Unnamed: 0,userId,movieId,title,rating
0,1,1371,Rocky III,2.5
1,1,2105,American Pie,4.0
2,1,2193,My Tutor,2.0
3,1,2294,Jay and Silent Bob Strike Back,2.0
4,2,62,2001: A Space Odyssey,3.0


In [151]:
# Candidate SVD hyperparameters; every combination is cross-validated.
param_grid = {
    'n_factors': [25, 50, 100],
    'n_epochs': [5, 10, 20],
    'lr_all': [0.01, 0.02],
    'reg_all': [0.01, 0.02],
}

# Seed NumPy's global RNG so the randomly drawn CV folds and SVD
# initialisations -- and therefore the scores printed below -- are repeatable.
np.random.seed(42)

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainDataset)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# best MAE score
print(gs.best_score['mae'])

# combination of parameters that gave the best MAE score
print(gs.best_params['mae'])

0.8976765913599326
{'n_factors': 25, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.02}
0.6935297660283708
{'n_factors': 25, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.02}


In [152]:
# Re-run the fit/evaluate helper with the estimator the grid search picked for RMSE.
svd_algo = gs.best_estimator["rmse"]
(
    test_rmse,
    test_mae,
    test_predictions,
    fit_time,
    test_time,
) = recommendation(svd_algo, trainset, testset)
print(f'fit time: {fit_time}')
print(f'test time: {test_time}')

RMSE: 0.8882
MAE:  0.6799
fit time: 0.0638267993927002
test time: 0.02873849868774414


In [153]:
# Persist the tuned algorithm together with its test-set predictions.
file_name = 'svd_model'
dump.dump(
    file_name,
    predictions=test_predictions,
    algo=svd_algo,
)