In [1]:
import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd
import random

In [2]:
from surprise.model_selection import train_test_split
from surprise import accuracy, Dataset, SVD
from surprise.model_selection import GridSearchCV
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

In [3]:
print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.9.13 (main, Aug 25 2022, 23:26:10) 
[GCC 11.2.0]
Surprise version: 1.1.3


In [4]:
# Select MovieLens data size: 100k, Other avaialble sizes are 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

In [5]:
data = Dataset.load_builtin("ml-100k")
data_orig = data

In [6]:
data_orig

<surprise.dataset.DatasetAutoFolds at 0x7fcc4085fbb0>

In [7]:
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

# train = 90% of the data, test = 10% of the data
threshold = int(0.9 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

data.raw_rating = train_raw_ratings

In [8]:
# train, test = python_random_split(data, 0.75)
# trainset, testset = train_test_split(data, test_size=0.25)

In [9]:
# trainset = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()

# print(trainset)

In [10]:
# svd = SVD()

param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

with Timer() as train_time:
    gs.fit(data)
#     svd.fit(train_set)

# best RMSE score
print('best RMSE score', gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print('best parameters ', gs.best_params["rmse"])

print("Took {} seconds for training.".format(train_time.interval))

# use the best model,  retrain on the whole set train
svd_best= gs.best_estimator["rmse"]
trainset = data.build_full_trainset()
svd_best.fit(trainset)

best RMSE score 0.964084292503233
best parameters  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Took 32.96648699099751 seconds for training.


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcc864fa370>

In [12]:
# now test on the trainset                                                 
testset = data.construct_testset(train_raw_ratings)                     
predictions_train = svd_best.test(testset)                                           
print('Accuracy on the trainset:')                                         
accuracy.rmse(predictions_train) 
# testset

Accuracy on the trainset:
RMSE: 0.9410


0.9409658895633729

In [15]:
# Compute unbiased accuracy on testset
testset = data.construct_testset(test_raw_ratings)  # testset is now the set B
predictions_test = svd_best.test(testset)
print("Unbiased accuracy on testset,", end=" ")
accuracy.rmse(predictions_test)
accuracy.mse(predictions_test)
# testset

Unbiased accuracy on testset, RMSE: 0.9423
MSE: 0.8880


0.8880168555611991