In [1]:
import os
import itertools
import pickle

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader

In [2]:
# Load the ratings from a comma-separated file whose rows are
# "user,item,rating" on a 1-10 scale. The first line is skipped
# (presumably a header row - confirm against the CSV).
file_path = os.path.join("crawl-ratings", "train.csv")
reader = Reader(
    line_format="user item rating", sep=",", rating_scale=(1, 10), skip_lines=1
)
data = Dataset.load_from_file(file_path, reader=reader)

# Hold out 25% of the ratings as the validation set.
# The split is seeded so the same ratings are held out on every fresh run;
# without a seed, validation RMSEs from different runs are not comparable.
train_set, validation_set = train_test_split(data, test_size=0.25, random_state=42)

In [3]:
%%timeit -n1 -r1

# We'll use the famous SVD algorithm.
algo = SVD(verbose=True, n_epochs=5)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(train_set)
predictions = algo.test(validation_set)

# Then compute RMSE
accuracy.rmse(predictions)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
RMSE: 1.5539
12.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [4]:
# Exhaustive grid search over two SVD hyper-parameters, scored by RMSE on
# the validation split. Keeps the best fitted model in `model_best`.
n_factors_list = [10, 50, 100, 150, 200]  # The number of factors, default value is 100
reg_all_list = [
    0.01,
    0.02,
    0.05,
    0.1,
]  # The regularization term for all params, default value is 0.02

# Best configuration seen so far. rmse_best starts at +inf so the first
# combination evaluated always becomes the initial best.
n_factors_best = reg_all_best = model_best = None
rmse_best = float("inf")
rmse_list = []  # validation RMSE of every combination, in grid order


for n_factors, reg_all in itertools.product(n_factors_list, reg_all_list):
    # A fresh model per combination, so `model_best` never aliases a model
    # that is refitted later.
    algo = SVD(n_factors=n_factors, reg_all=reg_all)
    algo.fit(train_set)
    predictions = algo.test(validation_set)

    rmse = accuracy.rmse(predictions)
    # Python lists have no .push(); the original call raised AttributeError
    # on the first iteration.
    rmse_list.append(rmse)
    print(f"Rmse for {n_factors} factors with {reg_all} regesion is {rmse}")
    if rmse < rmse_best:
        n_factors_best, reg_all_best, model_best, rmse_best = n_factors, reg_all, algo, rmse

RMSE: 1.5640
Rmse for 10 factors with 0.01 regesion is 1.5639506166744508
RMSE: 1.5478
Rmse for 10 factors with 0.02 regesion is 1.5478471384606212
RMSE: 1.5339
Rmse for 10 factors with 0.05 regesion is 1.5339364091657888
RMSE: 1.5320
Rmse for 10 factors with 0.1 regesion is 1.532036507342005
RMSE: 1.6135
Rmse for 50 factors with 0.01 regesion is 1.6134984860809087
RMSE: 1.5748
Rmse for 50 factors with 0.02 regesion is 1.5747994590649443
RMSE: 1.5275
Rmse for 50 factors with 0.05 regesion is 1.527503421721313
RMSE: 1.5225
Rmse for 50 factors with 0.1 regesion is 1.5225197964833335
RMSE: 1.6170
Rmse for 100 factors with 0.01 regesion is 1.6169584533493377
RMSE: 1.5756
Rmse for 100 factors with 0.02 regesion is 1.575568782232622
RMSE: 1.5233
Rmse for 100 factors with 0.05 regesion is 1.5233186643222136
RMSE: 1.5205
Rmse for 100 factors with 0.1 regesion is 1.5205212004239448
RMSE: 1.6106
Rmse for 150 factors with 0.01 regesion is 1.6105891839006694
RMSE: 1.5682
Rmse for 150 factors with 

In [5]:
# Persist the best model found by the grid search so it can be reloaded
# without retraining.
file_name = "matrix_decomposition_model.pkl"
with open(file_name, mode="wb") as model_file:
    pickle.dump(model_best, model_file)

In [6]:
# Read the pickled model back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook wrote itself.
loaded_model = None

with open(file_name, mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

In [7]:
# Re-score the reloaded model on the validation split to check that it
# survived the pickle round trip.
loaded_predictions = loaded_model.test(validation_set)
accuracy.rmse(loaded_predictions)

RMSE: 1.5178


1.5178187544066526

In [8]:
# Show which object the grid search kept as the best model. This prints the
# object's default repr (class path and memory address), not its hyper-parameters.
print(model_best)

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f778a8db050>
