In [46]:
import os
import itertools
import pickle

import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader

In [2]:
# Load the data set from file
file_path = os.path.join("crawl-ratings", "train.csv")
reader = Reader(line_format="user item rating",
                sep=",",
                rating_scale=(1, 10),
                skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)

# sample random trainset and validation set
# val set is made of 25% of the ratings.
train_set, validation_set = train_test_split(data, test_size=0.25, random_state=0)

In [3]:
%%timeit -n1 -r1

# We'll use the famous SVD algorithm.
algo = SVD(verbose=True, n_epochs=5)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(train_set)
predictions = algo.test(validation_set)

# Then compute RMSE
accuracy.rmse(predictions)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
RMSE: 1.5569
13.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [4]:
n_factors_list = [10, 50, 100, 150,
                  200]  # The number of factors, default value is 100
reg_all_list = [
    0.01,
    0.02,
    0.05,
    0.1,
]  # The regularization term for all params, default value is 0.02

n_factors_best = reg_all_best = model_best = None
rmse_best = float("inf")
rmse_list = []

for n_factors, reg_all in itertools.product(n_factors_list, reg_all_list):
    algo = SVD(n_factors=n_factors, reg_all=reg_all)
    algo.fit(train_set)
    predictions = algo.test(validation_set)

    rmse = accuracy.rmse(predictions)
    rmse_list.append(rmse)
    print(f"Rmse for {n_factors} factors with {reg_all} regesion is {rmse}")
    if rmse < rmse_best:
        n_factors_best, reg_all_best, model_best, rmse_best = n_factors, reg_all, algo, rmse

RMSE: 1.5662
Rmse for 10 factors with 0.01 regesion is 1.5662058102415837
RMSE: 1.5467
Rmse for 10 factors with 0.02 regesion is 1.5466890582769512
RMSE: 1.5294
Rmse for 10 factors with 0.05 regesion is 1.5294474670003726
RMSE: 1.5326
Rmse for 10 factors with 0.1 regesion is 1.5326316573629475
RMSE: 1.6145
Rmse for 50 factors with 0.01 regesion is 1.614496881519193
RMSE: 1.5767
Rmse for 50 factors with 0.02 regesion is 1.5767087967250086
RMSE: 1.5265
Rmse for 50 factors with 0.05 regesion is 1.526468134266003
RMSE: 1.5249
Rmse for 50 factors with 0.1 regesion is 1.5248749218540365
RMSE: 1.6172
Rmse for 100 factors with 0.01 regesion is 1.617223977555844
RMSE: 1.5768
Rmse for 100 factors with 0.02 regesion is 1.5768020006742227
RMSE: 1.5239
Rmse for 100 factors with 0.05 regesion is 1.5239266282412918
RMSE: 1.5205
Rmse for 100 factors with 0.1 regesion is 1.5205153418325743
RMSE: 1.6128
Rmse for 150 factors with 0.01 regesion is 1.612800238584212
RMSE: 1.5703
Rmse for 150 factors with 0

In [5]:
file_name = "matrix_decomposition_model.pkl"
with open(file_name, "wb") as f:
    pickle.dump(model_best, f)

In [6]:
loaded_model = None

with open(file_name, "rb") as f:
    loaded_model = pickle.load(f)

In [7]:
accuracy.rmse(loaded_model.test(validation_set))

RMSE: 1.5199


1.5198517442645045

In [8]:
print(model_best)

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fa254be5f10>


In [40]:
user_rating = Dataset.load_from_file(os.path.join("new_user", "rating_all.csv"), reader)
all_train_set = user_rating.build_full_trainset()

In [41]:
algo.fit(all_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa254be5f10>

In [44]:
algo.predict("ur26266323","tt0310281")

Prediction(uid='ur26266323', iid='tt0310281', r_ui=None, est=7.126198497368478, details={'was_impossible': False})

In [49]:
all_movies_df = pd.read_csv(os.path.join("crawl-movies-info", "movie-idx.csv"))

In [50]:
pred = [algo.predict("u0", movieId) for movieId in all_movies_df["MovieID"]]

In [65]:
pred.sort(key=lambda x:x.est, reverse=True)

In [66]:
pred[:100]

[Prediction(uid='u0', iid='tt2103188', r_ui=None, est=8.794842040254897, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt5275892', r_ui=None, est=8.558307170897388, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt0397042', r_ui=None, est=8.545421519855788, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt0071075', r_ui=None, est=8.51977031660955, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt1877514', r_ui=None, est=8.518343533446552, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt1568322', r_ui=None, est=8.510734349419534, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt1479962', r_ui=None, est=8.50275728962319, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt5491994', r_ui=None, est=8.493235941207915, details={'was_impossible': False}),
 Prediction(uid='u0', iid='tt0081846', r_ui=None, est=8.485882723513075, details={'was_impossible': False}),
 Prediction(uid='u0',