In [1]:
from surprise import SVD, Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from surprise import accuracy
from surprise import dump
import numpy as np
import pandas as pd
import csv

In [2]:
my_seed = 12345
np.random.seed(my_seed)

In [3]:
origin_data = pd.read_csv('../Data/final_rating_not_spare.csv', usecols=['customer_id', 'product_id', 'stars'])
# origin_data.to_csv('../Data/final_rating_not_spare_format.csv', index=False)

In [4]:
origin_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20849 entries, 0 to 20848
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   product_id   20849 non-null  int64
 1   customer_id  20849 non-null  int64
 2   stars        20849 non-null  int64
dtypes: int64(3)
memory usage: 488.8 KB


In [5]:
file_path = '../Data/final_rating_not_spare_format.csv'
reader = Reader(line_format='user item rating', sep=',', rating_scale = (1,5), skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)

In [6]:
trainset, testset = train_test_split(data, test_size=0.2)
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  4899 

Number of items:  1382 



In [7]:
trainset_iids = list(trainset.all_items())
iid_converter = lambda x: trainset.to_raw_iid(x)
trainset_raw_iids = list(map(iid_converter, trainset_iids))

In [8]:
trainsetfull = data.build_full_trainset()
print('Number of users: ', trainsetfull.n_users, '\n')
print('Number of items: ', trainsetfull.n_items, '\n')

Number of users:  5289 

Number of items:  1382 



In [9]:
# name_converter = NameConverter('games_master_list.csv')


In [10]:
param_grid_2 = {'n_factors':[5, 10,20],'n_epochs': [5, 10, 20], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

gs_model_2 = GridSearchCV(SVD,param_grid=param_grid_2, n_jobs = -1, joblib_verbose=5)
gs_model_2.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  2.2min finished


In [11]:
gs_model_2.best_params

{'rmse': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4},
 'mae': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}

In [12]:
svd2 = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.6)
svd2.fit(trainset)
predictions = svd2.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))
print(accuracy.fcp(predictions))

RMSE: 0.5694
0.5693537623338409
MAE:  0.3403
0.3403169960157169
FCP:  0.6259
0.6259430244341853


In [20]:
param_grid_3 = {'n_factors':[20],'n_epochs': [20], 'lr_all': [0.004, 0.005, 0.006],
              'reg_all': [0.5, 0.6, 0.7]}

gs_model_3 = GridSearchCV(SVD,param_grid=param_grid_3,n_jobs = -1,joblib_verbose=5)
gs_model_3.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   33.0s finished


In [21]:
gs_model_3.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.5},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.5}}

In [22]:
param_grid_4 = {'n_factors':[20, 30],'n_epochs': [20], 'lr_all': [0.006, 0.01, 0.015],
              'reg_all': [0.7, 1, 1.5]}

gs_model_4 = GridSearchCV(SVD,param_grid=param_grid_4,n_jobs = -1,joblib_verbose=5)
gs_model_4.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.1min finished


In [23]:
gs_model_4.best_params

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.7},
 'mae': {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.015, 'reg_all': 0.7}}

In [24]:
gs_model_4.best_score

{'rmse': 0.5765428841633911, 'mae': 0.3339240321196649}

In [25]:
param_grid_5 = {'n_factors':[30, 40, 50],'n_epochs': [20], 'lr_all': [0.006],
              'reg_all': [0.7]}

gs_model_5 = GridSearchCV(SVD,param_grid=param_grid_5,n_jobs = -1,joblib_verbose=5)
gs_model_5.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   10.7s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   12.8s finished


In [26]:
gs_model_5.best_params


{'rmse': {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7},
 'mae': {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.7}}

In [27]:
gs_model_5.best_score


{'rmse': 0.5780421317058986, 'mae': 0.342319049898542}

In [28]:
chosen_SVD = SVD(n_factors= 50, n_epochs=20, lr_all=0.006, reg_all=0.7)
chosen_SVD.fit(trainset)
predictions = chosen_SVD.test(testset)
accuracy.rmse(predictions)

RMSE: 0.5711


0.5710864674663195

# SVD++

In [29]:
chosen_SVDpp = SVDpp()
chosen_SVDpp.fit(trainset)
predictions = chosen_SVDpp.test(testset)
accuracy.rmse(predictions)

RMSE: 0.5531


0.5531276180981511

In [30]:
param_grid_6 = {'n_factors':[20, 30],'n_epochs': [20], 'lr_all': [0.006, 0.01],
              'reg_all': [0.7, 1]}

gs_model_6 = GridSearchCV(SVDpp,param_grid=param_grid_6,n_jobs = -1,joblib_verbose=5)
gs_model_6.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished


In [32]:
from surprise import SlopeOne, CoClustering

In [33]:
algo = SlopeOne()
results = cross_validate(algo, data, measures = ['RMSE'], return_train_measures=True)
results

{'test_rmse': array([0.73615277, 0.73930098, 0.68511364, 0.69052932, 0.68721024]),
 'train_rmse': array([0.11356827, 0.11360312, 0.11480163, 0.11959884, 0.11434316]),
 'fit_time': (0.14502644538879395,
  0.1779944896697998,
  0.1709897518157959,
  0.19899797439575195,
  0.17699718475341797),
 'test_time': (0.06597113609313965,
  0.47200536727905273,
  0.07200241088867188,
  0.09300088882446289,
  0.08100724220275879)}

In [34]:
results


{'test_rmse': array([0.73615277, 0.73930098, 0.68511364, 0.69052932, 0.68721024]),
 'train_rmse': array([0.11356827, 0.11360312, 0.11480163, 0.11959884, 0.11434316]),
 'fit_time': (0.14502644538879395,
  0.1779944896697998,
  0.1709897518157959,
  0.19899797439575195,
  0.17699718475341797),
 'test_time': (0.06597113609313965,
  0.47200536727905273,
  0.07200241088867188,
  0.09300088882446289,
  0.08100724220275879)}

In [35]:
slopeone = SlopeOne()
slopeone.fit(trainset)
predictions = slopeone.test(testset)
accuracy.rmse(predictions)

RMSE: 0.7094


0.7094171780294541

In [36]:
algo = CoClustering(n_cltr_u = 3, n_cltr_i = 3)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6669


0.6669497462688944

In [37]:
algo = CoClustering(n_cltr_u = 5, n_cltr_i = 5)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6944


0.6944212650927531