In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from surprise import SVD
from surprise import BaselineOnly
from surprise import SlopeOne
from surprise import NormalPredictor
from surprise import Reader
from surprise import Dataset

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy

## Loading the data set

In [2]:
# Load the ratings DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(filepath_or_buffer='df_rats.pkl')

In [3]:
# Preview the loaded ratings frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,user_id,book_id,rating,best_book_id
0,8842281e1d1347389f2ab93d60773d4d,22034,5,22034
1,bafc2d50014200cda7cb2b6acd60cd73,22034,5,22034
2,0ef32090550901ead25cb0ea21c4d36b,22034,4,22034
3,8489357f2f485c1a961d4cbdc54ea84b,22034,3,22034
4,623103c8b74b4e97b2077ff2fd33514b,22034,4,22034
...,...,...,...,...
10505812,f099c0c488290d550eff5200c64a3094,17853378,5,17853378
10505813,249f280eaa2398cab3e64faa2fd5f236,5709901,3,5709901
10505814,be6f3f764f0a4b665673671f177949d1,16075973,4,16075973
10505815,b82f3bff7672ffb52a0da25a9194c79d,6687280,4,6687280


In [4]:
# Drop `best_book_id`; only user_id, book_id and rating are passed to Surprise.
# Re-assigning (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=['best_book_id'], errors='ignore')

In [5]:
# Wrap the ratings in a Surprise Dataset; the reader declares a 0-5 rating scale.
rating_columns = ['user_id', 'book_id', 'rating']
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

In [7]:
# Hold out 20% of the ratings for evaluation; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=47,
)

In [8]:
# Fit the NormalPredictor baseline on the training split and predict the test split.
normal_predictor = NormalPredictor()
predict_NP = normal_predictor.fit(train).test(test)

In [11]:
# Report (and display) the RMSE of the NormalPredictor predictions.
accuracy.rmse(predict_NP, verbose=True)

RMSE: 1.7230


1.7230345283942123

In [12]:
# BaselineOnly with default options (the log shows biases estimated with ALS).
baseline_als = BaselineOnly()
predict_bo_als = baseline_als.fit(train).test(test)

Estimating biases using als...


In [13]:
# Report (and display) the RMSE of the default (ALS) BaselineOnly predictions.
accuracy.rmse(predict_bo_als, verbose=True)

RMSE: 1.1219


1.121923214371969

In [15]:
# BaselineOnly again, this time estimating the biases with SGD instead of ALS.
sgd_defaults = {'method': 'sgd'}
predict_bo_sgd = BaselineOnly(bsl_options=sgd_defaults).fit(train).test(test)

Estimating biases using sgd...


In [16]:
# Report (and display) the RMSE of the SGD BaselineOnly predictions.
accuracy.rmse(predict_bo_sgd, verbose=True)

RMSE: 1.1200


1.1199682065610777

In [17]:
# Fit SVD with its default hyper-parameters and predict the test split.
svd_model = SVD()
predict_svd = svd_model.fit(train).test(test)

In [18]:
# Report (and display) the RMSE of the SVD predictions.
accuracy.rmse(predict_svd, verbose=True)

RMSE: 1.1199


1.1198826510943596

In [19]:
# Row labels for the comparison table, in the order the models were evaluated.
algorithms = [
    'NormalPredictor',
    'BaselineOnly_als',
    'BaselineOnly_sgd',
    'SVD',
]

In [22]:
# RMSE of each model's test predictions, in the same order as `algorithms`.
RMSE = [
    accuracy.rmse(model_predictions)
    for model_predictions in (predict_NP, predict_bo_als, predict_bo_sgd, predict_svd)
]

RMSE: 1.7230
RMSE: 1.1219
RMSE: 1.1200
RMSE: 1.1199


In [23]:
# MSE of each model's test predictions, in the same order as `algorithms`.
MSE = [
    accuracy.mse(model_predictions)
    for model_predictions in (predict_NP, predict_bo_als, predict_bo_sgd, predict_svd)
]

MSE: 2.9688
MSE: 1.2587
MSE: 1.2543
MSE: 1.2541


In [24]:
# MAE of each model's test predictions, in the same order as `algorithms`.
MAE = [
    accuracy.mae(model_predictions)
    for model_predictions in (predict_NP, predict_bo_als, predict_bo_sgd, predict_svd)
]

MAE:  1.3388
MAE:  0.8153
MAE:  0.8190
MAE:  0.8019


In [34]:
# Start an empty comparison table with one row per algorithm.
metrics = pd.DataFrame(data=None, index=algorithms)

In [37]:
# Add the three metric lists as positional columns 0, 1 and 2.
for column_position, scores in enumerate((RMSE, MSE, MAE)):
    metrics[column_position] = scores

In [39]:
# Replace the positional column labels with the metric names.
metrics.columns = pd.Index(['RMSE', 'MSE', 'MAE'])

In [40]:
# Display the comparison table (one row per algorithm; RMSE, MSE and MAE columns).
metrics

Unnamed: 0,RMSE,MSE,MAE
NormalPredictor,1.723035,2.968848,1.338751
BaselineOnly_als,1.121923,1.258712,0.815304
BaselineOnly_sgd,1.119968,1.254329,0.818951
SVD,1.119883,1.254137,0.801898


In [41]:
# Persist the comparison table next to the notebook.
metrics.to_csv(path_or_buf='metricsfile.csv')

BaselineOnly and SVD are very close performers, but BaselineOnly is much less computationally expensive, so we will move forward with that one.

In [11]:
# Rebuild the Surprise Dataset for the tuning section (same columns and 0-5 scale).
reader = Reader(rating_scale=(0, 5))
ratings_frame = df[['user_id', 'book_id', 'rating']]
data = Dataset.load_from_df(ratings_frame, reader)

In [12]:
# Recreate the 80/20 split for tuning, using the same seed (47).
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=47,
)

In [None]:
# Tested: n_epochs of 12 and 18, and reg of 0.02 and 0.01. Both had an RMSE of 1.12, higher than with the standard params.

In [75]:
# Baseline options for the current tuning run: SGD with the regularisation,
# learning rate and epoch count being tried.
bsl_options = dict(
    method='sgd',
    reg=0.5,
    learning_rate=.005,
    n_epochs=25,
)

In [76]:
# Fit BaselineOnly on the training split with the options under test.
baseline_sgd = BaselineOnly(bsl_options=bsl_options)
model_sgd = baseline_sgd.fit(train)

Estimating biases using sgd...


In [77]:
# Predict the held-out ratings with the tuned baseline model.
preds = model_sgd.test(testset=test)

In [78]:
# Store the RMSE for this run (reg=0.5, learning_rate=0.005, n_epochs=25).
parms_500525 = accuracy.rmse(preds, verbose=True)

RMSE: 1.1331


In [79]:
# Display the RMSE stored for the reg=0.5, learning_rate=0.005, n_epochs=25 run.
parms_500525

1.1330510475375668

In [38]:
# RMSE of BaselineOnly with its default ("standard") parameters, kept as the
# reference for the tuning runs. Score the default-parameter predictions
# explicitly: `preds` is overwritten by every tuning run, so on a top-to-bottom
# run it would hold the reg=0.5 results (RMSE 1.1331) rather than the 1.1219
# recorded for this cell.
# NOTE(review): assumes the recorded 1.1219 came from the default ALS baseline
# (`predict_bo_als`, which scored 1.1219 earlier in the notebook) — confirm.
Standard_params = accuracy.rmse(predict_bo_als)

RMSE: 1.1219


In [44]:
# NOTE(review): `parms_010512` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# The name presumably encodes the bsl_options that were tried — TODO confirm.
parms_010512

1.1578101439212658

In [50]:
# NOTE(review): `parms_0100512` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# The name presumably encodes the bsl_options that were tried — TODO confirm.
parms_0100512

1.1888492258562413

In [56]:
# NOTE(review): `parms_0100520` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# The name presumably encodes the bsl_options that were tried — TODO confirm.
parms_0100520

1.1200360449938294

In [62]:
# NOTE(review): `parms_0100525` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# The name presumably encodes the bsl_options that were tried — TODO confirm.
parms_0100525

1.1184975433756525

In [68]:
# NOTE(review): `parms_0200525` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# The name presumably encodes the bsl_options that were tried — TODO confirm.
parms_0200525

1.118403698114278

In [74]:
# NOTE(review): `parms_0300525` is never assigned in the visible notebook; this cell
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# Presumably the reg=0.03, learning_rate=0.005, n_epochs=25 run — TODO confirm.
parms_0300525

1.1183417981586508

In [80]:
# Re-display the RMSE of the reg=0.5, learning_rate=0.005, n_epochs=25 run for comparison.
parms_500525

1.1330510475375668

In [81]:
# Re-display the lowest RMSE among the values shown above (1.1183).
# NOTE(review): `parms_0300525` is never assigned in the visible notebook; this cell
# depends on leftover kernel state.
parms_0300525

1.1183417981586508

In [82]:
# Baseline options selected for the final model: SGD with reg=0.03,
# learning_rate=0.005 and 25 epochs.
bsl_best = dict(
    method='sgd',
    reg=0.03,
    learning_rate=.005,
    n_epochs=25,
)

In [None]:
# Three additional 80/20 splits of the same data, each with a different seed.
(train1, test1), (train2, test2), (train3, test3) = (
    train_test_split(data, test_size=0.2, random_state=split_seed)
    for split_seed in (40, 13, 25)
)

In [None]:
# Build the final BaselineOnly model from the selected options.
# The keyword must be `bsl_options`: `bsl_best` is not a BaselineOnly parameter
# (the original call raised TypeError), and the value passed must be `bsl_best`,
# not the last tuning run's `bsl_options` (reg=0.5).
algo_best = BaselineOnly(bsl_options=bsl_best)

In [None]:
# Train the selected baseline model on the first alternative split.
model = algo_best.fit(trainset=train1)

In [None]:
# Predict the held-out ratings of the first alternative split.
predictions = model.test(testset=test1)

In [None]:
# Score the final predictions with RMSE, MSE, MAE and FCP (evaluated in that order).
rmse, mse, mae, fcp = (
    score(predictions)
    for score in (accuracy.rmse, accuracy.mse, accuracy.mae, accuracy.fcp)
)