In [1]:
import pandas as pd
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Read the processed clothing file and keep only the three columns that are
# handed to the recommender further down.
location = "../dataset/clothing_data_processed.csv"
clothing_data = pd.read_csv(location).loc[:, ['user_id', 'item_id', 'rating']]
clothing_data

Unnamed: 0,user_id,item_id,rating
0,420272,2260466,10.0
1,273551,153475,10.0
2,360448,1063761,10.0
3,909926,126335,8.0
4,151944,616682,10.0
...,...,...,...
192539,66386,2252812,10.0
192540,118398,682043,10.0
192541,47002,683251,6.0
192542,961120,126335,10.0


In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

# Declare the rating scale as 0 to 10, then wrap the
# (user_id, item_id, rating) frame as a surprise Dataset.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df=clothing_data, reader=reader)

In [3]:
from surprise.model_selection import GridSearchCV

from surprise.model_selection import KFold

# Candidate SVD hyperparameters; every combination in the grid is evaluated.
param_grid = {
    'n_factors': [10, 11],
    'reg_all': [0.07, 0.1],
    'init_std_dev': [0.01, 0.008],
    'lr_all': [0.015, 0.013, 0.011],
    'random_state': [10],
}

# An integer `cv` builds a KFold that shuffles with an unseeded RNG, so the
# folds (and with them the scores, and possibly the winning parameters) differ
# from run to run even though the SVD seed is fixed. Seed the splitter so the
# whole search is reproducible.
cv_folds = KFold(n_splits=3, random_state=10, shuffle=True)

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=cv_folds,
    joblib_verbose=10,
    n_jobs=-1,
)
gs.fit(data)

# Best mean RMSE across folds and the parameter combination that achieved it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  1.4min remaining:    9.1s


1.3943619304107109
{'n_factors': 11, 'reg_all': 0.1, 'init_std_dev': 0.01, 'lr_all': 0.011, 'random_state': 10}


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.5min finished


In [4]:
# Load train.csv, restricted to the (user_id, item_id, rating) columns.
location = "../dataset/train.csv"
train_data = pd.read_csv(location).loc[:, ['user_id', 'item_id', 'rating']]
train_data

Unnamed: 0,user_id,item_id,rating
0,379417,658706,10.0
1,904971,182915,10.0
2,163215,1979533,10.0
3,232640,880935,8.0
4,691117,180014,10.0
...,...,...,...
154030,733207,2358935,8.0
154031,873762,625911,10.0
154032,14227,1515649,8.0
154033,421813,127081,8.0


In [10]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

# Same 0 to 10 rating scale as before; wrap the training frame as a surprise Dataset.
reader = Reader(rating_scale=(0, 10))
train_svd = Dataset.load_from_df(df=train_data, reader=reader)

In [11]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# Hyperparameters are the best combination printed by the grid search above.
algo = SVD(n_factors=11, reg_all=0.1, init_std_dev=0.01, lr_all=0.011, random_state=10)

# Bind the Trainset to its own name instead of overwriting `train_svd`:
# rebinding the Dataset variable made this cell fail when run a second time
# (a Trainset has no build_full_trainset()) and left one name holding two
# different kinds of object.
full_trainset = train_svd.build_full_trainset()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17e362ca8e0>

In [12]:
# Load test.csv, restricted to the (user_id, item_id, rating) columns.
location = "../dataset/test.csv"
test_data = pd.read_csv(location).loc[:, ['user_id', 'item_id', 'rating']]
test_data

Unnamed: 0,user_id,item_id,rating
0,492205,208647,8.0
1,596658,145906,8.0
2,88342,1740815,10.0
3,501057,2803807,10.0
4,662466,2521411,8.0
...,...,...,...
38504,884008,172027,10.0
38505,437160,130259,8.0
38506,425211,416213,8.0
38507,970560,1146825,10.0


In [13]:
from sklearn.metrics import mean_squared_error
# Predict a rating for every (user_id, item_id) pair in the test frame.
# The id columns are iterated directly rather than through
# DataFrame.iterrows(): iterrows() builds one Series per row, which does not
# preserve per-column dtypes (the float `rating` column forces the ids to
# float as well) and is considerably slower.
predictions = [
    algo.predict(user_id, item_id).est
    for user_id, item_id in zip(test_data['user_id'], test_data['item_id'])
]

# NOTE: this is the mean *squared* error. Take its square root before
# comparing it with the RMSE reported by the grid search.
mean_squared_error(test_data['rating'], predictions)

1.9430161896604647