In [8]:
import pickle

import numpy as np
import pandas as pd
from surprise import Reader, Dataset
from surprise import BaselineOnly
from surprise import accuracy
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF
from surprise.model_selection import GridSearchCV

from source.utils import train_test_split_feature

In [2]:
# Load the review features and keep only the columns needed for
# collaborative filtering.
feature = pd.read_csv("data/feature.csv")
cols = ["user_id", 'business_id', "review_stars", "review_date"]
selected_feature = feature[cols]

# The three columns surprise consumes, in the order passed to load_from_df:
# user, item, rating.
rating_cols = ['user_id', 'business_id', 'review_stars']

# Split a copy so the helper cannot modify `selected_feature`.
# NOTE(review): `review_date` is only handed to the split helper; presumably
# the split is time-based -- confirm in source.utils.
train_df, test_df = train_test_split_feature(selected_feature.copy())

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
train_set = Dataset.load_from_df(train_df[rating_cols], reader)
test_set = Dataset.load_from_df(test_df[rating_cols], reader)
full_set = Dataset.load_from_df(selected_feature[rating_cols], reader)

# `trainset` is what `model.fit` receives in the kNN cell; `testset` is what
# `model.test` receives. The test Dataset is first turned into a full
# trainset because `build_testset` is called on that object.
trainset = train_set.build_full_trainset()
testset = test_set.build_full_trainset().build_testset()

In [None]:
# kNN
# Hold-out evaluation of the item-based (user_based=False) kNN variants:
# every (similarity measure, k) pair is fitted on `trainset` and scored on
# `testset`, once for KNNBasic and once for KNNWithMeans.
#
# `accuracy` is surprise's metrics module; it is imported with the other
# surprise names in the notebook's import cell (`from surprise import accuracy`).
#
# Results are written under `result/`, matching the SVD and NMF cells and the
# project-root-relative `data/feature.csv` path (this cell previously pointed
# at `../result/`).
knn_runs = [
    (KNNBasic, "result/knnbasic_hyperparameter_tuning.pkl"),
    (KNNWithMeans, "result/knnwithmeans_hyperparameter_tuning.pkl"),
]
for knn_class, result_pickle in knn_runs:
    # One row per configuration: [similarity, k, user_based, RMSE, MAE].
    result = []
    for method in ['cosine', 'msd', 'pearson', 'pearson_baseline']:
        for k_selection in [20, 40]:
            for user_flag in [False]:
                model = knn_class(k=k_selection, sim_options={"name": method, "user_based": user_flag})
                model.fit(trainset)
                pred = model.test(testset)
                result.append([method, k_selection, user_flag, accuracy.rmse(pred), accuracy.mae(pred)])
    # Columns are left positional (0..4) so the pickled frame keeps the same
    # layout as before.
    cv_result = pd.DataFrame(result)
    with open(result_pickle, 'wb') as handle:
        pickle.dump(cv_result, handle)

In [4]:
# SVD
# 3-fold cross-validated grid search over the number of latent factors, the
# learning rate and the regularisation strength, scored on RMSE and MAE.
result_pickle = "result/svd_hyperparameter_tuning.pkl"
param_grid = {
    'n_factors': [50, 100, 200],
    'lr_all': [0.001, 0.005, 0.01],
    'reg_all': [0.01, 0.02, 0.1],
}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(train_set)

# Persist the full cross-validation table so it can be inspected later
# without re-running the search.
cv_result = pd.DataFrame(grid_search.cv_results)
with open(result_pickle, 'wb') as handle:
    pickle.dump(cv_result, handle)

# Best score and the parameter combination that achieved it, per measure.
best_score = grid_search.best_score
best_params = grid_search.best_params
print(f"The best RMSE is: {best_score['rmse']}")
print(f"The best hyperparameters for this RMSE is: {best_params['rmse']}")
print(f"The best MAE is: {best_score['mae']}")
print(f"The best hyperparameters for this MAE is: {best_params['mae']}")

KeyError: ('param_bsl_options', 'occurred at index 0')

The best RMSE is: 1.1316703167438933
The best hyperparameters for this RMSE is: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.1}
The best MAE is: 0.8933017631499832
The best hyperparameters for this MAE is: {'n_factors': 50, 'lr_all': 0.01, 'reg_all': 0.1}


In [None]:
# NMF
# 3-fold cross-validated grid search over the number of latent factors and
# the user / item regularisation terms, scored on RMSE and MAE.
result_pickle = "result/nmf_hyperparameter_tuning.pkl"
param_grid = {
    'n_factors': [10, 25, 50],
    'reg_pu': [0.01, 0.06, 0.1],
    'reg_qi': [0.01, 0.06, 0.1],
}
grid_search = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(train_set)

# Persist the full cross-validation table so it can be inspected later
# without re-running the search.
cv_result = pd.DataFrame(grid_search.cv_results)
with open(result_pickle, 'wb') as handle:
    pickle.dump(cv_result, handle)

# Best score and the parameter combination that achieved it, per measure.
best_score = grid_search.best_score
best_params = grid_search.best_params
print(f"The best RMSE is: {best_score['rmse']}")
print(f"The best hyperparameters for this RMSE is: {best_params['rmse']}")
print(f"The best MAE is: {best_score['mae']}")
print(f"The best hyperparameters for this MAE is: {best_params['mae']}")