In [1]:
import pandas as pd
import pickle

from surprise import KNNBasic, Dataset, Reader
from surprise.model_selection import GridSearchCV

In [2]:
df_reviews = pd.read_json('../../../data/interim/final/reviews.json.gz', orient="records", compression="gzip")

In [3]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_reviews[['user_id', 'product_id', 'ratings']], reader=reader)

In [4]:
param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [1, 2],
    },
    'k': [2, 4],
    'sim_options': {
        'name': ['cosine'],
        'min_support': [1, 5],
        'user_based': [False],
    },
    'verbose': [False]
}

gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
print(gs.best_estimator["rmse"])

1.1962221645213416
{'bsl_options': {'method': 'als', 'reg': 1}, 'k': 2, 'sim_options': {'name': 'cosine', 'min_support': 5, 'user_based': False}, 'verbose': False}
<surprise.prediction_algorithms.knns.KNNBasic object at 0x000002326A215F10>


In [5]:
trainset = data.build_full_trainset()

model = gs.best_estimator["rmse"]
model.fit(trainset)

<surprise.prediction_algorithms.knns.KNNBasic at 0x2326a215f10>

In [6]:
# The ratings are all the ratings that are in the trainset
# testset = trainset.build_testset()

# The ratings are all the ratings that are not in the trainset,
# anti_testset = trainset.build_anti_testset(0)

In [7]:
pickle.dump(model, open('../../../models/item_based_collaborative_filtering/knn.pkl', 'wb'))