In [2]:
import random
import joblib
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
import pandas as pd
import numpy as np

In [3]:
"""Some algorithms randomly initialize their parameters (sometimes with numpy), and the cross-validation folds are also randomly generated. 
If you need to reproduce your experiments multiple times, you just have to set the seed of the RNG at the beginning of your program:"""

# One seed value shared by every random number generator this notebook relies on.
my_seed = 42

# Seed Python's built-in generator first, then numpy's global generator.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(my_seed)

### Loading and preparing data

In [22]:
# Load the preprocessed MovieLens frame.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only point this at artifacts produced by this project's own preprocessing.
df = joblib.load('../data/processed/preprocessed_data_movielens.pkl')

# Keep exactly the three columns needed downstream, in the order
# (userId, movieId, rating). Selecting them already discards every other column,
# so no separate in-place drop of title/genres/relevance/tag is required.
df = df[['userId', 'movieId', 'rating']]

# The index carried over from preprocessing is meaningless here; replace it with a
# fresh 0..n-1 range. drop=True discards the old index instead of materialising it
# as a column, so this also works when the old index is named or multi-level.
df = df.reset_index(drop=True)

In [23]:
# Wrap the ratings frame in a Surprise Dataset. The frame's columns were arranged
# beforehand in the required order (raw user id, raw item id, rating); the Reader
# declares the rating scale those ratings live on.
lowest_rating, highest_rating = 0.5, 5.0
reader = Reader(rating_scale=(lowest_rating, highest_rating))
data = Dataset.load_from_df(df=df, reader=reader)

### Parameter tuning

In [None]:
from surprise import KNNBaseline
from surprise.model_selection import KFold

# Similarity settings explored by the grid search.
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],
    "min_support": [3, 4, 5],
    # Only the item-based approach: it is generally better suited for the task, and
    # the user-based variant would require enormous amounts of memory.
    "user_based": [False],
}
param_grid = {
    "sim_options": sim_options,
    "k": [20, 30, 40],   # The (max) number of neighbors to take into account for aggregation
    "min_k": [1, 2, 3],  # The minimum number of neighbors to take into account for aggregation
}

# Explicitly seeded 3-fold splitter. Passing a bare `cv=3` makes the folds depend on
# the current state of numpy's global RNG, i.e. on which cells were executed before
# this one and how often. Seeding the splitter keeps the folds identical across
# re-runs and identical to any other search that uses the same seed on `data`.
cv_folds = KFold(n_splits=3, random_state=my_seed, shuffle=True)

gs = GridSearchCV(
    KNNBaseline,
    param_grid,
    measures=["rmse", "mse", "mae"],
    cv=cv_folds,
    n_jobs=-1,  # use all available cores
)
gs.fit(data)
print(gs.best_score)
print(gs.best_params)

# Save the fitted GridSearchCV object to the models folder.
joblib.dump(gs, '../models/surp_gridsearchcv_knnBaseline.pkl')

In [None]:
from surprise import KNNWithZScore
from surprise.model_selection import KFold

# Similarity settings explored by the grid search.
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],
    "min_support": [3, 4, 5],
    # Only the item-based approach: it is generally better suited for the task, and
    # the user-based variant would require enormous amounts of memory.
    "user_based": [False],
}
param_grid = {
    "sim_options": sim_options,
    "k": [20, 30, 40],   # The (max) number of neighbors to take into account for aggregation
    "min_k": [1, 2, 3],  # The minimum number of neighbors to take into account for aggregation
}

# Explicitly seeded 3-fold splitter. With a bare `cv=3` the folds are drawn from
# numpy's global RNG, so they change with the notebook's execution history and
# differ from the folds any earlier search was scored on. A seeded splitter makes
# the folds a function of the seed and `data` only.
cv_folds = KFold(n_splits=3, random_state=my_seed, shuffle=True)

gs = GridSearchCV(
    KNNWithZScore,
    param_grid,
    measures=["rmse", "mse", "mae"],
    cv=cv_folds,
    n_jobs=-1,  # use all available cores
)
gs.fit(data)
print(gs.best_score)
print(gs.best_params)

# Save the fitted GridSearchCV object to the models folder.
joblib.dump(gs, '../models/surp_gridsearchcv_knnZScore.pkl')