In [None]:
import random

import joblib
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, KFold

import send_status_mail as ssm

In [None]:
# Reproducibility: some algorithms randomly initialize their parameters
# (sometimes with numpy), and the cross-validation folds are also randomly
# generated. Seeding both RNGs once, at the beginning of the program, lets the
# experiments be reproduced across runs.
my_seed = 42

np.random.seed(my_seed)
random.seed(my_seed)

### Loading and preparing data

In [None]:
# Load the preprocessed MovieLens frame and reduce it to the rating triples.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load this pickle from a trusted source.
df = (
    joblib.load('../data/processed/preprocessed_data_movielens.pkl')
    # Select exactly the three columns needed, in the order Surprise requires
    # (raw user id, raw item id, rating). Selecting instead of dropping a
    # hard-coded list of unwanted columns keeps the cell working even if the
    # pickle gains or loses auxiliary columns.
    .loc[:, ['userId', 'movieId', 'rating']]
    # The index stored in the pickle is not meaningful; replace it with a
    # fresh 0..n-1 range. drop=True discards the old index whatever its name.
    .reset_index(drop=True)
)

# Wrap the frame in a Surprise dataset; ratings lie on a 0.5 - 5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df, reader)

### Parameter tuning

#### KNN models

In [None]:
# from surprise import KNNBasic

# # define sim_options to be tested
# sim_options = {
# "name": ["msd", "cosine", "pearson", "pearson_baseline"],
# "min_support": [3, 4, 5],
# "user_based": [False], # only the item-based approach, since it is generally better suited for the task and a user-based one would require enormous amounts of memory
# }
# param_grid = {"sim_options": sim_options,
#               "k": [20, 30, 40], # The (max) number of neighbors to take into account for aggregation
#               "min_k": [1, 2, 3]} # The minimum number of neighbors to take into account for aggregation
# gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mse", "mae"], cv=3, n_jobs=-1)
# gs.fit(data)
# print(gs.best_score)
# print(gs.best_params)

# # save GridSearchCV object to file in models folder
# joblib.dump(gs, '../models/surp_gridsearchcv_knnBasic.pkl')

# # send completion message via email (server, sender, recipient according to .env)
# ssm.sendstatus("KNNBasic")

In [None]:
# from surprise import KNNWithMeans

# # define sim_options to be tested
# sim_options = {
# "name": ["msd", "cosine", "pearson", "pearson_baseline"],
# "min_support": [3, 4, 5],
# "user_based": [False], # only the item-based approach, since it is generally better suited for the task and a user-based one would require enormous amounts of memory
# }
# param_grid = {"sim_options": sim_options,
#               "k": [20, 30, 40], # The (max) number of neighbors to take into account for aggregation
#               "min_k": [1, 2, 3]} # The minimum number of neighbors to take into account for aggregation
# gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mse", "mae"], cv=3, n_jobs=-1)
# gs.fit(data)
# print(gs.best_score)
# print(gs.best_params)

# # save GridSearchCV object to file in models folder
# joblib.dump(gs, '../models/surp_gridsearchcv_knnMeans.pkl')

# # send completion message via email (server, sender, recipient according to .env)
# ssm.sendstatus("KNNWithMeans")

In [None]:
# from surprise import KNNBaseline

# # define sim_options to be tested
# sim_options = {
# "name": ["msd", "cosine", "pearson", "pearson_baseline"],
# "min_support": [3, 4, 5],
# "user_based": [False], # only the item-based approach, since it is generally better suited for the task and a user-based one would require enormous amounts of memory
# }
# param_grid = {"sim_options": sim_options,
#               "k": [20, 30, 40], # The (max) number of neighbors to take into account for aggregation
#               "min_k": [1, 2, 3]} # The minimum number of neighbors to take into account for aggregation
# gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mse", "mae"], cv=3, n_jobs=-1)
# gs.fit(data)
# print(gs.best_score)
# print(gs.best_params)

# # save GridSearchCV object to file in models folder
# joblib.dump(gs, '../models/surp_gridsearchcv_knnBaseline.pkl')

# # send completion message via email (server, sender, recipient according to .env)
# ssm.sendstatus("KNNBaseline")

In [None]:
# from surprise import KNNWithZScore

# # define sim_options to be tested
# sim_options = {
# "name": ["msd", "cosine", "pearson", "pearson_baseline"],
# "min_support": [3, 4, 5],
# "user_based": [False], # only the item-based approach, since it is generally better suited for the task and a user-based one would require enormous amounts of memory
# }
# param_grid = {"sim_options": sim_options,
#               "k": [20, 30, 40], # The (max) number of neighbors to take into account for aggregation
#               "min_k": [1, 2, 3]} # The minimum number of neighbors to take into account for aggregation
# gs = GridSearchCV(KNNWithZScore, param_grid, measures=["rmse", "mse", "mae"], cv=3, n_jobs=-1)
# gs.fit(data)
# print(gs.best_score)
# print(gs.best_params)

# # save GridSearchCV object to file in models folder
# joblib.dump(gs, '../models/surp_gridsearchcv_knnZScore.pkl')

# # send completion message via email (server, sender, recipient according to .env)
# ssm.sendstatus("KNNWithZScore")

#### Matrix factorization models

In [None]:
from surprise import SVD

# Hyper-parameter grid for SVD; every combination is cross-validated.
param_grid = {
    "n_factors": [50, 100, 150],      # The number of factors. Default is ``100``.
    "n_epochs": [10, 20, 30],         # The number of iterations of the SGD procedure. Default is ``20``.
    "biased": [True, False],          # Whether to use baselines (or biases). Default is ``True``.
    "lr_all": [0.002, 0.005, 0.01],   # The learning rate for all parameters. Default is ``0.005``.
    "reg_all": [0.02, 0.05, 0.1],     # The regularization term for all parameters. Default is ``0.02``.
    "random_state": [my_seed],        # Reuse the notebook-wide seed instead of repeating the literal.
}

# Seed the folds explicitly. With a bare ``cv=3`` the fold shuffle draws from
# numpy's global RNG, so the split would depend on which cells were executed
# earlier in the kernel; a seeded KFold yields the same three folds every time.
cv_folds = KFold(n_splits=3, random_state=my_seed, shuffle=True)

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mse", "mae"], cv=cv_folds, n_jobs=-1)
gs.fit(data)
print(gs.best_score)
print(gs.best_params)

# save GridSearchCV object to file in models folder
joblib.dump(gs, '../models/surp_gridsearchcv_SVD.pkl')

# send completion message via email (server, sender, recipient according to .env)
ssm.sendstatus("SVD")

In [None]:
from surprise import NMF

# Hyper-parameter grid for NMF; every combination is cross-validated.
param_grid = {
    "n_factors": [10, 15, 20],       # The number of factors. Default is ``15``.
    "n_epochs": [20, 50, 100],       # The number of iterations of the SGD procedure. Default is ``50``.
    "biased": [True, False],         # Whether to use baselines (or biases). Default is ``False``.
    "reg_pu": [0.06, 0.08, 0.1],     # The regularization term for users lambda_u. Default is ``0.06``.
    "reg_qi": [0.06, 0.08, 0.1],     # The regularization term for items lambda_i. Default is ``0.06``.
    "random_state": [my_seed],       # Reuse the notebook-wide seed instead of repeating the literal.
}

# Seed the folds explicitly. With a bare ``cv=3`` the fold shuffle draws from
# numpy's global RNG, so the split would depend on which cells were executed
# earlier in the kernel; a seeded KFold yields the same three folds every time.
cv_folds = KFold(n_splits=3, random_state=my_seed, shuffle=True)

gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mse", "mae"], cv=cv_folds, n_jobs=-1)
gs.fit(data)
print(gs.best_score)
print(gs.best_params)

# save GridSearchCV object to file in models folder
joblib.dump(gs, '../models/surp_gridsearchcv_NMF.pkl')

# send completion message via email (server, sender, recipient according to .env)
ssm.sendstatus("NMF")