### Set random seed and relevant measure

In [None]:
"""Make the experiments reproducible.

Several algorithms initialise their parameters randomly (some of them through
numpy) and the cross-validation folds are drawn at random as well, so both
random number generators are seeded once, before anything else runs.
"""
import random
import numpy as np

# error measure whose GridSearch winner parameters are looked up in the cells below
measure = 'mae'

# a single seed shared by Python's and numpy's random number generators
my_seed = 42
np.random.seed(my_seed)
random.seed(my_seed)


### Import data

In [None]:
import send_status_mail as ssm
import joblib
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# load the preprocessed MovieLens ratings
df = joblib.load('../data/processed/preprocessed_data_movielens.pkl')

# Keep only the three columns Surprise needs, in the order it requires
# (raw user id, raw item id, rating). Selecting them explicitly discards every
# other column, so no separate drop of 'title', 'genres', ... is needed.
# reset_index(drop=True) throws away the index left over from preprocessing and
# works whatever that index is called; reset_index() followed by
# drop(columns=['index']) fails as soon as the index carries a name.
df = df[['userId', 'movieId', 'rating']].reset_index(drop=True)

# Load the data into Surprise format; the rating scale runs from 0.5 to 5.0
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df, reader)

### 5-fold cross-validation of all models

#### KNN models

In [None]:
from surprise import KNNBasic

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_knnBasic.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = KNNBasic(
    sim_options=best_params['sim_options'],
    k=best_params['k'],
    min_k=best_params['min_k'],
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_knnBasic = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_knnBasic)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("knnBasic cv")

In [None]:
from surprise import KNNWithMeans

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_knnMeans.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = KNNWithMeans(
    sim_options=best_params['sim_options'],
    k=best_params['k'],
    min_k=best_params['min_k'],
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_knnMeans = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_knnMeans)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("knnMeans cv")

In [None]:
from surprise import KNNBaseline

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_knnBaseline.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = KNNBaseline(
    sim_options=best_params['sim_options'],
    k=best_params['k'],
    min_k=best_params['min_k'],
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_knnBaseline = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_knnBaseline)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("knnBaseline cv")

In [None]:
from surprise import KNNWithZScore

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_knnZScore.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = KNNWithZScore(
    sim_options=best_params['sim_options'],
    k=best_params['k'],
    min_k=best_params['min_k'],
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_knnZScore = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_knnZScore)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("knnZScore cv")

#### matrix factorization models

In [None]:
from surprise import SVD

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_SVD.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = SVD(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    biased=best_params['biased'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all'],
    random_state=42,
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_SVD = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_SVD)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("SVD cv")

In [None]:
from surprise import NMF

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_NMF.pkl')

# parameter set that won the GridSearch for the desired measure, e.g. MAE
best_params = gs_result.best_params[measure]
algo = NMF(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    biased=best_params['biased'],
    reg_pu=best_params['reg_pu'],
    reg_qi=best_params['reg_qi'],
    random_state=42,
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_NMF = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_NMF)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("NMF cv")

#### other models

In [None]:
from surprise import NormalPredictor

# no GridSearchCV result is loaded here: the model is created without any parameters
algo = NormalPredictor()

# 5-fold cross-validation that also reports the measures on the training sets
cv_rand = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_rand)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("rand cv")

In [None]:
from surprise import SlopeOne

# no GridSearchCV result is loaded here: the model is created without any parameters
algo = SlopeOne()

# 5-fold cross-validation that also reports the measures on the training sets
cv_SlopeOne = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_SlopeOne)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("SlopeOne cv")

In [None]:
from surprise import BaselineOnly

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_BaselineOnly.pkl')

# baseline options that won the GridSearch for the desired measure
best_params = gs_result.best_params[measure]
algo = BaselineOnly(bsl_options=best_params['bsl_options'])

# 5-fold cross-validation that also reports the measures on the training sets
cv_BaselineOnly = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_BaselineOnly)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("BaselineOnly cv")

In [None]:
from surprise import CoClustering

# results of the earlier parameter tuning
gs_result = joblib.load('../models/surp_gridsearchcv_CoClustering.pkl')

# parameter set that won the GridSearch for the desired measure
best_params = gs_result.best_params[measure]
algo = CoClustering(
    n_cltr_u=best_params['n_cltr_u'],
    n_cltr_i=best_params['n_cltr_i'],
    n_epochs=best_params['n_epochs'],
    random_state=42,
)

# 5-fold cross-validation that also reports the measures on the training sets
cv_CoClustering = cross_validate(
    algo,
    data,
    measures=["MAE", "MSE", "RMSE"],
    cv=5,
    return_train_measures=True,
    n_jobs=-1,
    verbose=True,
)

print(cv_CoClustering)

# e-mail a completion message (server, sender, recipient according to .env)
ssm.sendstatus("CoClustering cv")

### concatenate and export results

In [None]:
# gather the cross-validation results of all models in a single dict,
# keyed by the name of the variable that holds each result
surp_cv_results = {
    'cv_knnBasic': cv_knnBasic,
    'cv_knnMeans': cv_knnMeans,
    'cv_knnBaseline': cv_knnBaseline,
    'cv_knnZScore': cv_knnZScore,
    'cv_SVD': cv_SVD,
    'cv_NMF': cv_NMF,
    'cv_rand': cv_rand,
    'cv_SlopeOne': cv_SlopeOne,
    'cv_BaselineOnly': cv_BaselineOnly,
    'cv_CoClustering': cv_CoClustering,
}

# persist the dict with joblib
# NOTE(review): joblib.dump writes a pickle, so the '.json' suffix is misleading;
# the file has to be read back with joblib.load, not with a JSON parser
joblib.dump(surp_cv_results, '../models/surp_cv_results.json')
