In [None]:
import random
import joblib
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go 
from plotly.subplots import make_subplots

In [None]:
"""Some algorithms randomly initialize their parameters (sometimes with numpy), and the cross-validation folds are also randomly generated. 
If you need to reproduce your experiments multiple times, you just have to set the seed of the RNG at the beginning of your program:"""

# Single seed shared by every random number generator seeded in this notebook.
my_seed = 42

# Seed numpy's global generator and Python's built-in one; the two are
# independent of each other, so the order of these calls does not matter.
np.random.seed(my_seed)
random.seed(my_seed)

<div class="alert alert-block alert-danger"><b>Caution:</b> Always clear all output before pushing to GitHub! This reduces the notebook size from 70MB to under 1MB.</div>

### Loading data and preparing data

In [None]:
# Load the preprocessed MovieLens data into a DataFrame.
# Only the three columns Surprise needs are read (usecols), so the other
# columns of the file never have to be held in memory and no in-place drop
# is required afterwards.
rating_columns = ['userId', 'movieId', 'rating']
df = pd.read_csv('../data/processed/preprocessed_data_movielens.csv', usecols=rating_columns)

# usecols keeps the file's column order; re-select explicitly so the frame is
# ordered (raw user id, raw item id, rating) as the Surprise loader below expects.
df = df[rating_columns]
df.head()

In [None]:
# Wrap the ratings frame in a Surprise Dataset. The frame's columns were put
# in the order (raw user id, raw item id, rating) when it was loaded, and the
# reader declares the rating scale as 0.5 to 5.0.
reader = Reader(
    rating_scale=(0.5, 5.0),
)
data = Dataset.load_from_df(df=df, reader=reader)

In [None]:
# Split the data into training and test sets: 20 % of the ratings are held out.
# The split reuses my_seed from the seed cell instead of a second hard-coded 42,
# so changing the seed in one place changes it for the whole notebook.
trainset, testset = train_test_split(data, test_size=0.2, random_state=my_seed)

### Defining options

In [None]:
# Similarity settings shared by all KNN models below:
# cosine similarity, computed between items rather than between users.
sim_options = dict(
    name='cosine',
    user_based=False,
)

### Trying different models

#### knn models

In [None]:
from surprise import KNNBasic

# KNNBasic with the shared sim_options (item-based cosine similarity).
algo = KNNBasic(sim_options=sim_options)

# Train the model on the training split
algo.fit(trainset)

# Predict every rating of the held-out test split
predictions = algo.test(testset)

# Performance metrics. `accuracy` is already imported in the first cell, and
# verbose=True makes each helper print its score, so the additional print()
# calls (which showed every number a second time) are not needed. This also
# matches the other model cells.
mae_knnb = accuracy.mae(predictions, verbose=True)
mse_knnb = accuracy.mse(predictions, verbose=True)
rmse_knnb = accuracy.rmse(predictions, verbose=True)

In [None]:
from surprise import KNNWithMeans

# KNNWithMeans with the shared sim_options (item-based cosine similarity):
# fit on the training split, then predict the held-out test split.
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_knnm, mse_knnm, rmse_knnm = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
from surprise import KNNBaseline

# KNNBaseline with the shared sim_options (item-based cosine similarity):
# fit on the training split, then predict the held-out test split.
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_knnbl, mse_knnbl, rmse_knnbl = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
from surprise import KNNWithZScore

# KNNWithZScore with the shared sim_options (item-based cosine similarity):
# fit on the training split, then predict the held-out test split.
algo = KNNWithZScore(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_knnz, mse_knnz, rmse_knnz = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

#### matrix factorization models

In [None]:
from surprise import SVD

# SVD with its default hyper-parameters: fit on the training split, then
# predict the held-out test split.
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_svd, mse_svd, rmse_svd = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
from surprise import NMF

# NMF with its default hyper-parameters: fit on the training split, then
# predict the held-out test split.
algo = NMF()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_nmf, mse_nmf, rmse_nmf = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

#### other models

In [None]:
from surprise import NormalPredictor

# NormalPredictor takes no sim_options; it is built with defaults, fitted on
# the training split and evaluated on the held-out test split.
algo = NormalPredictor()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_rand, mse_rand, rmse_rand = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
# estimations = []
# for _, _, _, est, _ in predictions:
#     estimations.append(est)

In [None]:
# print('mean real ratings:', np.mean(df.rating))
# print('mean estimation:', np.mean(estimations))
# plt.subplot(121)
# plt.hist(df.rating, bins=10, rwidth=0.8)
# plt.subplot(122)
# plt.hist(estimations, bins=10, rwidth=0.8);

In [None]:
from surprise import BaselineOnly

# BaselineOnly with its default settings: fit on the training split, then
# predict the held-out test split.
algo = BaselineOnly()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_base, mse_base, rmse_base = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
from surprise import SlopeOne

# SlopeOne: fit on the training split, then predict the held-out test split.
algo = SlopeOne()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_so, mse_so, rmse_so = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

In [None]:
from surprise import CoClustering

# CoClustering with its default hyper-parameters: fit on the training split,
# then predict the held-out test split.
algo = CoClustering()
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE in that order; verbose=True prints each score as it is
# computed. `accuracy` comes from the import cell at the top.
mae_cc, mse_cc, rmse_cc = [
    score(predictions, verbose=True)
    for score in (accuracy.mae, accuracy.mse, accuracy.rmse)
]

### Graphical interpretation of performance metrics

<div class="alert alert-block alert-danger"><b>Caution:</b> Always clear all output before pushing to GitHub! This reduces the notebook size from 70MB to under 1MB.</div>

#### with plotly

In [None]:
# [MAE, MSE, RMSE] for every model evaluated above. These lists are also what
# the export cell further down stores, so each keeps its own name.
metrics_rand = [mae_rand, mse_rand, rmse_rand]
metrics_knnb = [mae_knnb, mse_knnb, rmse_knnb]
metrics_knnm = [mae_knnm, mse_knnm, rmse_knnm]
metrics_knnbl = [mae_knnbl, mse_knnbl, rmse_knnbl]
metrics_knnz = [mae_knnz, mse_knnz, rmse_knnz]
metrics_so = [mae_so, mse_so, rmse_so]
metrics_base = [mae_base, mse_base, rmse_base]
metrics_svd = [mae_svd, mse_svd, rmse_svd]
metrics_nmf = [mae_nmf, mse_nmf, rmse_nmf]
metrics_cc = [mae_cc, mse_cc, rmse_cc]

# (legend name, metrics) pairs in the order the bars appear in the figure.
bar_specs = [
    ('NormalPredictor', metrics_rand),
    ('KNNBasic', metrics_knnb),
    ('KNNWithMeans', metrics_knnm),
    ('KNNBaseline', metrics_knnbl),
    ('KNNWithZScore', metrics_knnz),
    ('SlopeOne', metrics_so),
    ('BaselineOnly', metrics_base),
    ('SVD', metrics_svd),
    ('NMF', metrics_nmf),
    ('CoClustering', metrics_cc),
]

# One vertical bar trace per model, grouped by metric on the x axis.
fig = go.Figure(data=[
    go.Bar(x=['MAE', 'MSE', 'RMSE'], y=scores, name=label, orientation='v')
    for label, scores in bar_specs
])

# Title and axis title
fig.update_layout(
    title='Performance metrics for different models of the Surprise library',
    title_x=0.5,
    title_y=0.87,
    xaxis_title='Metric',
)
# Fixed figure size
fig.update_layout(autosize=False, width=1000, height=400)
fig.show()

In [None]:
# Histogram of all ratings in the loaded DataFrame.
fig = go.Figure(data=[go.Histogram(x=df.rating)])
fig.update_layout(
    # Title and axis titles
    title='Rating distribution',
    title_x=0.5,
    xaxis_title='rating',
    yaxis_title='frequency',
    # Fixed figure size
    autosize=False,
    width=600,
    height=400,
    # Gap between neighbouring bars
    bargap=0.2,
)
fig.show()

### concat and export

In [None]:
# Map each model name to its [MAE, MSE, RMSE] list (built in the plotting cell).
# joblib is already imported in the first cell.
surp_metrics_default_models = {
    'knnBasic': metrics_knnb,
    'knnMeans': metrics_knnm,
    'knnBaseline': metrics_knnbl,
    'knnZScore': metrics_knnz,
    'SVD': metrics_svd,
    'NMF': metrics_nmf,
    'NormalPredictor': metrics_rand,
    'SlopeOne': metrics_so,
    'BaselineOnly': metrics_base,
    'CoClustering': metrics_cc,
}

# Persist the dict as a pickle file in the models folder
joblib.dump(surp_metrics_default_models, '../models/surp_metrics_default_models.pkl')

In [None]:
# Reload the metrics dict written by the export cell above.
# NOTE: joblib.load unpickles the file, so only load files this project created.
metrics = joblib.load('../models/surp_metrics_default_models.pkl')

# The dict maps model name -> [MAE, MSE, RMSE]. Building the frame with the
# metric names as index and transposing gives one row per model and one
# column per metric.
df_metrics = pd.DataFrame(metrics, index=['MAE', 'MSE', 'RMSE']).T

display(df_metrics)

<div class="alert alert-block alert-danger"><b>Caution:</b> Always clear all output before pushing to GitHub! This reduces the notebook size from 70MB to under 1MB.</div>

### Comparison with whole dataset? (inactive)

In [None]:
# import os

# from surprise import BaselineOnly, Dataset, Reader
# from surprise.model_selection import cross_validate

# # path to dataset file
# file_path = '../data/raw/ml-25m/ratings.csv'

# # As we're loading a custom dataset, we need to define a reader. In the movielens-25M dataset, each line has the following format:
# # 'user item rating timestamp', separated by ',' characters.
# # the rating scale differs from the default which is (1,5)
# # the first line of the csv-file holds the column labels and has to be skipped
# reader = Reader(line_format="user item rating timestamp", sep=",", rating_scale=(0.5,5.0), skip_lines=1)

# data_raw = Dataset.load_from_file(file_path, reader=reader)

# # We can now use this dataset as we please, e.g. calling cross_validate
# cross_validate(BaselineOnly(), data_raw, verbose=True)

### Parameter tuning

In [None]:
# # define sim_options to be tested
# sim_options = {
# "name": ["msd", "cosine", "pearson"],
# "min_support": [3, 4, 5],
# "user_based": [False], # only item-base approach, since it is generally better suited for the task and user based would require enormous amounts of memory
# }
# param_grid = {"sim_options": sim_options,
#               "k": [20, 30, 40], # The (max) number of neighbors to take into account for aggregation
#               "min_k": [1, 2, 3]} # The minimum number of neighbors to take into account for aggregation
# gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mse", "mae"], cv=3)
# gs.fit(data)
# print(gs.best_score["rmse"])
# print(gs.best_params["rmse"])

In [None]:
# print(gs.best_params)

In [None]:
# # save GridSearchCV object to file in models folder
# import joblib
# joblib.dump(gs, '../models/surp_gridsearchcv_knnMeans.pkl')

In [None]:
# # try if it can be loaded successfully
# import joblib
# gs_loaded = joblib.load('../models/surp_gridsearchcv_knnMeans.pkl')


In [None]:
# print(gs_loaded.best_score)
# print(gs_loaded.best_params['rmse']['sim_options'])
# print(gs_loaded.best_params['rmse']['k'])
# print(gs_loaded.best_params['rmse']['min_k'])
# gs_loaded.best_params

In [None]:
# algo = KNNWithMeans(sim_options=gs_loaded.best_params['rmse']['sim_options'],
#                     k=gs_loaded.best_params['rmse']['k'],
#                     min_k=gs_loaded.best_params['rmse']['min_k'])

# # Train the model
# algo.fit(trainset)

# # Generating Predictions
# predictions = algo.test(testset)

# # Calculating performance metrics
# from surprise import accuracy

# mae_knnm = accuracy.mae(predictions, verbose=True)
# mse_knnm = accuracy.mse(predictions, verbose=True)
# rmse_knnm = accuracy.rmse(predictions, verbose=True)

### Cross-validation

In [None]:
# import random
# import numpy as np
# from surprise import KNNWithMeans
# from surprise.model_selection import cross_validate

# my_seed = 42
# random.seed(my_seed)
# np.random.seed(my_seed)

# measure = 'mae'
# # We can now use this dataset as we please, e.g. calling cross_validate
# cv_knnMeans = cross_validate(KNNWithMeans(sim_options=gs_loaded.best_params[measure]['sim_options'],
#                                       k=gs_loaded.best_params[measure]['k'],
#                                       min_k=gs_loaded.best_params[measure]['min_k']),
#                         data, measures=["MAE", "MSE", "RMSE"], n_jobs=-1, verbose=True)

# print(cv_knnMeans)

In [None]:
# # save cross-validation object to file in models folder
# joblib.dump(cv_knnMeans, '../models/surp_cv_GS_winner_knnMeans.pkl')

In [None]:
# cv_loaded = joblib.load('../models/surp_cv_GS_winner_knnMeans.pkl')

In [None]:
# cv_loaded

In [None]:
# # We can now use this dataset as we please, e.g. calling cross_validate
# sim_options = {
#     'name': 'cosine',
#     'user_based': False  # Compute similarities between items
# }
# cv_knnb = cross_validate(KNNBasic(sim_options=sim_options), data, verbose=True)
# print(cv_knnb)

In [None]:
# print('Element 0 of dict:', cv_knnb['test_rmse'][0])
# print('RSME mean:', cv_knnb['test_rmse'].mean())

In [None]:
# # We can now use this dataset as we please, e.g. calling cross_validate
# cross_validate(BaselineOnly(), data, verbose=True)

### Train on a whole trainset and the predict() method

In [None]:
# from surprise import KNNBasic

# # Retrieve the trainset.
# trainset_full = data.build_full_trainset()

# # Build an algorithm, and train it.
# sim_options_full = {
#     'name': 'cosine',
#     'user_based': False  # Compute similarities between items
# }
# algo_full = KNNBasic(sim_options=sim_options_full)
# algo_full.fit(trainset_full)

# uid = str(74244)  # raw user id (as in the ratings file). They are **strings**!
# iid = str(1)  # raw item id (as in the ratings file). They are **strings**!

# # get a prediction for specific users and items. r_ui represents the true, known rating.
# pred = algo_full.predict(uid, iid, r_ui=4, verbose=True)

# # Note: The predict() uses raw ids. As the dataset we have used has been read from a file, the raw ids are strings (even if they represent numbers).

In [None]:
# uid = str(124114)  # raw user id (as in the ratings file). They are **strings**!
# iid = str(542)  # raw item id (as in the ratings file). They are **strings**!

# # get a prediction for specific users and items. r_ui represents the true, known rating.
# pred = algo_full.predict(uid, iid, r_ui=4, verbose=True)