In [1]:
import os
# Switch the working directory to the sibling `movies` folder before importing.
# NOTE(review): presumably this is what makes the project-local `movieLens`
# module importable (and lets it find its data files) — confirm.
os.chdir('../movies')
from movieLens import MovieLens

# Instantiate the MovieLens helper; its `ratings` attribute is read further down
ml = MovieLens()

# Algorithm

In [2]:
from surprise import Dataset, Reader, SVD, SVDpp
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
import pandas as pd
import numpy as np

In [3]:
# Work on a copy of the ratings table so the MovieLens object is not modified
ratings = ml.ratings.copy()

# Describe the rating scale to Surprise using the bounds observed in the data
lowestRating = ratings['rating'].min()
highestRating = ratings['rating'].max()
reader = Reader(rating_scale=(lowestRating, highestRating))

# Wrap the user / item / rating columns as a Surprise dataset
ratingColumns = ['userId', 'movieId', 'rating']
ratingsDataset = Dataset.load_from_df(ratings[ratingColumns], reader)

# Hold out 20% of the ratings as a test set (seeded so the split is repeatable)
trainSet, testSet = train_test_split(ratingsDataset, test_size=0.2, random_state=42)

# Anti-testset built from the training split: the user/item pairs with no
# rating in trainSet, later scored to produce recommendations
antitest = trainSet.build_anti_testset()

## 1. SVD (Singular Value Decomposition)

In [4]:
SVD = SVD()
SVD.fit(trainSet)

# Get test and antitest predictions
predtest = SVD.test(testSet)
predantitest = SVD.test(antitest)

In [5]:
# Perform Hyperparameter tuning
print("Searching for best parameters...")
param_grid = {'n_epochs': [20, 30], 'lr_all': [0.005, 0.010], 'n_factors': [50, 100]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(trainSet)

params = gs.best_params['rmse']
SVDtuned = SVD(n_epochs=params['n_epochs'], lr_all=params['lr_all'], n_factors=params['n_factors'])

# Get test predictions
predtestTuned = SVDtuned.test(testSet)

Searching for best parameters...


AttributeError: 'Trainset' object has no attribute 'raw_ratings'

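Note: the `GridSearchCV` used in this notebook comes from Surprise (https://surprise.readthedocs.io/en/stable/model_selection.html); it takes the algorithm class and its `fit` method expects a `Dataset`, not a `Trainset`. The scikit-learn equivalent, for comparison: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html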

## 2. SVDPlusPlus

In [6]:
# Train SVD++ on the training split (Surprise's fit() returns the fitted model)
SVDPlusPlus = SVDpp().fit(trainSet)

# Score the held-out test ratings first, then the anti-testset pairs
predtestPlusPlus, predantitestPlusPlus = (
    SVDPlusPlus.test(testSet),
    SVDPlusPlus.test(antitest),
)

# Metrics

In [7]:
# NOTE(review): `metrics` looks like a project-local module, presumably found
# thanks to the os.chdir('../movies') in the first cell — confirm.
from metrics import evaluationMetrics
# Single helper instance reused by every metric computation below
em = evaluationMetrics()

In [8]:
# Build per-model recommendation lists from the anti-testset predictions.
# NOTE(review): the variable names say "top 10" but N is not passed, so the
# list length relies on getTopN's default — confirm in metrics.py.
minimumEstimatedRating = 3.5
top_10_SVD = em.getTopN(predantitest, minimumRating=minimumEstimatedRating)
top_10_SVD_PlusPlus = em.getTopN(predantitestPlusPlus, minimumRating=minimumEstimatedRating)

## Métricas de precisión: RMSE y MAE

In [9]:
# Test-set predictions, in the order the models are reported: SVD, then SVD++
testPredictions = (predtest, predtestPlusPlus)

# Root mean squared error (accuracy.rmse also prints each value as it runs)
rmse, rmsePlusPlus = [accuracy.rmse(preds) for preds in testPredictions]

# Mean absolute error (accuracy.mae also prints each value as it runs)
mae, maePlusPlus = [accuracy.mae(preds) for preds in testPredictions]

RMSE: 0.8793
RMSE: 0.8694
MAE:  0.6763
MAE:  0.6664


## Métricas de relevancia: Precision, Recall y NDCG

In [10]:
# Shared ranking settings for precision and recall.
# NOTE(review): threshold=3.5 is presumably the rating from which an item
# counts as relevant — confirm in metrics.py.
rankingArgs = {'k': 10, 'threshold': 3.5}


def _meanOfValues(perKeyScores):
    """Return the arithmetic mean of the values of a dict of scores."""
    return np.mean(list(perKeyScores.values()))


# Precision at k (the results are dicts; their values are averaged below)
precisions = em.getPrecision(predtest, **rankingArgs)
precisionsPlusPlus = em.getPrecision(predtestPlusPlus, **rankingArgs)

# Mean of the precision values (reported as MAP in the summary table)
mapSVD = _meanOfValues(precisions)
mapSVDPlusPlus = _meanOfValues(precisionsPlusPlus)

# Recall at k (same structure as the precision results)
recalls = em.getRecall(predtest, **rankingArgs)
recallsPlusPlus = em.getRecall(predtestPlusPlus, **rankingArgs)

# Mean of the recall values (reported as MAR in the summary table)
marSVD = _meanOfValues(recalls)
marSVDPlusPlus = _meanOfValues(recallsPlusPlus)

# Normalized discounted cumulative gain (NDCG); getNDCG returns a pair that is
# unpacked here as (per-entry scores, mean score)
ndcgCutoff = 10
ndcgs, mean_ndcg = em.getNDCG(predtest, ndcgCutoff)
ndcgsPlusPlus, mean_ndcgPlusPlus = em.getNDCG(predtestPlusPlus, ndcgCutoff)

## Otras métricas de interés: Coverage, User Coverage y Novelty

In [11]:
# Sizes of the training split, read once and reused below
itemCount = trainSet.n_items
userCount = trainSet.n_users

# Coverage: computed from the recommendation lists, the number of training
# items and the training users
coverage = em.getCoverage(top_10_SVD, itemCount, trainSet.all_users())
coveragePlusPlus = em.getCoverage(top_10_SVD_PlusPlus, itemCount, trainSet.all_users())

# User coverage.
# NOTE(review): the literal 4 is presumably a minimum predicted-rating
# threshold — confirm against getUserCoverage in metrics.py.
userCoverageArg = 4
user_coverage = em.getUserCoverage(top_10_SVD, userCount, userCoverageArg)
user_coveragePlusPlus = em.getUserCoverage(top_10_SVD_PlusPlus, userCount, userCoverageArg)

# Novelty: computed from the recommendation lists and the training set
novelty = em.getNovelty(top_10_SVD, trainSet)
noveltyPlusPlus = em.getNovelty(top_10_SVD_PlusPlus, trainSet)

Por último, creamos un DataFrame con todas las métricas de evaluación asociadas a cada uno de los modelos.

In [12]:
# Column order of the summary table; each row tuple below follows this order
cols = ["Model", "RMSE", "MAE", "MAP", "MAR", "Mean_NDCG", "Coverage", "User_Coverage", "Novelty"]

svdRow = ("SVD", rmse, mae, mapSVD, marSVD, mean_ndcg,
          coverage, user_coverage, novelty)
svdPlusPlusRow = ("SVDPlusPlus", rmsePlusPlus, maePlusPlus, mapSVDPlusPlus, marSVDPlusPlus,
                  mean_ndcgPlusPlus, coveragePlusPlus, user_coveragePlusPlus, noveltyPlusPlus)

# One dictionary per model, keyed by column name
metrics_data = [dict(zip(cols, row)) for row in (svdRow, svdPlusPlusRow)]

# Build the comparison table and display it as the cell output
metrics_df = pd.DataFrame(metrics_data, columns=cols)
metrics_df

Unnamed: 0,Model,RMSE,MAE,MAP,MAR,Mean_NDCG,Coverage,User_Coverage,Novelty
0,SVD,0.879299,0.676252,0.74348,0.45381,0.953781,0.038194,0.921311,406.011905
1,SVDPlusPlus,0.869411,0.666444,0.739197,0.452484,0.955526,0.03349,0.913115,770.394635
