In [1]:
import pandas as pd
import numpy as np

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

# FILES_PATH (taken from the environment or a .env file) is the root folder
# under which this notebook reads its CSV input and writes its PNG figures.
data_path = os.getenv("FILES_PATH")
if data_path is None:
    # os.getenv returns None for a missing variable; without this check the
    # failure would surface as an opaque TypeError inside os.path.join below.
    raise RuntimeError(
        "FILES_PATH is not set; define it in the environment or in a .env file."
    )

# Folder that receives every figure saved by save_fig().
IMAGES_PATH = os.path.join(data_path, "PNG", "collaborative_filtering")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    The notebook-level names ``plt`` and ``IMAGES_PATH`` are looked up when the
    function is called, so both must exist by then.

    Args:
        fig_id: File name of the figure, without the extension.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + extension
    destination = os.path.join(IMAGES_PATH, file_name)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=extension, dpi=resolution)

In [3]:
from matplotlib import pyplot as plt

# Matplotlib defaults shared by every figure in this notebook:
# 14 pt for general text, axis labels, titles and legends; 10 pt for tick labels.
plt.rcParams.update(
    {
        "font.size": 14,
        "axes.labelsize": 14,
        "axes.titlesize": 14,
        "legend.fontsize": 14,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
    }
)

In [4]:
# Read the ratings table stored under <data_path>/CSV.
ratings_csv_path = os.path.join(data_path, "CSV", "ratings_small.csv")
df = pd.read_csv(ratings_csv_path, low_memory=False)

# Collaborative Filtering

El filtrado colaborativo es un tipo de sistema de recomendación que genera sugerencias para un usuario basándose en su similitud con otros usuarios y en las valoraciones que estos han dado a los productos que han visto (en nuestro caso, películas). Vamos a realizar un estudio de algoritmos de filtrado colaborativo con nuestro set de datos **ratings_small.csv**, que ya está preparado para ser utilizado. En este fichero tenemos los IDs de diferentes usuarios y la valoración que han dado a ciertas películas, de las cuales solamente tenemos el ID.

Vamos a utilizar la librería [Surprise](https://surpriselib.com) para construir estos modelos sencillos. Esta librería es un [SciKit](https://projects.scipy.org/scikits.html) de _Python_ diseñado para el análisis y la implementación de modelos de sistemas de recomendación.

In [7]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD

# Columns handed to surprise, in the order it expects: user, item, rating.
relevant_cols = ["userId", "movieId", "rating"]

# Reader() defaults to a 1-5 rating scale. Take the scale from the data instead,
# so ratings outside that default range are represented correctly and
# predictions are clipped to the range the data actually uses.
rating_scale = (df["rating"].min(), df["rating"].max())
reader = Reader(rating_scale=rating_scale)

# Build a surprise Dataset from the ratings DataFrame
data = Dataset.load_from_df(df[relevant_cols], reader)

ModuleNotFoundError: No module named 'surprise'

In [6]:
# SVD matrix-factorisation model; hyperparameters are listed one per line.
svd_model = SVD(
    n_factors=100,
    n_epochs=50,
    biased=False,
    lr_all=1e-2,
    reg_all=0.1,
)

In [7]:
from surprise.model_selection import cross_validate

# 5-fold cross-validation of the SVD model; verbose=True prints the per-fold
# table, and the returned dict is shown as the cell output.
cross_validate(
    svd_model,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9203  0.9274  0.9267  0.9311  0.9188  0.9249  0.0046  
MAE (testset)     0.7149  0.7198  0.7222  0.7252  0.7130  0.7190  0.0045  
Fit time          0.90    0.79    0.81    0.78    0.78    0.81    0.05    
Test time         0.06    0.03    0.06    0.03    0.03    0.04    0.01    


{'test_rmse': array([0.92034923, 0.92743441, 0.92669736, 0.93113767, 0.91880171]),
 'test_mae': array([0.71489485, 0.71982049, 0.72223091, 0.72517207, 0.71302293]),
 'fit_time': (0.8988659381866455,
  0.7855687141418457,
  0.8089981079101562,
  0.7817060947418213,
  0.7769689559936523),
 'test_time': (0.06200718879699707,
  0.03414797782897949,
  0.05946493148803711,
  0.0339658260345459,
  0.03248095512390137)}

In [8]:
from surprise import KNNWithMeans, NormalPredictor, CoClustering

# Additional algorithms that are cross-validated alongside the SVD model.
np_model = NormalPredictor()
knn_model = KNNWithMeans(verbose=False)
ccl_model = CoClustering(
    n_cltr_u=5,
    n_cltr_i=5,
    n_epochs=50,
    random_state=42,
)

In [9]:
models = [svd_model, knn_model, np_model, ccl_model]

# One summary per algorithm: every cross-validation measure averaged over the
# folds, followed by the algorithm's name under the "Algorithm" label.
result_list = []
for model in models:
    cv_result = cross_validate(model, data, cv=5, measures=["RMSE", "MAE"], verbose=False)
    fold_means = pd.DataFrame.from_dict(cv_result).mean(axis=0)
    # Read the name from the class itself; parsing str(model) depends on the
    # default object repr and breaks if a model defines its own __str__/__repr__.
    algorithm_name = type(model).__name__
    summary = pd.concat([fold_means, pd.Series([algorithm_name], index=["Algorithm"])])
    result_list.append(summary)

In [10]:
# Comparison table indexed by algorithm, ordered from highest to lowest mean RMSE.
result_df = (
    pd.DataFrame(result_list)
    .set_index("Algorithm")
    .sort_values(by="test_rmse", ascending=False)
)
result_df.head()

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NormalPredictor,1.438581,1.149567,0.026529,0.045492
CoClustering,0.971696,0.751926,1.491682,0.050699
SVD,0.921309,0.715919,0.783217,0.043786
KNNWithMeans,0.918583,0.7031,0.032556,0.459519
