Creamos la matriz de ratings

In [1]:
import pandas as pd
import numpy as np
import math
import time
from funciones import (preprocesar_dataframe_animes, 
                       rating_average, correlation_similarity, jmsd_similarity, # Funciones de similitud y avg
                       get_neighbors, # Búsqueda de vecinos
                       average_prediction, weighted_average_prediction, deviation_from_mean_prediction, # Agregación
                       get_recommendations, # Para métricas de ranking
                       get_mae, get_rmse, get_precision, get_recall, get_f1, get_ndcg, # Métricas globales
                       # No necesitas importar las get_user_* de métricas aquí si solo usas las globales
                       # a menos que quieras hacer análisis por usuario.
                       # También importa has_test_ratings, get_ordered_test_animes, get_user_idcg, get_user_dcg si son usadas por las métricas globales
                       # que importas, lo cual parece ser el caso para nDCG.
                       get_ordered_test_animes, get_user_idcg, get_user_dcg, # Necesarias para get_ndcg
                       get_metricas # Tu función para imprimir todas las métricas
                      )
RANDOM_STATE = 42 


In [2]:
PATH_RATINGS_CSV = 'csv/rating.csv'    # Ajusta esta ruta
MIN_USER_RATINGS = 5                   # Tu umbral
MIN_ANIME_RATINGS = 5                  # Tu umbral
SAMPLE_NUM_USERS_FOR_DEV = 1000

(train_df, test_df, 
 ratings_train_matrix, ratings_test_matrix,
 NUM_USERS, NUM_ANIMES, MIN_RATING, MAX_RATING, SCORES
) = preprocesar_dataframe_animes( # Tu función importada
    dataframe_path=PATH_RATINGS_CSV, # El nombre del parámetro en tu función
    n_user_ratings=MIN_USER_RATINGS, # El nombre del parámetro en tu función
    n_anime_ratings=MIN_ANIME_RATINGS, # El nombre del parámetro en tu función
    num_users=SAMPLE_NUM_USERS_FOR_DEV,    # El nombre del parámetro en tu función
    test_size=0.2,
    RANDOM_STATE=RANDOM_STATE
)


print(f"  NUM_USERS: {NUM_USERS}, NUM_ANIMES: {NUM_ANIMES}")
print(f"  Tamaño train_df: {len(train_df)}, Tamaño test_df: {len(test_df)}")

  NUM_USERS: 1000, NUM_ANIMES: 3914
  Tamaño train_df: 83964, Tamaño test_df: 20968


In [3]:
# Cálculo del modelo base: predicción por media del usuario
from funciones import rating_average, get_metricas

# inicializamos la matriz de predicciones
predictions_media = [[None for _ in range(NUM_ANIMES)] for _ in range(NUM_USERS)]

# rellenamos la predicción con la media del usuario u
for u in range(NUM_USERS):
    avg_u = rating_average(ratings_train_matrix, NUM_ANIMES, u)
    for i in range(NUM_ANIMES):
        predictions_media[u][i] = avg_u

# evaluamos el modelo base
print("Modelo base (media del usuario):")
get_metricas(ratings_test_matrix, NUM_ANIMES, NUM_USERS, predictions_media, theta=7, N=10)


Modelo base (media del usuario):
MAE =  1.0278554327359957
RMSE =  1.2332543271239766
Precision =  0.916025641025641
Recall =  0.021991373376716507
F1 =  0.13880496180538138


Vamos a implementar ahora la JMSD similitud

In [5]:
from funciones import jmsd_similarity, get_neighbors, average_prediction, has_test_ratings

k = 10 

predictions_knn = [[None for _ in range(NUM_ANIMES)] for _ in range(NUM_USERS)]

for u in range(NUM_USERS):
    if has_test_ratings(ratings_test_matrix, NUM_ANIMES, u):
        
        similarities = [None if u == v else jmsd_similarity(
            ratings_train_matrix, NUM_ANIMES, MIN_RATING, MAX_RATING, u, v
        ) for v in range(NUM_USERS)]
        
        neighbors = get_neighbors(k, similarities)
        
        for i in range(NUM_ANIMES):
            if ratings_test_matrix[u][i] is not None:
                predictions_knn[u][i] = average_prediction(ratings_train_matrix, i, neighbors)

print("Modelo KNN (k=10, jmsd + average):")
get_metricas(ratings_test_matrix, NUM_ANIMES, NUM_USERS, predictions_knn, theta=7, N=10)


Modelo KNN (k=10, jmsd + average):
MAE =  1.2259263980624535
RMSE =  1.479130055834072
Precision =  0.9069769503546085
Recall =  0.7666369509971367
F1 =  0.7876908848425289


El KNN ha mejorado bastante el Recall y el F1 respecto al modelo base, que es lo que nos interesa para la recomendación real, pero ha empeorado el MAE y RMSE. Esto era esperable porque no predice tan bien valores exactos, sino que está buscando acertar los items más relevantes.

Igualmente la precisión es bastante alta.

In [6]:
ks = [5, 20, 40]

for k in ks:
    print(f"\n k evaluado = {k}")
    predictions_knn = [[None for _ in range(NUM_ANIMES)] for _ in range(NUM_USERS)]

    for u in range(NUM_USERS):
        if has_test_ratings(ratings_test_matrix, NUM_ANIMES, u):

            similarities = [
                None if u == v else jmsd_similarity(
                    ratings_train_matrix, NUM_ANIMES, MIN_RATING, MAX_RATING, u, v
                ) for v in range(NUM_USERS)
            ]

            neighbors = get_neighbors(k, similarities)

            for i in range(NUM_ANIMES):
                if ratings_test_matrix[u][i] is not None:
                    predictions_knn[u][i] = average_prediction(ratings_train_matrix, i, neighbors)

    print(f"\n métricas para KNN (k = {k}, jmsd + average):")
    get_metricas(ratings_test_matrix, NUM_ANIMES, NUM_USERS, predictions_knn, theta=7, N=10)



 k evaluado = 5

 métricas para KNN (k = 5, jmsd + average):
MAE =  1.220294728444326
RMSE =  1.47181896265817
Precision =  0.9101308235126809
Recall =  0.7956852987257566
F1 =  0.8106379525212875

 k evaluado = 20

 métricas para KNN (k = 20, jmsd + average):
MAE =  1.2189339836591828
RMSE =  1.4692976503572879
Precision =  0.9053714908828528
Recall =  0.7474217181394548
F1 =  0.7682610649469984

 k evaluado = 40

 métricas para KNN (k = 40, jmsd + average):
MAE =  1.17178385273052
RMSE =  1.4111055663517262
Precision =  0.9053096525966686
Recall =  0.7355213354375992
F1 =  0.7601104267780977


vamos a probar otras formas de prediccion

In [7]:
from funciones import jmsd_similarity, get_neighbors, average_prediction, weighted_average_prediction, deviation_from_mean_prediction, has_test_ratings, get_metricas

def evaluar_predicciones(k, metodo_pred):
    print(f"\nk = {k} y método: {metodo_pred.__name__}")
    predictions_knn = [[None for _ in range(NUM_ANIMES)] for _ in range(NUM_USERS)]

    for u in range(NUM_USERS):
        if has_test_ratings(ratings_test_matrix, NUM_ANIMES, u):

            similarities = [
                None if u == v else jmsd_similarity(
                    ratings_train_matrix, NUM_ANIMES, MIN_RATING, MAX_RATING, u, v
                ) for v in range(NUM_USERS)
            ]

            neighbors = get_neighbors(k, similarities)

            for i in range(NUM_ANIMES):
                if ratings_test_matrix[u][i] is not None:
                    if metodo_pred == deviation_from_mean_prediction:
                        predictions_knn[u][i] = metodo_pred(ratings_train_matrix, NUM_ANIMES, u, i, neighbors)
                    elif metodo_pred == weighted_average_prediction:
                        predictions_knn[u][i] = metodo_pred(ratings_train_matrix, i, neighbors, similarities)
                    else:
                        predictions_knn[u][i] = metodo_pred(ratings_train_matrix, i, neighbors)

    get_metricas(ratings_test_matrix, NUM_ANIMES, NUM_USERS, predictions_knn, theta=7, N=10)


# Ejecutamos para k = 5 y 10, con los tres métodos
for k in [5, 10]:
    for metodo in [average_prediction, weighted_average_prediction, deviation_from_mean_prediction]:
        evaluar_predicciones(k, metodo)



k = 5 y método: average_prediction
MAE =  1.220294728444326
RMSE =  1.47181896265817
Precision =  0.9101308235126809
Recall =  0.7956852987257566
F1 =  0.8106379525212875

k = 5 y método: weighted_average_prediction
MAE =  1.2220757342386632
RMSE =  1.4729041334256756
Precision =  0.9101308235126809
Recall =  0.796131376106811
F1 =  0.8109605824400361

k = 5 y método: deviation_from_mean_prediction
MAE =  1.0983929015237512
RMSE =  1.310906489393617
Precision =  0.911922536838549
Recall =  0.7964706879557367
F1 =  0.8117079788021256

k = 10 y método: average_prediction
MAE =  1.2259263980624535
RMSE =  1.479130055834072
Precision =  0.9069769503546085
Recall =  0.7666369509971367
F1 =  0.7876908848425289

k = 10 y método: weighted_average_prediction
MAE =  1.2275187871039943
RMSE =  1.4788640518981893
Precision =  0.9072960992907789
Recall =  0.7670610639054529
F1 =  0.7880267769162118

k = 10 y método: deviation_from_mean_prediction
MAE =  1.095439158821956
RMSE =  1.3098715393968687