In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from funciones import (preprocesar_dataframe_animes, 
                       rating_average, correlation_similarity, jmsd_similarity, # Funciones de similitud y avg
                       get_neighbors, # Búsqueda de vecinos
                       average_prediction, weighted_average_prediction, deviation_from_mean_prediction, # Agregación
                       get_recommendations, # Para métricas de ranking
                       get_mae, get_rmse, get_precision, get_recall, get_f1, get_ndcg, # Métricas globales
                       # No necesitas importar las get_user_* de métricas aquí si solo usas las globales
                       # a menos que quieras hacer análisis por usuario.
                       # También importa has_test_ratings, get_ordered_test_animes, get_user_idcg, get_user_dcg si son usadas por las métricas globales
                       # que importas, lo cual parece ser el caso para nDCG.
                       get_ordered_test_animes, get_user_idcg, get_user_dcg, # Necesarias para get_ndcg
                       get_metricas # Tu función para imprimir todas las métricas
                      )
RANDOM_STATE = 42 


In [7]:
PATH_RATINGS_CSV = 'csv/rating.csv'    # Ajusta esta ruta
MIN_USER_RATINGS = 5                   # Tu umbral
MIN_ANIME_RATINGS = 5                  # Tu umbral
SAMPLE_NUM_USERS_FOR_DEV = 1000

(train_df, test_df, 
 ratings_train_matrix, ratings_test_matrix,
 NUM_USERS, NUM_ANIMES, MIN_RATING, MAX_RATING, SCORES
) = preprocesar_dataframe_animes( # Tu función importada
    dataframe_path=PATH_RATINGS_CSV, # El nombre del parámetro en tu función
    n_user_ratings=MIN_USER_RATINGS, # El nombre del parámetro en tu función
    n_anime_ratings=MIN_ANIME_RATINGS, # El nombre del parámetro en tu función
    num_users=SAMPLE_NUM_USERS_FOR_DEV,    # El nombre del parámetro en tu función
    test_size=0.2,
    RANDOM_STATE=RANDOM_STATE
)


print(f"  NUM_USERS: {NUM_USERS}, NUM_ANIMES: {NUM_ANIMES}")
print(f"  Tamaño train_df: {len(train_df)}, Tamaño test_df: {len(test_df)}")

  NUM_USERS: 1000, NUM_ANIMES: 3914
  Tamaño train_df: 83964, Tamaño test_df: 20968


In [8]:
train_df.head()

Unnamed: 0,user_id,anime_id,rating
49219,463,3777,7
94597,861,506,9
34307,318,1997,7
59130,552,1948,8
64556,588,39,9


In [16]:
print(sorted(train_df.rating.unique()) )

[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]


In [20]:
# los valores posibles de rating
SCORES = sorted(train_df.rating.unique())
NUM_FACTORS = 7

# Inicialización aleatoria de U y V
U = [[[random.random() for _ in range(NUM_FACTORS)] for _ in range(NUM_USERS)] for _ in range(len(SCORES))]
V = [[[random.random() for _ in range(NUM_FACTORS)] for _ in range(NUM_ANIMES)] for _ in range(len(SCORES))]

definimos la funcion `logit` que sirve para convertir un rating en una probabilidad interpretable para el modelo

In [23]:
def logit(x):
    return 1 / (1 + np.exp(-x))

In [25]:
NUM_ITERATIONS = 35
LEARNING_RATE = 0.001
REGULARIZATION = 0.1

for it in range(NUM_ITERATIONS):
    print(f"iter {it+1}/{NUM_ITERATIONS}")
    for s_idx, s in enumerate(SCORES):

        # users
        for u in range(NUM_USERS):
            delta = np.zeros(NUM_FACTORS)
            for i in range(NUM_ANIMES):
                if ratings_train_matrix[u][i] is not None:
                    dot = np.dot(U[s_idx][u], V[s_idx][i])
                    for f in range(NUM_FACTORS):
                        if ratings_train_matrix[u][i] == s:
                            delta[f] += (1 - logit(dot)) * V[s_idx][i][f]
                        else:
                            delta[f] -= logit(dot) * V[s_idx][i][f]
                    for f in range(NUM_FACTORS):
                        U[s_idx][u][f] += LEARNING_RATE * (delta[f] - REGULARIZATION * U[s_idx][u][f])

        #items
        for i in range(NUM_ANIMES):
            theta = np.zeros(NUM_FACTORS)
            for u in range(NUM_USERS):
                if ratings_train_matrix[u][i] is not None:
                    dot = np.dot(U[s_idx][u], V[s_idx][i])
                    for f in range(NUM_FACTORS):
                        if ratings_train_matrix[u][i] == s:
                            theta[f] += (1 - logit(dot)) * U[s_idx][u][f]
                        else:
                            theta[f] -= logit(dot) * U[s_idx][u][f]
                    for f in range(NUM_FACTORS):
                        V[s_idx][i][f] += LEARNING_RATE * (theta[f] - REGULARIZATION * V[s_idx][i][f])


iter 1/35


  return 1 / (1 + np.exp(-x))


iter 2/35
iter 3/35
iter 4/35
iter 5/35
iter 6/35
iter 7/35
iter 8/35
iter 9/35
iter 10/35
iter 11/35
iter 12/35
iter 13/35
iter 14/35
iter 15/35
iter 16/35
iter 17/35
iter 18/35
iter 19/35
iter 20/35
iter 21/35
iter 22/35
iter 23/35
iter 24/35
iter 25/35
iter 26/35
iter 27/35
iter 28/35
iter 29/35
iter 30/35
iter 31/35
iter 32/35
iter 33/35
iter 34/35
iter 35/35


In [26]:
def compute_prediction(u, i):
    prediction = None
    prob = 0
    for s_idx, s in enumerate(SCORES):
        dot = np.dot(U[s_idx][u], V[s_idx][i])
        p = logit(dot)
        if p > prob:
            prob = p
            prediction = s
    return prediction


In [27]:
# hacemos las predicciones
predictions_bernoulli = [[None for _ in range(NUM_ANIMES)] for _ in range(NUM_USERS)]
for u in range(NUM_USERS):
    for i in range(NUM_ANIMES):
        if ratings_test_matrix[u][i] is not None:
            predictions_bernoulli[u][i] = compute_prediction(u, i)

# Evaluación
print("metricas de Bernoulli Matrix Factorization:")
get_metricas(ratings_test_matrix, NUM_ANIMES, NUM_USERS, predictions_bernoulli, theta=7, N=10)


metricas de Bernoulli Matrix Factorization:
MAE =  1.252986384148154
RMSE =  1.5965352396166503
Precision =  0.8930103174603146
Recall =  0.7155707905937282
F1 =  0.7402776022257248
