In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import numpy as np


In [1]:
# Librerías
import pandas as pd
import numpy as np


movies_path = "ml-1m/movies.dat"
ratings_path = "ml-1m/ratings.dat"
users_path = "ml-1m/users.dat"


# Carga de los datos
users = pd.read_csv(users_path, sep="::", engine="python", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], encoding="latin-1")
movies = pd.read_csv(movies_path, sep="::", engine="python", names=["MovieID", "Title", "Genres"], encoding="latin-1")
ratings = pd.read_csv(ratings_path, sep="::", engine="python", names=["UserID", "MovieID", "Rating", "Timestamp"], encoding="latin-1")

# Mostrar primeras filas
ratings.head()



Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings['user'] = user_encoder.fit_transform(ratings['UserID'])
ratings['movie'] = movie_encoder.fit_transform(ratings['MovieID'])

# Guardamos número de usuarios y películas únicos
num_users = ratings['user'].nunique()
num_movies = ratings['movie'].nunique()

num_users, num_movies


(6040, 3706)

In [3]:
# Obtener lista única de géneros
movies['Genres'] = movies['Genres'].str.split('|')
all_genres = sorted(set(g for sublist in movies['Genres'] for g in sublist))
genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}

# Crear codificación multi-hot para cada película
def encode_genres(genres_list):
    multi_hot = np.zeros(len(genre_to_idx), dtype=np.float32)
    for genre in genres_list:
        if genre in genre_to_idx:
            multi_hot[genre_to_idx[genre]] = 1.0
    return multi_hot

movies['genre_vector'] = movies['Genres'].apply(encode_genres)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Fusionar ratings con info del usuario
ratings_full = ratings.merge(users, on="UserID")

# Codificar gender (F/M → 0/1 por ejemplo)
gender_encoder = LabelEncoder()
ratings_full['gender'] = gender_encoder.fit_transform(ratings_full['Gender'])

# Codificar age y occupation como categorías
age_encoder = LabelEncoder()
occ_encoder = LabelEncoder()

ratings_full['age'] = age_encoder.fit_transform(ratings_full['Age'])
ratings_full['occupation'] = occ_encoder.fit_transform(ratings_full['Occupation'])

# Mostrar las clases para documentar
print("Géneros:", gender_encoder.classes_)
print("Edades:", age_encoder.classes_)
print("Ocupaciones:", occ_encoder.classes_)

ratings_full[['user', 'movie', 'Rating', 'gender', 'age', 'occupation']].head()


Géneros: ['F' 'M']
Edades: [ 1 18 25 35 45 50 56]
Ocupaciones: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


Unnamed: 0,user,movie,Rating,gender,age,occupation
0,0,1104,5,0,0,10
1,0,639,3,0,0,10
2,0,853,3,0,0,10
3,0,3177,4,0,0,10
4,0,2162,5,0,0,10


In [5]:
ratings_full = ratings_full.merge(movies[['MovieID', 'genre_vector']], on='MovieID')

ratings_full.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,user,movie,Gender,Age,Occupation,Zip-code,gender,age,occupation,genre_vector
0,1,1193,5,978300760,0,1104,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,1,661,3,978302109,0,639,F,1,10,48067,0,0,10,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,914,3,978301968,0,853,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,3408,4,978300275,0,3177,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1,2355,5,978824291,0,2162,F,1,10,48067,0,0,10,"[0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [6]:
# Calcular media por usuario
user_mean_rating = ratings_full.groupby('user')['Rating'].mean().to_dict()

# Crear nueva columna con rating centrado
ratings_full['rating_centered'] = ratings_full.apply(
    lambda row: row['Rating'] - user_mean_rating[row['user']], axis=1
)


In [None]:
import torch.nn as nn

class FullHybridNCF_BN(nn.Module):
    def __init__(self, num_users, num_items, num_genders, num_ages, num_occs, num_genres,
                 embedding_dim=64, mlp_layers=[128, 16, 32], dropout=0.3):
        super(FullHybridNCF_BN, self).__init__()

        # Embeddings usuario-película
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)

        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)

        # Embeddings de metadata
        self.gender_embedding = nn.Embedding(num_genders, 4)
        self.age_embedding = nn.Embedding(num_ages, 4)
        self.occ_embedding = nn.Embedding(num_occs, 4)

        # Proyección de géneros (multi-hot → denso)
        self.genre_dense = nn.Linear(num_genres, 8)

        # MLP con BatchNorm
        mlp_input_size = 2 * embedding_dim + 3 * 4 + 8
        layers = []
        for layer_size in mlp_layers:
            layers.append(nn.Linear(mlp_input_size, layer_size))
            layers.append(nn.BatchNorm1d(layer_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            mlp_input_size = layer_size
        self.mlp = nn.Sequential(*layers)

        # Capa de salida
        self.final_layer = nn.Linear(embedding_dim + mlp_layers[-1], 1)

    def forward(self, user_ids, item_ids, gender_ids, age_ids, occ_ids, genre_vecs):
        # GMF
        gmf_user = self.user_embedding_gmf(user_ids)
        gmf_item = self.item_embedding_gmf(item_ids)
        gmf_output = gmf_user * gmf_item

        # MLP
        mlp_user = self.user_embedding_mlp(user_ids)
        mlp_item = self.item_embedding_mlp(item_ids)
        gender_emb = self.gender_embedding(gender_ids)
        age_emb = self.age_embedding(age_ids)
        occ_emb = self.occ_embedding(occ_ids)
        genre_proj = self.genre_dense(genre_vecs)

        mlp_input = torch.cat([mlp_user, mlp_item, gender_emb, age_emb, occ_emb, genre_proj], dim=-1)
        mlp_output = self.mlp(mlp_input)

        # Concatenar GMF + MLP
        final_input = torch.cat([gmf_output, mlp_output], dim=-1)
        output = self.final_layer(final_input)
        return output.squeeze()


In [8]:
from torch.utils.data import Dataset, DataLoader

class CenteredHybridDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['movie'].values, dtype=torch.long)
        self.centered_ratings = torch.tensor(df['rating_centered'].values, dtype=torch.float)
        self.genders = torch.tensor(df['gender'].values, dtype=torch.long)
        self.ages = torch.tensor(df['age'].values, dtype=torch.long)
        self.occs = torch.tensor(df['occupation'].values, dtype=torch.long)
        self.genres = torch.tensor(np.stack(df['genre_vector'].values), dtype=torch.float)
        self.user_means = torch.tensor(df['user'].map(user_mean_rating).values, dtype=torch.float)

    def __len__(self):
        return len(self.centered_ratings)

    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.items[idx],
            self.genders[idx],
            self.ages[idx],
            self.occs[idx],
            self.genres[idx],
            self.centered_ratings[idx],
            self.user_means[idx]
        )


In [11]:
from sklearn.model_selection import train_test_split

# Dividir de nuevo con ratings centrados
train_df, test_df = train_test_split(ratings_full, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Datasets y loaders con rating centrado
train_dataset = CenteredHybridDataset(train_df)
val_dataset = CenteredHybridDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)




In [20]:
def train_centered_model(model, train_loader, val_loader, epochs=10, lr=0.001, device='cpu', patience=5):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []

        for user, item, gender, age, occ, genre_vec, centered_rating, user_mean in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            user, item = user.to(device), item.to(device)
            gender, age, occ = gender.to(device), age.to(device), occ.to(device)
            genre_vec = genre_vec.to(device)
            centered_rating = centered_rating.to(device)
            user_mean = user_mean.to(device)

            optimizer.zero_grad()
            pred_deviation = model(user, item, gender, age, occ, genre_vec)
            pred_rating = pred_deviation + user_mean
            true_rating = centered_rating + user_mean
            loss = criterion(pred_rating, true_rating)

            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Validación
        model.eval()
        val_losses = []
        with torch.no_grad():
            for user, item, gender, age, occ, genre_vec, centered_rating, user_mean in val_loader:
                user, item = user.to(device), item.to(device)
                gender, age, occ = gender.to(device), age.to(device), occ.to(device)
                genre_vec = genre_vec.to(device)
                centered_rating = centered_rating.to(device)
                user_mean = user_mean.to(device)

                pred_deviation = model(user, item, gender, age, occ, genre_vec)
                pred_rating = pred_deviation + user_mean
                true_rating = centered_rating + user_mean
                val_loss = criterion(pred_rating, true_rating)
                val_losses.append(val_loss.item())

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        val_rmse = np.sqrt(avg_val_loss)

        print(f"Epoch {epoch+1}: Train MSE = {avg_train_loss:.4f}, Val MSE = {avg_val_loss:.4f}, Val RMSE = {val_rmse:.4f}")

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), "v9_best_model.pth")
            print("✅ Mejor modelo guardado.")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("🛑 Early stopping activado. No hay mejora en la validación.")
                break

    print("🎯 Entrenamiento finalizado.")


In [13]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [19]:
# Parámetros de dimensiones
num_genres = len(genre_to_idx)

# Inicialización del modelo completo
full_model = FullHybridNCF_BN(
    num_users=num_users,
    num_items=num_movies,
    num_genders=ratings_full['gender'].nunique(),
    num_ages=ratings_full['age'].nunique(),
    num_occs=ratings_full['occupation'].nunique(),
    num_genres=num_genres,
    embedding_dim=64,
    mlp_layers=[128, 16, 32],
    dropout=0.12174914809192425
).to(device)




Mejores hiperparámetros:
{'embedding_dim': 64, 
'dropout': 0.12174914809192425, 
'lr': 0.009686775467334081, 
'hidden_1': 128, 
'hidden_2': 16, 
'hidden_3': 32
}
Mejor Val RMSE: 0.8821

In [21]:
# Reutilizamos el modelo FullHybridNCF
train_centered_model(full_model, train_loader, val_loader, epochs=10, lr=0.001, device=device)


Epoch 1/10: 100%|██████████| 2814/2814 [00:27<00:00, 102.80it/s]


Epoch 1: Train MSE = 0.9400, Val MSE = 0.8638, Val RMSE = 0.9294
✅ Mejor modelo guardado.


Epoch 2/10: 100%|██████████| 2814/2814 [00:26<00:00, 107.96it/s]


Epoch 2: Train MSE = 0.8548, Val MSE = 0.8343, Val RMSE = 0.9134
✅ Mejor modelo guardado.


Epoch 3/10: 100%|██████████| 2814/2814 [00:27<00:00, 102.43it/s]


Epoch 3: Train MSE = 0.8207, Val MSE = 0.8140, Val RMSE = 0.9022
✅ Mejor modelo guardado.


Epoch 4/10: 100%|██████████| 2814/2814 [00:27<00:00, 103.30it/s]


Epoch 4: Train MSE = 0.7987, Val MSE = 0.8103, Val RMSE = 0.9001
✅ Mejor modelo guardado.


Epoch 5/10: 100%|██████████| 2814/2814 [00:27<00:00, 100.70it/s]


Epoch 5: Train MSE = 0.7809, Val MSE = 0.8073, Val RMSE = 0.8985
✅ Mejor modelo guardado.


Epoch 6/10: 100%|██████████| 2814/2814 [00:28<00:00, 99.46it/s] 


Epoch 6: Train MSE = 0.7629, Val MSE = 0.8075, Val RMSE = 0.8986


Epoch 7/10: 100%|██████████| 2814/2814 [00:27<00:00, 101.82it/s]


Epoch 7: Train MSE = 0.7422, Val MSE = 0.8229, Val RMSE = 0.9072


Epoch 8/10: 100%|██████████| 2814/2814 [00:28<00:00, 97.43it/s] 


Epoch 8: Train MSE = 0.7184, Val MSE = 0.8320, Val RMSE = 0.9122


Epoch 9/10: 100%|██████████| 2814/2814 [00:28<00:00, 99.00it/s] 


Epoch 9: Train MSE = 0.6923, Val MSE = 0.8389, Val RMSE = 0.9159


Epoch 10/10: 100%|██████████| 2814/2814 [00:28<00:00, 98.73it/s] 


Epoch 10: Train MSE = 0.6647, Val MSE = 0.8587, Val RMSE = 0.9266
🛑 Early stopping activado. No hay mejora en la validación.
🎯 Entrenamiento finalizado.


In [24]:
#load best model
full_model.load_state_dict(torch.load("v9_best_model.pth"))
full_model.eval()

FullHybridNCF_BN(
  (user_embedding_gmf): Embedding(6040, 64)
  (item_embedding_gmf): Embedding(3706, 64)
  (user_embedding_mlp): Embedding(6040, 64)
  (item_embedding_mlp): Embedding(3706, 64)
  (gender_embedding): Embedding(2, 4)
  (age_embedding): Embedding(7, 4)
  (occ_embedding): Embedding(21, 4)
  (genre_dense): Linear(in_features=18, out_features=8, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=148, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.12174914809192425, inplace=False)
    (4): Linear(in_features=128, out_features=16, bias=True)
    (5): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.12174914809192425, inplace=False)
    (8): Linear(in_features=16, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()

In [15]:
# Dataset para test con ratings normalizados
test_dataset = CenteredHybridDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=256)


In [16]:
import optuna
import torch.nn.functional as F
from torch.nn import MSELoss
from tqdm import tqdm

def objective(trial):
    # Hiperparámetros a optimizar
    embedding_dim = trial.suggest_categorical("embedding_dim", [32, 64, 128, 256])
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    hidden_1 = trial.suggest_categorical("hidden_1", [128, 64, 32])
    hidden_2 = trial.suggest_categorical("hidden_2", [64, 32, 16])
    hidden_3 = trial.suggest_categorical("hidden_3", [32, 16, 8])
    mlp_layers = [hidden_1, hidden_2, hidden_3]

    # Modelo
    model = FullHybridNCF_BN(
        num_users=num_users,
        num_items=num_movies,
        num_genders=ratings_full['gender'].nunique(),
        num_ages=ratings_full['age'].nunique(),
        num_occs=ratings_full['occupation'].nunique(),
        num_genres=len(genre_to_idx),
        embedding_dim=embedding_dim,
        mlp_layers=mlp_layers,
        dropout=dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = MSELoss()

    # Entrenamiento breve (por ejemplo 3 épocas para eficiencia)
    for epoch in range(3):
        model.train()
        for user, item, gender, age, occ, genre_vec, centered_rating, user_mean in train_loader:
            user, item = user.to(device), item.to(device)
            gender, age, occ = gender.to(device), age.to(device), occ.to(device)
            genre_vec = genre_vec.to(device)
            centered_rating = centered_rating.to(device)
            user_mean = user_mean.to(device)

            optimizer.zero_grad()
            pred_deviation = model(user, item, gender, age, occ, genre_vec)
            pred_rating = pred_deviation + user_mean
            true_rating = centered_rating + user_mean
            loss = criterion(pred_rating, true_rating)
            loss.backward()
            optimizer.step()

    # Validación RMSE real
    model.eval()
    val_losses = []
    with torch.no_grad():
        for user, item, gender, age, occ, genre_vec, centered_rating, user_mean in val_loader:
            user, item = user.to(device), item.to(device)
            gender, age, occ = gender.to(device), age.to(device), occ.to(device)
            genre_vec = genre_vec.to(device)
            centered_rating = centered_rating.to(device)
            user_mean = user_mean.to(device)

            pred_deviation = model(user, item, gender, age, occ, genre_vec)
            pred_rating = pred_deviation + user_mean
            true_rating = centered_rating + user_mean
            mse = F.mse_loss(pred_rating, true_rating, reduction='mean').item()
            val_losses.append(mse)

    return np.sqrt(np.mean(val_losses))  # Val RMSE


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # puedes aumentar a 50–100 si tienes tiempo


[I 2025-04-06 11:21:14,805] A new study created in memory with name: no-name-638da2ad-ea21-485c-814a-1c5bad16a08f
[I 2025-04-06 11:22:36,994] Trial 0 finished with value: 0.9147953333171137 and parameters: {'embedding_dim': 128, 'dropout': 0.3126001784669892, 'lr': 0.0004158338910806446, 'hidden_1': 64, 'hidden_2': 64, 'hidden_3': 8}. Best is trial 0 with value: 0.9147953333171137.
[I 2025-04-06 11:23:59,802] Trial 1 finished with value: 0.9012294176749325 and parameters: {'embedding_dim': 32, 'dropout': 0.1848550390749415, 'lr': 0.001535131268699841, 'hidden_1': 32, 'hidden_2': 32, 'hidden_3': 8}. Best is trial 1 with value: 0.9012294176749325.
[I 2025-04-06 11:25:21,912] Trial 2 finished with value: 0.9445262887121724 and parameters: {'embedding_dim': 128, 'dropout': 0.3724439061768585, 'lr': 0.00012221457053931107, 'hidden_1': 128, 'hidden_2': 64, 'hidden_3': 32}. Best is trial 1 with value: 0.9012294176749325.
[I 2025-04-06 11:26:41,299] Trial 3 finished with value: 0.9000528549371

In [18]:
print("Mejores hiperparámetros:")
print(study.best_params)
print(f"Mejor Val RMSE: {study.best_value:.4f}")


Mejores hiperparámetros:
{'embedding_dim': 64, 'dropout': 0.12174914809192425, 'lr': 0.009686775467334081, 'hidden_1': 128, 'hidden_2': 16, 'hidden_3': 32}
Mejor Val RMSE: 0.8821


In [22]:
def evaluate_centered_model(model, test_loader, device='cpu'):
    model.eval()
    model.to(device)

    all_preds = []
    all_true = []

    with torch.no_grad():
        for user, item, gender, age, occ, genre_vec, centered_rating, user_mean in test_loader:
            user, item = user.to(device), item.to(device)
            gender, age, occ = gender.to(device), age.to(device), occ.to(device)
            genre_vec = genre_vec.to(device)
            user_mean = user_mean.to(device)
            centered_rating = centered_rating.to(device)

            pred_deviation = model(user, item, gender, age, occ, genre_vec)
            pred_rating = pred_deviation + user_mean
            true_rating = centered_rating + user_mean

            all_preds.extend(pred_rating.cpu().numpy())
            all_true.extend(true_rating.cpu().numpy())

    all_preds = np.array(all_preds)
    all_true = np.array(all_true)

    mse = np.mean((all_preds - all_true) ** 2)
    rmse = np.sqrt(mse)
    print(f"Test RMSE: {rmse:.4f}")
    return rmse


In [25]:
evaluate_centered_model(model=full_model, test_loader=test_loader, device=device)


Test RMSE: 0.9051


np.float32(0.9051454)