In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import numpy as np


In [3]:
# Librerías
import pandas as pd
import numpy as np


movies_path = "ml-1m/movies.dat"
ratings_path = "ml-1m/ratings.dat"
users_path = "ml-1m/users.dat"


# Carga de los datos
users = pd.read_csv(users_path, sep="::", engine="python", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], encoding="latin-1")
movies = pd.read_csv(movies_path, sep="::", engine="python", names=["MovieID", "Title", "Genres"], encoding="latin-1")
ratings = pd.read_csv(ratings_path, sep="::", engine="python", names=["UserID", "MovieID", "Rating", "Timestamp"], encoding="latin-1")

# Mostrar primeras filas
ratings.head()



Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
class UserAutoRec(nn.Module):
    def __init__(self, num_items, hidden_dim=500, dropout=0.5):
        super(UserAutoRec, self).__init__()
        self.encoder = nn.Linear(num_items, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(hidden_dim, num_items)

    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        encoded = self.dropout(encoded)
        decoded = self.decoder(encoded)
        return decoded


In [5]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings['user'] = user_encoder.fit_transform(ratings['UserID'])
ratings['movie'] = movie_encoder.fit_transform(ratings['MovieID'])

# Guardamos número de usuarios y películas únicos
num_users = ratings['user'].nunique()
num_movies = ratings['movie'].nunique()

num_users, num_movies


(6040, 3706)

In [6]:
# Obtener lista única de géneros
movies['Genres'] = movies['Genres'].str.split('|')
all_genres = sorted(set(g for sublist in movies['Genres'] for g in sublist))
genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}

# Crear codificación multi-hot para cada película
def encode_genres(genres_list):
    multi_hot = np.zeros(len(genre_to_idx), dtype=np.float32)
    for genre in genres_list:
        if genre in genre_to_idx:
            multi_hot[genre_to_idx[genre]] = 1.0
    return multi_hot

movies['genre_vector'] = movies['Genres'].apply(encode_genres)


In [7]:
from sklearn.preprocessing import LabelEncoder

# Fusionar ratings con info del usuario
ratings_full = ratings.merge(users, on="UserID")

# Codificar gender (F/M → 0/1 por ejemplo)
gender_encoder = LabelEncoder()
ratings_full['gender'] = gender_encoder.fit_transform(ratings_full['Gender'])

# Codificar age y occupation como categorías
age_encoder = LabelEncoder()
occ_encoder = LabelEncoder()

ratings_full['age'] = age_encoder.fit_transform(ratings_full['Age'])
ratings_full['occupation'] = occ_encoder.fit_transform(ratings_full['Occupation'])

# Mostrar las clases para documentar
print("Géneros:", gender_encoder.classes_)
print("Edades:", age_encoder.classes_)
print("Ocupaciones:", occ_encoder.classes_)

ratings_full[['user', 'movie', 'Rating', 'gender', 'age', 'occupation']].head()


Géneros: ['F' 'M']
Edades: [ 1 18 25 35 45 50 56]
Ocupaciones: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


Unnamed: 0,user,movie,Rating,gender,age,occupation
0,0,1104,5,0,0,10
1,0,639,3,0,0,10
2,0,853,3,0,0,10
3,0,3177,4,0,0,10
4,0,2162,5,0,0,10


In [8]:
ratings_full = ratings_full.merge(movies[['MovieID', 'genre_vector']], on='MovieID')

ratings_full.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,user,movie,Gender,Age,Occupation,Zip-code,gender,age,occupation,genre_vector
0,1,1193,5,978300760,0,1104,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,1,661,3,978302109,0,639,F,1,10,48067,0,0,10,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,914,3,978301968,0,853,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,3408,4,978300275,0,3177,F,1,10,48067,0,0,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1,2355,5,978824291,0,2162,F,1,10,48067,0,0,10,"[0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
# Calcular media por usuario
user_mean_rating = ratings_full.groupby('user')['Rating'].mean().to_dict()

# Crear nueva columna con rating centrado
ratings_full['rating_centered'] = ratings_full.apply(
    lambda row: row['Rating'] - user_mean_rating[row['user']], axis=1
)


In [10]:
import torch
import numpy as np
import pandas as pd

# Crear una matriz usuario-item donde filas = usuarios, columnas = películas
num_users = ratings_full['user'].nunique()
num_items = ratings_full['movie'].nunique()

# Crear una matriz llena de ceros
interaction_matrix = np.zeros((num_users, num_items), dtype=np.float32)

# Rellenar con los ratings reales
for row in ratings_full.itertuples():
    interaction_matrix[row.user, row.movie] = row.Rating

# Convertir a tensor para usar en PyTorch
interaction_tensor = torch.tensor(interaction_matrix)

interaction_tensor.shape


torch.Size([6040, 3706])

In [11]:
class FullHybridNCF(nn.Module):
    def __init__(self, num_users, num_items, num_genders, num_ages, num_occs, num_genres,
                 embedding_dim=32, mlp_layers=[64, 32, 16], dropout=0.3):
        super(FullHybridNCF, self).__init__()

        # Embeddings usuario-película
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)

        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)

        # Embeddings para metadata usuario
        self.gender_embedding = nn.Embedding(num_genders, 4)
        self.age_embedding = nn.Embedding(num_ages, 4)
        self.occ_embedding = nn.Embedding(num_occs, 4)

        # Proyección densa para género multi-hot
        self.genre_dense = nn.Linear(num_genres, 8)

        # MLP
        mlp_input_size = 2 * embedding_dim + 3 * 4 + 8  # user + item + gender/age/occ + genre info
        mlp_layers_seq = []
        for layer_size in mlp_layers:
            mlp_layers_seq.append(nn.Linear(mlp_input_size, layer_size))
            mlp_layers_seq.append(nn.ReLU())
            mlp_layers_seq.append(nn.Dropout(dropout))
            mlp_input_size = layer_size
        self.mlp = nn.Sequential(*mlp_layers_seq)

        # Capa final combinada
        self.final_layer = nn.Linear(embedding_dim + mlp_layers[-1], 1)

    def forward(self, user_ids, item_ids, gender_ids, age_ids, occ_ids, genre_vecs):
        # GMF
        gmf_user = self.user_embedding_gmf(user_ids)
        gmf_item = self.item_embedding_gmf(item_ids)
        gmf_output = gmf_user * gmf_item

        # MLP con metadata
        mlp_user = self.user_embedding_mlp(user_ids)
        mlp_item = self.item_embedding_mlp(item_ids)
        gender_emb = self.gender_embedding(gender_ids)
        age_emb = self.age_embedding(age_ids)
        occ_emb = self.occ_embedding(occ_ids)
        genre_proj = self.genre_dense(genre_vecs)

        mlp_input = torch.cat([mlp_user, mlp_item, gender_emb, age_emb, occ_emb, genre_proj], dim=-1)
        mlp_output = self.mlp(mlp_input)

        # Concatenar GMF y MLP
        final_input = torch.cat([gmf_output, mlp_output], dim=-1)
        output = self.final_layer(final_input)
        return output.squeeze()


In [12]:
from torch.utils.data import Dataset, DataLoader

class AutoRecDataset(Dataset):
    def __init__(self, rating_matrix):
        self.ratings = rating_matrix
        self.masks = (rating_matrix > 0).float()  # Máscara de entradas válidas

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        return self.ratings[idx], self.masks[idx]


In [13]:
from sklearn.model_selection import train_test_split

# Dataset exclusivo para AutoRec
autorec_dataset = AutoRecDataset(interaction_tensor)

# Split (train/val)
train_size = int(0.8 * len(autorec_dataset))
val_size = len(autorec_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(autorec_dataset, [train_size, val_size])

# Loaders
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)





In [14]:
def train_autorec(model, train_loader, val_loader, epochs=10, lr=0.001, device='cpu'):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    criterion = torch.nn.MSELoss(reduction='none')  # necesitamos controlar qué partes del vector evaluar

    for epoch in range(epochs):
        model.train()
        train_losses = []

        for ratings, masks in train_loader:
            ratings, masks = ratings.to(device), masks.to(device)
            outputs = model(ratings)
            loss_matrix = criterion(outputs, ratings)
            masked_loss = (loss_matrix * masks).sum() / masks.sum()

            optimizer.zero_grad()
            masked_loss.backward()
            optimizer.step()
            train_losses.append(masked_loss.item())

        # Validación
        model.eval()
        val_losses = []
        with torch.no_grad():
            for ratings, masks in val_loader:
                ratings, masks = ratings.to(device), masks.to(device)
                outputs = model(ratings)
                loss_matrix = criterion(outputs, ratings)
                masked_loss = (loss_matrix * masks).sum() / masks.sum()
                val_losses.append(masked_loss.item())

        print(f"Epoch {epoch+1}: Train RMSE = {np.sqrt(np.mean(train_losses)):.4f}, Val RMSE = {np.sqrt(np.mean(val_losses)):.4f}")


In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [26]:
# Modelo con hiperparámetros óptimos
best_model = UserAutoRec(num_items=num_items, hidden_dim=805, dropout=0.1022).to(device)

# Optimizador con weight decay y learning rate óptimos
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.0014, weight_decay=9.43e-5)

# Entrenar el modelo
train_autorec(
    model=best_model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=30,
    lr=0.0014,
    device=device
)


Epoch 1: Train RMSE = 2.4706, Val RMSE = 1.6833
Epoch 2: Train RMSE = 1.5384, Val RMSE = 1.3731
Epoch 3: Train RMSE = 1.2890, Val RMSE = 1.2605
Epoch 4: Train RMSE = 1.1971, Val RMSE = 1.2164
Epoch 5: Train RMSE = 1.0971, Val RMSE = 1.1796
Epoch 6: Train RMSE = 1.0467, Val RMSE = 1.1616
Epoch 7: Train RMSE = 1.0096, Val RMSE = 1.1649
Epoch 8: Train RMSE = 0.9859, Val RMSE = 1.1557
Epoch 9: Train RMSE = 0.9629, Val RMSE = 1.1709
Epoch 10: Train RMSE = 0.9274, Val RMSE = 1.1257
Epoch 11: Train RMSE = 0.9240, Val RMSE = 1.1181
Epoch 12: Train RMSE = 0.8912, Val RMSE = 1.1135
Epoch 13: Train RMSE = 0.8672, Val RMSE = 1.1195
Epoch 14: Train RMSE = 0.8982, Val RMSE = 1.1363
Epoch 15: Train RMSE = 0.8709, Val RMSE = 1.1064
Epoch 16: Train RMSE = 0.8639, Val RMSE = 1.1196
Epoch 17: Train RMSE = 0.8764, Val RMSE = 1.0970
Epoch 18: Train RMSE = 0.8274, Val RMSE = 1.0910
Epoch 19: Train RMSE = 0.7867, Val RMSE = 1.1055
Epoch 20: Train RMSE = 0.7541, Val RMSE = 1.0883
Epoch 21: Train RMSE = 0.7393

In [23]:
import optuna

def objective(trial):
    # Hiperparámetros a buscar
    hidden_dim = trial.suggest_int("hidden_dim", 100, 1000)
    dropout = trial.suggest_float("dropout", 0.1, 0.7)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    # Modelo
    model = UserAutoRec(num_items=num_items, hidden_dim=hidden_dim, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.MSELoss(reduction='none')

    # Entrenamiento (solo unas pocas épocas para rapidez)
    for epoch in range(5):  # puedes subirlo más si quieres
        model.train()
        for ratings, masks in train_loader:
            ratings, masks = ratings.to(device), masks.to(device)
            outputs = model(ratings)
            loss_matrix = criterion(outputs, ratings)
            loss = (loss_matrix * masks).sum() / masks.sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluación en val
    model.eval()
    total_loss, total_count = 0, 0
    with torch.no_grad():
        for ratings, masks in val_loader:
            ratings, masks = ratings.to(device), masks.to(device)
            outputs = model(ratings)
            loss_matrix = criterion(outputs, ratings)
            total_loss += (loss_matrix * masks).sum().item()
            total_count += masks.sum().item()

    rmse = np.sqrt(total_loss / total_count)
    return rmse


  from .autonotebook import tqdm as notebook_tqdm


In [24]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # Puedes subir a 50+ para más calidad


[I 2025-04-05 13:33:38,147] A new study created in memory with name: no-name-37fd783b-5dfd-4987-ba47-2ef9c2190916
[I 2025-04-05 13:33:38,937] Trial 0 finished with value: 1.2947954868808795 and parameters: {'hidden_dim': 989, 'dropout': 0.6241261028857701, 'weight_decay': 3.161644711833387e-05, 'lr': 0.0009572100211210147}. Best is trial 0 with value: 1.2947954868808795.
[I 2025-04-05 13:33:39,632] Trial 1 finished with value: 1.2698513392705157 and parameters: {'hidden_dim': 540, 'dropout': 0.46976185882164456, 'weight_decay': 0.00016007989798987686, 'lr': 0.0006347021940243749}. Best is trial 1 with value: 1.2698513392705157.
[I 2025-04-05 13:33:40,455] Trial 2 finished with value: 1.4434434588852854 and parameters: {'hidden_dim': 784, 'dropout': 0.43666374587598833, 'weight_decay': 7.0302816245178206e-06, 'lr': 0.00019480885697191228}. Best is trial 1 with value: 1.2698513392705157.
[I 2025-04-05 13:33:41,161] Trial 3 finished with value: 1.3294331866346671 and parameters: {'hidden_

In [25]:
print("Mejores hiperparámetros:")
print(study.best_params)
print(f"Mejor RMSE en validación: {study.best_value:.4f}")


Mejores hiperparámetros:
{'hidden_dim': 805, 'dropout': 0.10217863295203225, 'weight_decay': 9.431471314587107e-05, 'lr': 0.0013953148195708519}
Mejor RMSE en validación: 1.1600


In [21]:
# Obtener directamente el test set desde el split original
test_dataset = val_dataset  # si estás usando el 80/20 split como train/val/test

test_loader = DataLoader(test_dataset, batch_size=256)


In [22]:
def evaluate_autorec(model, test_loader, device='cpu'):
    model.eval()
    model.to(device)

    total_loss = 0
    total_count = 0
    criterion = torch.nn.MSELoss(reduction='none')

    with torch.no_grad():
        for ratings, masks in test_loader:
            ratings, masks = ratings.to(device), masks.to(device)
            outputs = model(ratings)
            loss_matrix = criterion(outputs, ratings)
            masked_loss = (loss_matrix * masks)
            total_loss += masked_loss.sum().item()
            total_count += masks.sum().item()

    rmse = np.sqrt(total_loss / total_count)
    print(f"Test RMSE: {rmse:.4f}")
    return rmse


In [27]:
evaluate_autorec(model=autorec_model, test_loader=test_loader, device=device)


Test RMSE: 1.0952


np.float64(1.0951627780456505)

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.scatterplot(x=hybrid_truth, y=hybrid_preds, alpha=0.3)
plt.xlabel("Rating real")
plt.ylabel("Rating predicho (modelo híbrido)")
plt.title("Ratings reales vs predichos - Modelo Híbrido")
plt.grid(True)
plt.show()


NameError: name 'hybrid_truth' is not defined

<Figure size 800x600 with 0 Axes>