In [26]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


import torch.nn as nn
import torch.optim as optim

import numpy as np

#importar F.leaky_relu
import torch.nn.functional as F
import matplotlib.pyplot as plt

## Import the movieLens dataset




In [27]:
import pandas as pd

# Rutas de los archivos
movies_path = "ml-1m/movies.dat"
ratings_path = "ml-1m/ratings.dat"
users_path = "ml-1m/users.dat"

# Carga de los datos
users = pd.read_csv(users_path, sep="::", engine="python", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], encoding="latin-1")
movies = pd.read_csv(movies_path, sep="::", engine="python", names=["MovieID", "Title", "Genres"], encoding="latin-1")
ratings = pd.read_csv(ratings_path, sep="::", engine="python", names=["UserID", "MovieID", "Rating", "Timestamp"], encoding="latin-1")

# Mostrar primeras filas para verificar
print("Ratings:")
print(ratings.head())
print("\nMovies:")
print(movies.head())
print("\nUsers:")
print(users.head())


Ratings:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Movies:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Users:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


In [28]:
# Convertir géneros a lista de géneros
movies['Genres'] = movies['Genres'].apply(lambda x: x.split('|'))

# Verificar cambios
print(movies.head())


   MovieID                               Title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                             Genres  
0   [Animation, Children's, Comedy]  
1  [Adventure, Children's, Fantasy]  
2                 [Comedy, Romance]  
3                   [Comedy, Drama]  
4                          [Comedy]  


In [29]:
# Codificar los UserID y MovieID a índices consecutivos
user2idx = {user_id: idx for idx, user_id in enumerate(users["UserID"].unique())}
movie2idx = {movie_id: idx for idx, movie_id in enumerate(movies["MovieID"].unique())}

# Aplicar el mapeo a ratings
ratings["UserID"] = ratings["UserID"].map(user2idx)
ratings["MovieID"] = ratings["MovieID"].map(movie2idx)


# Guardar número total de usuarios y películas
num_users = len(user2idx)
num_movies = len(movie2idx)

print(f"Total usuarios: {num_users}, Total películas: {num_movies}")
print(ratings.head())




Total usuarios: 6040, Total películas: 3883
   UserID  MovieID  Rating  Timestamp
0       0     1176       5  978300760
1       0      655       3  978302109
2       0      902       3  978301968
3       0     3339       4  978300275
4       0     2286       5  978824291


## Dividir en Train / Validation / Test

    Train (70%) → Para entrenar el modelo.

    Validation (15%) → Para ajustar hiperparámetros.

    Test (15%) → Para evaluar el modelo final.

In [30]:
from sklearn.model_selection import train_test_split

# Dividir los datos en conjunto de entrenamiento (70%), validación (15%) y prueba (15%)
train_data, temp_data = train_test_split(ratings, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Tamaño Train: {len(train_data)}, Validación: {len(val_data)}, Test: {len(test_data)}")


Tamaño Train: 700146, Validación: 150031, Test: 150032


## Crear PyTorch Dataset y DataLoader



In [31]:
import torch
from torch.utils.data import Dataset, DataLoader

# Dataset personalizado para MovieLens 1M
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df["UserID"].values, dtype=torch.long)
        self.movies = torch.tensor(df["MovieID"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["Rating"].values, dtype=torch.float32)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

# Crear datasets para entrenamiento, validación y prueba
train_dataset = MovieLensDataset(train_data)
val_dataset = MovieLensDataset(val_data)
test_dataset = MovieLensDataset(test_data)

# Crear DataLoaders para cargar los datos en lotes
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Verificar que los DataLoaders funcionan correctamente
for users, movies, ratings in train_loader:
    print("Batch de usuarios:", users[:5])
    print("Batch de películas:", movies[:5])
    print("Batch de ratings:", ratings[:5])
    break


Batch de usuarios: tensor([4479, 4504, 3490, 2319, 3609])
Batch de películas: tensor([2338, 1906, 2204, 1503, 1797])
Batch de ratings: tensor([3., 4., 3., 4., 4.])


In [32]:
class ImprovedRecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(ImprovedRecommenderNet, self).__init__()

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Inicialización de pesos mejorada
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.movie_embedding.weight)

        # Capas densas con más capacidad y regularización
        self.fc1 = nn.Linear(embedding_dim * 2, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.3)

        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)

    def forward(self, user_ids, movie_ids):
        user_vec = self.user_embedding(user_ids)
        movie_vec = self.movie_embedding(movie_ids)
        x = torch.cat([user_vec, movie_vec], dim=1)

        x = self.dropout1(F.leaky_relu(self.bn1(self.fc1(x))))
        x = self.dropout2(F.leaky_relu(self.bn2(self.fc2(x))))
        x = F.leaky_relu(self.fc3(x))
        x = self.output(x)
        return x.squeeze()


Entrenar el Modelo


In [33]:
import torch.optim as optim
from sklearn.metrics import mean_squared_error

# Inicializamos el modelo
model = ImprovedRecommenderNet(num_users, num_movies, embedding_dim=64)

# Definir la función de pérdida y el optimizador
criterion = nn.MSELoss()  # Error cuadrático medio
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)

# Función de entrenamiento con Early Stopping
def train_model_early_stopping(model, train_loader, val_loader, criterion, optimizer, epochs=30, patience=5):
    best_val_loss = float('inf')  # Mantener la mejor pérdida de validación
    epochs_without_improvement = 0  # Contador de épocas sin mejora
    
    for epoch in range(epochs):
        model.train()  # Poner el modelo en modo de entrenamiento
        running_loss = 0.0
        
        for users, movies, ratings in train_loader:
            optimizer.zero_grad()  # Limpiar los gradientes
            
            # Hacer las predicciones
            predictions = model(users, movies)
            
            # Calcular la pérdida
            loss = criterion(predictions, ratings)
            
            # Hacer backpropagation y actualizar los pesos
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        # Promediar la pérdida del entrenamiento
        avg_train_loss = running_loss / len(train_loader)
        
        # Validación
        model.eval()  # Modo evaluación
        val_loss = 0.0
        
        with torch.no_grad():
            for users, movies, ratings in val_loader:
                predictions = model(users, movies)
                loss = criterion(predictions, ratings)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        
        # Imprimir resultados
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Validation Loss: {avg_val_loss:.4f}")
        
        # Guardar el modelo si la pérdida de validación mejora
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print("Mejor modelo guardado.")
            epochs_without_improvement = 0  # Resetear el contador
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("No hubo mejora en la pérdida de validación, deteniendo el entrenamiento.")
                break
    
    print("Entrenamiento completado.")
    return model


# Entrenar el modelo
trained_model = train_model_early_stopping(model, train_loader, val_loader, criterion, optimizer, epochs=30)


Epoch 1/30 - Train Loss: 0.9849 - Validation Loss: 0.8676
Mejor modelo guardado.
Epoch 2/30 - Train Loss: 0.8401 - Validation Loss: 0.8172
Mejor modelo guardado.
Epoch 3/30 - Train Loss: 0.8084 - Validation Loss: 0.8050
Mejor modelo guardado.
Epoch 4/30 - Train Loss: 0.7930 - Validation Loss: 0.7947
Mejor modelo guardado.
Epoch 5/30 - Train Loss: 0.7824 - Validation Loss: 0.7940
Mejor modelo guardado.
Epoch 6/30 - Train Loss: 0.7744 - Validation Loss: 0.7836
Mejor modelo guardado.
Epoch 7/30 - Train Loss: 0.7648 - Validation Loss: 0.7734
Mejor modelo guardado.
Epoch 8/30 - Train Loss: 0.7554 - Validation Loss: 0.7734
Epoch 9/30 - Train Loss: 0.7470 - Validation Loss: 0.7762
Epoch 10/30 - Train Loss: 0.7405 - Validation Loss: 0.7639
Mejor modelo guardado.
Epoch 11/30 - Train Loss: 0.7352 - Validation Loss: 0.7689
Epoch 12/30 - Train Loss: 0.7308 - Validation Loss: 0.7621
Mejor modelo guardado.
Epoch 13/30 - Train Loss: 0.7250 - Validation Loss: 0.7635
Epoch 14/30 - Train Loss: 0.7217 - 

In [36]:
#cargar el modelo guardado best_model.pth

best_model = ImprovedRecommenderNet(num_users, num_movies, embedding_dim=64)
best_model.load_state_dict(torch.load('best_model.pth'))
best_model.eval()  # Modo evaluación



ImprovedRecommenderNet(
  (user_embedding): Embedding(6040, 64)
  (movie_embedding): Embedding(3883, 64)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=1, bias=True)
)

In [40]:
def evaluate_model(model, test_loader, criterion):
    model.eval()  # Establecer el modelo en modo evaluación
    test_loss = 0.0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for users, movies, ratings in test_loader:
            # Hacer las predicciones
            predictions = model(users, movies)
            
            # Calcular la pérdida
            loss = criterion(predictions, ratings)
            test_loss += loss.item()
            
            # Almacenar las predicciones y las etiquetas reales para calcular métricas
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(ratings.cpu().numpy())
    
    avg_test_loss = test_loss / len(test_loader)
    mse = mean_squared_error(all_labels, all_preds)
    print(f"Test Loss: {avg_test_loss:.4f} - MSE: {mse:.4f}")
    
    return mse

# Evaluar el modelo
test_mse = evaluate_model(trained_model, test_loader, criterion)


Test Loss: 0.7511 - MSE: 0.7509


In [43]:
#calcular RMSE

rmse = np.sqrt(test_mse)

print(f"RMSE: {rmse:.4f}")

RMSE: 0.8666
