In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error


## Import the movieLens dataset




In [6]:
import pandas as pd

# Rutas de los archivos
movies_path = "../ml-1m/movies.dat"
ratings_path = "../ml-1m/ratings.dat"
users_path = "../ml-1m/users.dat"

# Carga de los datos
# Cargar archivos
users = pd.read_csv("../ml-1m/users.dat", sep="::", engine="python", 
                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], encoding="latin-1")

movies = pd.read_csv("../ml-1m/movies.dat", sep="::", engine="python", 
                     names=["MovieID", "Title", "Genres"], encoding="latin-1")

ratings = pd.read_csv("../ml-1m/ratings.dat", sep="::", engine="python", 
                      names=["UserID", "MovieID", "Rating", "Timestamp"], encoding="latin-1")

# Mostrar primeras filas para verificar
print("Ratings:")
print(ratings.head())
print("\nMovies:")
print(movies.head())
print("\nUsers:")
print(users.head())


Ratings:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Movies:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Users:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


In [7]:
# Normalizar ratings de 1-5 a 0-1
ratings["Rating"] = (ratings["Rating"] - 1.0) / 4.0

# Convertir géneros a listas
movies["Genres"] = movies["Genres"].apply(lambda x: x.split("|"))

# Codificar IDs
user2idx = {user_id: idx for idx, user_id in enumerate(users["UserID"].unique())}
movie2idx = {movie_id: idx for idx, movie_id in enumerate(movies["MovieID"].unique())}

ratings["UserID"] = ratings["UserID"].map(user2idx)
ratings["MovieID"] = ratings["MovieID"].map(movie2idx)

num_users = len(user2idx)
num_movies = len(movie2idx)

print(f"Total usuarios: {num_users}, Total películas: {num_movies}")


Total usuarios: 6040, Total películas: 3883


## Dividir en Train / Validation / Test

    Train (70%) → Para entrenar el modelo.

    Validation (15%) → Para ajustar hiperparámetros.

    Test (15%) → Para evaluar el modelo final.

In [8]:
# División: 70% Train, 15% Val, 15% Test
train_data, temp_data = train_test_split(ratings, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Tamaño Train: {len(train_data)}, Validación: {len(val_data)}, Test: {len(test_data)}")


Tamaño Train: 700146, Validación: 150031, Test: 150032


## Crear PyTorch Dataset y DataLoader



In [9]:
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df["UserID"].values, dtype=torch.long)
        self.movies = torch.tensor(df["MovieID"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["Rating"].values, dtype=torch.float32)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

# Instanciar datasets
train_dataset = MovieLensDataset(train_data)
val_dataset = MovieLensDataset(val_data)
test_dataset = MovieLensDataset(test_data)

# Loaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [10]:
class ImprovedRecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(ImprovedRecommenderNet, self).__init__()

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Inicialización de pesos mejorada
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.movie_embedding.weight)

        # Capas densas con más capacidad y regularización
        self.fc1 = nn.Linear(embedding_dim * 2, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.3)

        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)

    def forward(self, user_ids, movie_ids):
        user_vec = self.user_embedding(user_ids)
        movie_vec = self.movie_embedding(movie_ids)
        x = torch.cat([user_vec, movie_vec], dim=1)

        x = self.dropout1(F.leaky_relu(self.bn1(self.fc1(x))))
        x = self.dropout2(F.leaky_relu(self.bn2(self.fc2(x))))
        x = F.leaky_relu(self.fc3(x))
        x = self.output(x)
        return x.squeeze()


Entrenar el Modelo


In [11]:
class DeeperRecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=128):
        super(DeeperRecommenderNet, self).__init__()

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Inicialización Xavier
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.movie_embedding.weight)

        self.fc1 = nn.Linear(embedding_dim * 2, 512)
        self.dropout1 = nn.Dropout(0.4)

        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(256, 128)
        self.dropout3 = nn.Dropout(0.2)

        self.output = nn.Linear(128, 1)

    def forward(self, user_ids, movie_ids):
        user_vec = self.user_embedding(user_ids)
        movie_vec = self.movie_embedding(movie_ids)

        x = torch.cat([user_vec, movie_vec], dim=1)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)

        x = F.relu(self.fc2(x))
        x = self.dropout2(x)

        x = F.relu(self.fc3(x))
        x = self.dropout3(x)

        x = self.output(x)
        return x.squeeze()


In [12]:
# Crear instancia del modelo
model = DeeperRecommenderNet(num_users, num_movies, embedding_dim=128)

# Función de pérdida
criterion = nn.MSELoss()

# Optimizador con weight decay para regularización L2
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)


In [13]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50, patience=5, clip_value=1.0):
    best_val_loss = float('inf')
    epochs_without_improvement = 0
    
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        
        for users, movies, ratings in train_loader:
            optimizer.zero_grad()
            predictions = model(users, movies)
            loss = criterion(predictions, ratings)
            loss.backward()
            
            # Clip de gradientes
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_value)
            
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)

        # Validación
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for users, movies, ratings in val_loader:
                predictions = model(users, movies)
                loss = criterion(predictions, ratings)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pth")
            print("🟢 Mejor modelo guardado.")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("🛑 Early stopping: no mejora en validación.")
                break

    print("✅ Entrenamiento finalizado.")
    return model


In [16]:
# Entrenar
trained_model = train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50)


Epoch 1/50 | Train Loss: 0.0590 | Val Loss: 0.0536
🟢 Mejor modelo guardado.


KeyboardInterrupt: 

In [21]:
# Crear una nueva instancia y cargar pesos
best_model = DeeperRecommenderNet(num_users, num_movies, embedding_dim=128)
best_model.load_state_dict(torch.load("best_model.pth"))
best_model.eval()


DeeperRecommenderNet(
  (user_embedding): Embedding(6040, 128)
  (movie_embedding): Embedding(3883, 128)
  (fc1): Linear(in_features=256, out_features=512, bias=True)
  (dropout1): Dropout(p=0.4, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for users, movies, ratings in test_loader:
            predictions = model(users, movies)
            loss = criterion(predictions, ratings)
            test_loss += loss.item()

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(ratings.cpu().numpy())

    avg_loss = test_loss / len(test_loader)

    # Convertir a numpy arrays
    all_preds = np.clip(np.array(all_preds), 0.0, 1.0)  # clipping
    all_labels = np.array(all_labels)

    # Escala original [1–5]
    preds_original = all_preds * 4 + 1
    labels_original = all_labels * 4 + 1

    mse = mean_squared_error(labels_original, preds_original)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels_original, preds_original)

    print(f"🔍 Test MSE (1-5): {mse:.4f}")
    print(f"📉 Test RMSE (1-5): {rmse:.4f}")
    print(f"📊 Test MAE  (1-5): {mae:.4f}")
    
    return mse, rmse, mae


In [None]:
mse, rmse, mae = evaluate_model(best_model, test_loader, criterion)


🔍 Test MSE (1-5): 0.7963
📉 Test RMSE (1-5): 0.8923
📊 Test MAE  (1-5): 0.7071


In [25]:
def recommend_movies(user_original_id, model, movies_df, ratings_df, user2idx, movie2idx, top_k=10):
    model.eval()

    # Convert original user ID to internal index
    user_idx = user2idx.get(user_original_id)
    if user_idx is None:
        print("⚠️ User not found.")
        return

    # Get movies the user has already rated
    seen_movies = ratings_df[ratings_df["UserID"] == user_idx]["MovieID"].values

    # List of movies the user hasn't seen yet
    unseen_movies = [mid for mid in movie2idx.values() if mid not in seen_movies]

    # Create tensors
    user_tensor = torch.tensor([user_idx] * len(unseen_movies), dtype=torch.long)
    movie_tensor = torch.tensor(unseen_movies, dtype=torch.long)

    # Predict ratings
    with torch.no_grad():
        predictions = model(user_tensor, movie_tensor)
        predictions = torch.clamp(predictions, 0.0, 1.0)  # ensure in [0, 1]
        predicted_scores = predictions.numpy() * 4 + 1  # rescale to [1, 5]

    # Get top K movie indices
    top_indices = np.argsort(predicted_scores)[-top_k:][::-1]
    top_movie_ids = [
        list(movie2idx.keys())[list(movie2idx.values()).index(unseen_movies[i])]
        for i in top_indices
    ]
    top_scores = [predicted_scores[i] for i in top_indices]

    # Print recommendations
    print(f"\n🎬 Top {top_k} movie recommendations for user {user_original_id}:\n")
    for title, score in zip(movies_df[movies_df["MovieID"].isin(top_movie_ids)]["Title"], top_scores):
        print(f"⭐ {title} - Predicted Rating: {score:.2f}")


In [26]:
recommend_movies(75, best_model, movies, ratings, user2idx, movie2idx)



🎬 Top 10 movie recommendations for user 75:

⭐ Usual Suspects, The (1995) - Predicted Rating: 4.89
⭐ Shawshank Redemption, The (1994) - Predicted Rating: 4.83
⭐ Schindler's List (1993) - Predicted Rating: 4.78
⭐ Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) - Predicted Rating: 4.78
⭐ Godfather, The (1972) - Predicted Rating: 4.78
⭐ Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) - Predicted Rating: 4.73
⭐ Raiders of the Lost Ark (1981) - Predicted Rating: 4.73
⭐ To Kill a Mockingbird (1962) - Predicted Rating: 4.73
⭐ Third Man, The (1949) - Predicted Rating: 4.73
⭐ Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Predicted Rating: 4.71
