
In this Jupyter Notebook, 1 will develop a movie recommendation system that leverages deep learning techniques to predict user preferences based on their past ratings.

I will cover the following steps:

1. **Download and preprocess the MovieLens dataset**

2. **Split the dataset into training, validation, and testing sets**

3. **Implement a neural network architecture for the recommendation system**
    
4. **Train and evaluate the model using different metrics**

So first of all, I will load the data in the different variables links, movies, ratings and tags from the .csv.

I will also display the first 5 rows of each one with the head() function and some metrics with the describe() function.

In [21]:
import pandas as pd

#MOVIES DATASET
movies = pd.read_csv("ml-latest-small/movies.csv")
print("Summary of the dataset: \n", movies.describe())
print("---------------------------------------------")
print("First 5 rows: \n", movies.head())

Summary of the dataset: 
              movieId
count    9742.000000
mean    42200.353623
std     52160.494854
min         1.000000
25%      3248.250000
50%      7300.000000
75%     76232.000000
max    193609.000000
---------------------------------------------
First 5 rows: 
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [22]:
#RATINGS DATASET
ratings = pd.read_csv("ml-latest-small/ratings.csv")
print("Summary of the dataset: \n",ratings.describe())
print("---------------------------------------------")
print("First 5 rows: \n", ratings.head())

Summary of the dataset: 
               userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.000000    8122.000000       4.000000  1.435994e+09
max       610.000000  193609.000000       5.000000  1.537799e+09
---------------------------------------------
First 5 rows: 
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [23]:
#TAGS DATASET
tags = pd.read_csv("ml-latest-small/tags.csv")
print("Summary of the dataset: \n",tags.describe())
print("---------------------------------------------")
print("First 5 rows: \n", tags.head())

Summary of the dataset: 
             userId        movieId     timestamp
count  3683.000000    3683.000000  3.683000e+03
mean    431.149335   27252.013576  1.320032e+09
std     158.472553   43490.558803  1.721025e+08
min       2.000000       1.000000  1.137179e+09
25%     424.000000    1262.500000  1.137521e+09
50%     474.000000    4454.000000  1.269833e+09
75%     477.000000   39263.000000  1.498457e+09
max     610.000000  193565.000000  1.537099e+09
---------------------------------------------
First 5 rows: 
    userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200


In [24]:
#LINKS DATASET
links = pd.read_csv("ml-latest-small/links.csv")
print("Summary of the dataset: \n",links.describe())
print("---------------------------------------------")
print("First 5 rows: \n", links.head())

Summary of the dataset: 
              movieId        imdbId         tmdbId
count    9742.000000  9.742000e+03    9734.000000
mean    42200.353623  6.771839e+05   55162.123793
std     52160.494854  1.107228e+06   93653.481487
min         1.000000  4.170000e+02       2.000000
25%      3248.250000  9.518075e+04    9665.500000
50%      7300.000000  1.672605e+05   16529.000000
75%     76232.000000  8.055685e+05   44205.750000
max    193609.000000  8.391976e+06  525662.000000
---------------------------------------------
First 5 rows: 
    movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


# **FIRST MODEL**
# Using only original data (genres and ratings)


In [25]:
movies["genres"] = movies["genres"].fillna("").astype(str)
movies["genres"] = movies["genres"].str.split('|')
moviesExploded = movies.explode("genres")
movies_dummies = pd.get_dummies(moviesExploded["genres"], prefix="", prefix_sep="", dtype=int)
movies_dummies = moviesExploded[["movieId"]].join(movies_dummies).groupby("movieId").max()
movies_final = movies.drop(columns=["genres"]).merge(movies_dummies, on="movieId")
print(movies_final.head())

ratings["rating"] = ratings["rating"].astype(int)
print(ratings.head())

links["tmdbId"] = links["tmdbId"].fillna(-1).astype(int)
print(links.head())

print(tags.head())

   movieId                               title  (no genres listed)  Action  \
0        1                    Toy Story (1995)                   0       0   
1        2                      Jumanji (1995)                   0       0   
2        3             Grumpier Old Men (1995)                   0       0   
3        4            Waiting to Exhale (1995)                   0       0   
4        5  Father of the Bride Part II (1995)                   0       0   

   Adventure  Animation  Children  Comedy  Crime  Documentary  ...  Film-Noir  \
0          1          1         1       1      0            0  ...          0   
1          1          0         1       0      0            0  ...          0   
2          0          0         0       1      0            0  ...          0   
3          0          0         0       1      0            0  ...          0   
4          0          0         0       1      0            0  ...          0   

   Horror  IMAX  Musical  Mystery  Romance  

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np

df = ratings.merge(movies_final, on="movieId", how="left")

df2 = df.copy()
numerical_col = [col for col in df2.columns if col not in ["title"]]
df_shuffle = df2.sample(frac=1, random_state=123).drop(columns=["timestamp", "title"])

df_train = df_shuffle.iloc[:int(len(df_shuffle) * 0.8), :]
df_val = df_shuffle.iloc[int(len(df_shuffle) * 0.8):int(len(df_shuffle) * 0.9), :]
df_test = df_shuffle.iloc[int(len(df_shuffle) * 0.9):, :]

scalers = {}

feature_cols = [col for col in df_shuffle.columns if col not in ["rating", "timestamp", "title"]]
x_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_val, y_val = df_val[feature_cols].to_numpy(dtype=np.float32), df_val["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_test, y_test = df_test[feature_cols].to_numpy(dtype=np.float32), df_test["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

x_train, y_train = torch.tensor(x_train), torch.tensor(y_train).float()
x_val, y_val = torch.tensor(x_val), torch.tensor(y_val).float()
x_test, y_test = torch.tensor(x_test), torch.tensor(y_test).float()

(80668, 22) (80668, 1)
(10084, 22) (10084, 1)
(10084, 22) (10084, 1)


In [27]:
import pandas as pd

avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")
avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_train = df_train.merge(avg_movie_rating, on="movieId", how="left")
df_train = df_train.merge(count_movie_rating, on="movieId", how="left")
df_train = df_train.merge(std_movie_rating, on="movieId", how="left")
df_train = df_train.merge(avg_user_rating, on="userId", how="left")
df_train = df_train.merge(count_user_rating, on="userId", how="left")
df_train = df_train.merge(std_user_rating, on="userId", how="left")

df_train["count_movie_rating"] = df_train["count_movie_rating"].fillna(0)
df_train["count_user_rating"] = df_train["count_user_rating"].fillna(0)
df_train["avg_movie_rating"] = df_train["avg_movie_rating"].fillna(0)
df_train["avg_user_rating"] = df_train["avg_user_rating"].fillna(0)
df_train["std_movie_rating"] = df_train["std_movie_rating"].fillna(0)
df_train["std_user_rating"] = df_train["std_user_rating"].fillna(0)



avg_movie_rating = df_val.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_val.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_val.groupby("movieId")["rating"].std().rename("std_movie_rating")
avg_user_rating = df_val.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_val.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_val.groupby("userId")["rating"].std().rename("std_user_rating")

df_val = df_val.merge(avg_movie_rating, on="movieId", how="left")
df_val = df_val.merge(count_movie_rating, on="movieId", how="left")
df_val = df_val.merge(std_movie_rating, on="movieId", how="left")
df_val = df_val.merge(avg_user_rating, on="userId", how="left")
df_val = df_val.merge(count_user_rating, on="userId", how="left")
df_val = df_val.merge(std_user_rating, on="userId", how="left")

df_val["count_movie_rating"] = df_val["count_movie_rating"].fillna(0)
df_val["count_user_rating"] = df_val["count_user_rating"].fillna(0)
df_val["avg_movie_rating"] = df_val["avg_movie_rating"].fillna(0)
df_val["avg_user_rating"] = df_val["avg_user_rating"].fillna(0)
df_val["std_movie_rating"] = df_val["std_movie_rating"].fillna(0)
df_val["std_user_rating"] = df_val["std_user_rating"].fillna(0)



avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")
avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_test = df_test.merge(avg_movie_rating, on="movieId", how="left")
df_test = df_test.merge(count_movie_rating, on="movieId", how="left")
df_test = df_test.merge(std_movie_rating, on="movieId", how="left")
df_test = df_test.merge(avg_user_rating, on="userId", how="left")
df_test = df_test.merge(count_user_rating, on="userId", how="left")
df_test = df_test.merge(std_user_rating, on="userId", how="left")

df_test["count_movie_rating"] = df_test["count_movie_rating"].fillna(0)
df_test["count_user_rating"] = df_test["count_user_rating"].fillna(0)
df_test["avg_movie_rating"] = df_test["avg_movie_rating"].fillna(0)
df_test["avg_user_rating"] = df_test["avg_user_rating"].fillna(0)
df_test["std_movie_rating"] = df_test["std_movie_rating"].fillna(0)
df_test["std_user_rating"] = df_test["std_user_rating"].fillna(0)


print(np.isnan(df_test).any())
print(df2.head())


userId                False
movieId               False
rating                False
(no genres listed)    False
Action                False
Adventure             False
Animation             False
Children              False
Comedy                False
Crime                 False
Documentary           False
Drama                 False
Fantasy               False
Film-Noir             False
Horror                False
IMAX                  False
Musical               False
Mystery               False
Romance               False
Sci-Fi                False
Thriller              False
War                   False
Western               False
avg_movie_rating      False
count_movie_rating    False
std_movie_rating      False
avg_user_rating       False
count_user_rating     False
std_user_rating       False
dtype: bool
   userId  movieId  rating  timestamp                        title  \
0       1        1       4  964982703             Toy Story (1995)   
1       1        3       4  96498124

In [28]:
numerical_col = ["movieId", "rating", "userId", "std_movie_rating",  "avg_movie_rating",  "count_movie_rating", "avg_user_rating",  "count_user_rating",  "std_user_rating"]
scaler = MinMaxScaler()

df_train[numerical_col] = scaler.fit_transform(df_train[numerical_col])
df_val[numerical_col] = scaler.fit_transform(df_val[numerical_col])
df_test[numerical_col] = scaler.fit_transform(df_test[numerical_col])

In [29]:
import torch
from torch.utils.data import TensorDataset, DataLoader

feature_cols = [col for col in df_train.columns if col != "rating"]

x_train = torch.tensor(df_train[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(df_train["rating"].values, dtype=torch.float32).unsqueeze(1)

x_val = torch.tensor(df_val[feature_cols].values, dtype=torch.float32)
y_val = torch.tensor(df_val["rating"].values, dtype=torch.float32).unsqueeze(1)

x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float32)
y_test = torch.tensor(df_test["rating"].values, dtype=torch.float32).unsqueeze(1)

torch.manual_seed(123)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

In [30]:
import torch.nn as nn
import torch.optim as optim
import torch

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(28, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)
device = "cuda" 
model = NeuralNetwork().to(device)  #To change to the GPU

lossFunction = torch.nn.HuberLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)


In [31]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# TRAINING FUNCTION
def train_loop(dataloader, model, lossFunction, optimizer):
    train_size = len(dataloader.dataset)    
    nbatches = len(dataloader)  

    model.train()
    loss_train = 0  
    all_preds = []
    all_targets = []

    for nbatch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        
        loss = lossFunction(logits, y)
        loss.backward()   
        optimizer.step()  
        optimizer.zero_grad()

        loss_train += loss.item()

        all_preds.extend(logits.detach().cpu().numpy()) 
        all_targets.extend(y.cpu().numpy())  

    avg_loss = loss_train / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'TRAINING -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


# VALIDATION FUNCTION
def val_loop(dataloader, model, lossFunction):
    val_size = len(dataloader.dataset)
    nbatches = len(dataloader)

    model.eval()

    loss_val = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            logits = model(X)

            loss_val += lossFunction(logits, y).item()
            
            all_preds.extend(logits.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    avg_loss = loss_val / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'VALIDATION -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


In [32]:

for i in range(50): 
    print(f"Iteration {i+1}/50 \n-----------------------------")
    train_loop(train_loader, model, lossFunction, optimizer)
    val_loop(val_loader, model, lossFunction)

Iteration 1/50 
-----------------------------
TRAINING -> Loss: 0.052538, MSE: 0.105261, RMSE: 0.324439, R²: -1.223391
VALIDATION -> Loss: 0.017285, MSE: 0.034591, RMSE: 0.185986, R²: 0.286081
Iteration 2/50 
-----------------------------
TRAINING -> Loss: 0.019013, MSE: 0.038034, RMSE: 0.195022, R²: 0.196624
VALIDATION -> Loss: 0.013096, MSE: 0.026218, RMSE: 0.161921, R²: 0.458879
Iteration 3/50 
-----------------------------
TRAINING -> Loss: 0.017171, MSE: 0.034358, RMSE: 0.185360, R²: 0.274259
VALIDATION -> Loss: 0.011494, MSE: 0.023015, RMSE: 0.151708, R²: 0.524987
Iteration 4/50 
-----------------------------
TRAINING -> Loss: 0.016355, MSE: 0.032714, RMSE: 0.180870, R²: 0.308993
VALIDATION -> Loss: 0.010864, MSE: 0.021754, RMSE: 0.147491, R²: 0.551029
Iteration 5/50 
-----------------------------
TRAINING -> Loss: 0.015886, MSE: 0.031773, RMSE: 0.178251, R²: 0.328862
VALIDATION -> Loss: 0.010596, MSE: 0.021222, RMSE: 0.145676, R²: 0.562010
Iteration 6/50 
-----------------------

In [33]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import binarize
from scipy.stats import rankdata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x_train = x_train.to(device)
y_train = y_train.to(device)
model.to(device)
with torch.no_grad():
    y_train_pred = model(x_train)
y_train_np = y_train.cpu().numpy()
y_train_pred_np = y_train_pred.cpu().numpy()

# --- METRICS ---

# R^2 Score
ss_total = np.sum((y_train_np - np.mean(y_train_np)) ** 2)
ss_residual = np.sum((y_train_np - y_train_pred_np) ** 2)
r2_score = 1 - (ss_residual / ss_total) if ss_total != 0 else 0.0

# MAE
mae = np.mean(np.abs(y_train_np - y_train_pred_np))

# MSE
mse = np.mean((y_train_np - y_train_pred_np) ** 2)

# RMSE
rmse = np.sqrt(mse)

# ACCURACY AND RECALL
threshold = np.median(y_train_np)  

y_train_bin = binarize(y_train_np.reshape(-1, 1), threshold=threshold).flatten()
y_train_pred_bin = binarize(y_train_pred_np.reshape(-1, 1), threshold=threshold).flatten()

precision = precision_score(y_train_bin, y_train_pred_bin)
recall = recall_score(y_train_bin, y_train_pred_bin)

# NDCG 
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]  
    y_true_sorted = np.take(y_true, order[:k])
    
    gains = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(2, len(y_true_sorted) + 2))
    
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best_dcg = dcg_score(y_true, y_true, k)  #
    actual_dcg = dcg_score(y_true, y_score, k)
    
    return actual_dcg / best_dcg if best_dcg > 0 else 0

ndcg = ndcg_score(y_train_np, y_train_pred_np)

print(f"R^2 Score: {r2_score:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")


R^2 Score: 0.4452
MAE: 0.1217
MSE: 0.0263
RMSE: 0.1621
Precisión: 0.6199
Recall: 0.9309
NDCG@10: 1.0000


# **SECOND MODEL**
# Adding to the first model the data from TMBD 


In [86]:
tmdb = pd.read_csv("ml-latest-small/movie_info_tmdb.csv", usecols=lambda column: column != "origin_country")
df = tmdb.merge(links, on="tmdbId", how="left")
df = df.merge(ratings, on="movieId", how="left")

df2 = df.copy()
df2 = df2.drop(columns=["title"])
df2["release_date"] = pd.to_datetime(df2["release_date"], errors="coerce").dt.year
df2 = pd.get_dummies(df2, columns=["original_language"], dtype=float)
columns_to_convert = [col for col in df2.columns if col != "title"]
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors="coerce")
numerical_col = [col for col in df2.columns if col not in ["title"]]

In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np

df_shuffle = df2.sample(frac=1, random_state=123).drop(columns=["timestamp"])
df_shuffle = df_shuffle.dropna()

df_train = df_shuffle.iloc[:int(len(df_shuffle) * 0.6), :]
df_val = df_shuffle.iloc[int(len(df_shuffle) * 0.6):int(len(df_shuffle) * 0.8), :]
df_test = df_shuffle.iloc[int(len(df_shuffle) * 0.8):, :]

scalers = {}

feature_cols = [col for col in df_shuffle.columns if col not in ["rating", "timestamp", "title"]]
x_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_val, y_val = df_val[feature_cols].to_numpy(dtype=np.float32), df_val["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_test, y_test = df_test[feature_cols].to_numpy(dtype=np.float32), df_test["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

x_train, y_train = torch.tensor(x_train), torch.tensor(y_train).float()
x_val, y_val = torch.tensor(x_val), torch.tensor(y_val).float()
x_test, y_test = torch.tensor(x_test), torch.tensor(y_test).float()



(61074, 58) (61074, 1)
(20358, 58) (20358, 1)
(20358, 58) (20358, 1)


In [88]:
import pandas as pd

avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_train = df_train.merge(avg_movie_rating, on="movieId", how="left")
df_train = df_train.merge(count_movie_rating, on="movieId", how="left")
df_train = df_train.merge(std_movie_rating, on="movieId", how="left")

df_train = df_train.merge(avg_user_rating, on="userId", how="left")
df_train = df_train.merge(count_user_rating, on="userId", how="left")
df_train = df_train.merge(std_user_rating, on="userId", how="left")

df_train["count_movie_rating"] = df_train["count_movie_rating"].fillna(0)
df_train["count_user_rating"] = df_train["count_user_rating"].fillna(0)
df_train["avg_movie_rating"] = df_train["avg_movie_rating"].fillna(0)
df_train["avg_user_rating"] = df_train["avg_user_rating"].fillna(0)
df_train["std_movie_rating"] = df_train["std_movie_rating"].fillna(0)
df_train["std_user_rating"] = df_train["std_user_rating"].fillna(0)


avg_movie_rating = df_val.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_val.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_val.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_val.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_val.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_val.groupby("userId")["rating"].std().rename("std_user_rating")

df_val = df_val.merge(avg_movie_rating, on="movieId", how="left")
df_val = df_val.merge(count_movie_rating, on="movieId", how="left")
df_val = df_val.merge(std_movie_rating, on="movieId", how="left")

df_val = df_val.merge(avg_user_rating, on="userId", how="left")
df_val = df_val.merge(count_user_rating, on="userId", how="left")
df_val = df_val.merge(std_user_rating, on="userId", how="left")

df_val["count_movie_rating"] = df_val["count_movie_rating"].fillna(0)
df_val["count_user_rating"] = df_val["count_user_rating"].fillna(0)
df_val["avg_movie_rating"] = df_val["avg_movie_rating"].fillna(0)
df_val["avg_user_rating"] = df_val["avg_user_rating"].fillna(0)
df_val["std_movie_rating"] = df_val["std_movie_rating"].fillna(0)
df_val["std_user_rating"] = df_val["std_user_rating"].fillna(0)


avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_test = df_test.merge(avg_movie_rating, on="movieId", how="left")
df_test = df_test.merge(count_movie_rating, on="movieId", how="left")
df_test = df_test.merge(std_movie_rating, on="movieId", how="left")

df_test = df_test.merge(avg_user_rating, on="userId", how="left")
df_test = df_test.merge(count_user_rating, on="userId", how="left")
df_test = df_test.merge(std_user_rating, on="userId", how="left")

df_test["count_movie_rating"] = df_test["count_movie_rating"].fillna(0)
df_test["count_user_rating"] = df_test["count_user_rating"].fillna(0)
df_test["avg_movie_rating"] = df_test["avg_movie_rating"].fillna(0)
df_test["avg_user_rating"] = df_test["avg_user_rating"].fillna(0)
df_test["std_movie_rating"] = df_test["std_movie_rating"].fillna(0)
df_test["std_user_rating"] = df_test["std_user_rating"].fillna(0)


print(np.isnan(df_train).any())
print(df2.head())


tmdbId                False
ratingTmdb            False
release_date          False
votes                 False
budget                False
                      ...  
count_movie_rating    False
std_movie_rating      False
avg_user_rating       False
count_user_rating     False
std_user_rating       False
Length: 65, dtype: bool
   tmdbId  ratingTmdb  release_date    votes      budget      revenue  \
0     862         8.0        1995.0  18705.0  30000000.0  394436586.0   
1     862         8.0        1995.0  18705.0  30000000.0  394436586.0   
2     862         8.0        1995.0  18705.0  30000000.0  394436586.0   
3     862         8.0        1995.0  18705.0  30000000.0  394436586.0   
4     862         8.0        1995.0  18705.0  30000000.0  394436586.0   

   runtime  movieId  imdbId  userId  ...  original_language_sr  \
0     81.0        1  114709     1.0  ...                   0.0   
1     81.0        1  114709     5.0  ...                   0.0   
2     81.0        1  114709    

In [89]:
scaler = MinMaxScaler()

numerical_col = df_train.select_dtypes(include=['number']).columns
df_train[numerical_col] = scaler.fit_transform(df_train[numerical_col])
df_val[numerical_col] = scaler.transform(df_val[numerical_col])  # Usar transform en validación
df_test[numerical_col] = scaler.transform(df_test[numerical_col]) 

print(df_train.shape)

(61074, 65)


In [90]:
import torch
from torch.utils.data import TensorDataset, DataLoader

feature_cols = [col for col in df_train.columns if col != "rating"]

x_train = torch.tensor(df_train[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(df_train["rating"].values, dtype=torch.float32).unsqueeze(1)
x_val = torch.tensor(df_val[feature_cols].values, dtype=torch.float32)
y_val = torch.tensor(df_val["rating"].values, dtype=torch.float32).unsqueeze(1)
x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float32)
y_test = torch.tensor(df_test["rating"].values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

In [91]:
import torch.nn as nn
import torch.optim as optim
import torch

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)
device = "cuda" 
model = NeuralNetwork().to(device)  #To change to the GPU

lossFunction = torch.nn.HuberLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)

In [92]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# TRAINING FUNCTION
def train_loop(dataloader, model, lossFunction, optimizer):
    train_size = len(dataloader.dataset)    
    nbatches = len(dataloader)  

    model.train()
    loss_train = 0  
    all_preds = []
    all_targets = []

    for nbatch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        
        loss = lossFunction(logits, y)
        loss.backward()   
        optimizer.step()  
        optimizer.zero_grad()

        loss_train += loss.item()

        all_preds.extend(logits.detach().cpu().numpy())  
        all_targets.extend(y.cpu().numpy())  


    avg_loss = loss_train / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'TRAINING -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


# VALIDATION FUNCTION
def val_loop(dataloader, model, lossFunction):
    val_size = len(dataloader.dataset)
    nbatches = len(dataloader)

    model.eval()

    loss_val = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            logits = model(X)

            loss_val += lossFunction(logits, y).item()
            
            all_preds.extend(logits.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    avg_loss = loss_val / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'VALIDATION -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')

In [93]:
for i in range(100): 
    print(f"Iteration {i+1}/50 \n-----------------------------")
    train_loop(train_loader, model, lossFunction, optimizer)
    val_loop(val_loader, model, lossFunction)

Iteration 1/50 
-----------------------------
TRAINING -> Loss: 0.060176, MSE: 0.120752, RMSE: 0.347494, R²: -1.543586
VALIDATION -> Loss: 0.018633, MSE: 0.037276, RMSE: 0.193070, R²: 0.209375
Iteration 2/50 
-----------------------------
TRAINING -> Loss: 0.025117, MSE: 0.050271, RMSE: 0.224212, R²: -0.058942
VALIDATION -> Loss: 0.013532, MSE: 0.027070, RMSE: 0.164529, R²: 0.425852
Iteration 3/50 
-----------------------------
TRAINING -> Loss: 0.020738, MSE: 0.041486, RMSE: 0.203680, R²: 0.126124
VALIDATION -> Loss: 0.012881, MSE: 0.025767, RMSE: 0.160520, R²: 0.453488
Iteration 4/50 
-----------------------------
TRAINING -> Loss: 0.019653, MSE: 0.039307, RMSE: 0.198260, R²: 0.172011
VALIDATION -> Loss: 0.012506, MSE: 0.025017, RMSE: 0.158168, R²: 0.469389
Iteration 5/50 
-----------------------------
TRAINING -> Loss: 0.018903, MSE: 0.037814, RMSE: 0.194458, R²: 0.203463
VALIDATION -> Loss: 0.012372, MSE: 0.024750, RMSE: 0.157320, R²: 0.475060
Iteration 6/50 
----------------------

In [94]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import binarize
from scipy.stats import rankdata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x_train = x_train.to(device)
y_train = y_train.to(device)
model.to(device)
with torch.no_grad():
    y_train_pred = model(x_train)
y_train_np = y_train.cpu().numpy()
y_train_pred_np = y_train_pred.cpu().numpy()

# --- METRICS ---

# R^2 Score
ss_total = np.sum((y_train_np - np.mean(y_train_np)) ** 2)
ss_residual = np.sum((y_train_np - y_train_pred_np) ** 2)
r2_score = 1 - (ss_residual / ss_total) if ss_total != 0 else 0.0

# MAE
mae = np.mean(np.abs(y_train_np - y_train_pred_np))

# MSE
mse = np.mean((y_train_np - y_train_pred_np) ** 2)

# RMSE
rmse = np.sqrt(mse)

# ACCURACY AND RECALL
threshold = np.median(y_train_np)  

y_train_bin = binarize(y_train_np.reshape(-1, 1), threshold=threshold).flatten()
y_train_pred_bin = binarize(y_train_pred_np.reshape(-1, 1), threshold=threshold).flatten()

precision = precision_score(y_train_bin, y_train_pred_bin)
recall = recall_score(y_train_bin, y_train_pred_bin)

# NDCG 
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]  
    y_true_sorted = np.take(y_true, order[:k])
    
    gains = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(2, len(y_true_sorted) + 2))
    
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best_dcg = dcg_score(y_true, y_true, k)  #
    actual_dcg = dcg_score(y_true, y_score, k)
    
    return actual_dcg / best_dcg if best_dcg > 0 else 0

ndcg = ndcg_score(y_train_np, y_train_pred_np)

print(f"R^2 Score: {r2_score:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

R^2 Score: 0.4637
MAE: 0.1192
MSE: 0.0255
RMSE: 0.1596
Precisión: 0.6020
Recall: 0.9477
NDCG@10: 1.0000


# **THIRD MODEL**
# Adding to the first model the data from IMDB 

In [41]:
imdb = pd.read_csv("ml-latest-small/movie_info_imdb.csv", usecols=lambda column: column != "origin_country")
imdb['imdbId'] = imdb['imdbId'].str.replace('tt', '', regex=False).astype(int)
columns_to_drop = ['Actors', 'Awards', 'DVD', 'Director', 'Genre', 'Title', 'Type', 'Website', 'Year', 'Poster', 'Production', 'Rated', 'Plot', 'Writer', 'Response', 'Ratings']
imdb = imdb.drop(columns=columns_to_drop)

imdb = imdb.astype({
    "BoxOffice": "string",  
    "Country": "string",
    "Language": "string",
    "Metascore": "float",
    "Released": "string",
    "Runtime": "string",
    "imdbRating": "float",
    "imdbVotes": "string"
})

imdb['BoxOffice'] = imdb['BoxOffice'].replace('[\$,]', '', regex=True).astype(float)
imdb['Released'] = imdb['Released'].str.extract(r'(\d{4})').astype(float)
imdb['Runtime'] = imdb['Runtime'].str.extract(r'(\d+)').astype(float)
imdb['imdbVotes'] = imdb['imdbVotes'].str.replace(',', '', regex=True).astype(float)

imdb = imdb.merge(links, on="imdbId", how="left")
imdb = imdb.merge(ratings, on="movieId", how="left")

imdb = imdb.dropna()

In [42]:
df2 = imdb
df2 = pd.get_dummies(df2, columns=["Country"], dtype=float)

print(df2.columns)
df2["Language"] = df2["Language"].str.split(",")
moviesExploded = df2.explode("Language")
moviesExploded["Language"] = moviesExploded["Language"].str.strip()
movies_dummies = pd.get_dummies(moviesExploded["Language"], dtype=int)
movies_dummies = moviesExploded[["movieId"]].join(movies_dummies).groupby("movieId").max()
df2 = df2.drop(columns=["Language"]).merge(movies_dummies, on="movieId")

columns_to_convert = [col for col in df2.columns]
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors="coerce")

df2 = df2.dropna()

Index(['imdbId', 'BoxOffice', 'Language', 'Metascore', 'Released', 'Runtime',
       'imdbRating', 'imdbVotes', 'movieId', 'tmdbId',
       ...
       'Country_United States, United Kingdom, Switzerland, Panama',
       'Country_United States, Venezuela',
       'Country_United States, Vietnam, United Kingdom, Canada, Denmark',
       'Country_United States, West Germany', 'Country_West Germany',
       'Country_West Germany, France',
       'Country_West Germany, France, United Kingdom',
       'Country_West Germany, Italy, France',
       'Country_West Germany, United States',
       'Country_Yugoslavia, United States'],
      dtype='object', length=931)


In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np

df_shuffle = df2.sample(frac=1, random_state=123).drop(columns=["timestamp"])
df_shuffle = df_shuffle.dropna()

df_train = df_shuffle.iloc[:int(len(df_shuffle) * 0.6), :]
df_val = df_shuffle.iloc[int(len(df_shuffle) * 0.6):int(len(df_shuffle) * 0.8), :]
df_test = df_shuffle.iloc[int(len(df_shuffle) * 0.8):, :]

scalers = {}

feature_cols = [col for col in df_shuffle.columns]
x_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_val, y_val = df_val[feature_cols].to_numpy(dtype=np.float32), df_val["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_test, y_test = df_test[feature_cols].to_numpy(dtype=np.float32), df_test["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

x_train, y_train = torch.tensor(x_train), torch.tensor(y_train).float()
x_val, y_val = torch.tensor(x_val), torch.tensor(y_val).float()
x_test, y_test = torch.tensor(x_test), torch.tensor(y_test).float()

(54776, 1087) (54776, 1)
(18259, 1087) (18259, 1)
(18259, 1087) (18259, 1)


In [44]:
import pandas as pd

avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_train = df_train.merge(avg_movie_rating, on="movieId", how="left")
df_train = df_train.merge(count_movie_rating, on="movieId", how="left")
df_train = df_train.merge(std_movie_rating, on="movieId", how="left")

df_train = df_train.merge(avg_user_rating, on="userId", how="left")
df_train = df_train.merge(count_user_rating, on="userId", how="left")
df_train = df_train.merge(std_user_rating, on="userId", how="left")

df_train["count_movie_rating"] = df_train["count_movie_rating"].fillna(0)
df_train["count_user_rating"] = df_train["count_user_rating"].fillna(0)
df_train["avg_movie_rating"] = df_train["avg_movie_rating"].fillna(0)
df_train["avg_user_rating"] = df_train["avg_user_rating"].fillna(0)
df_train["std_movie_rating"] = df_train["std_movie_rating"].fillna(0)
df_train["std_user_rating"] = df_train["std_user_rating"].fillna(0)



avg_movie_rating = df_val.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_val.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_val.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_val.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_val.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_val.groupby("userId")["rating"].std().rename("std_user_rating")

df_val = df_val.merge(avg_movie_rating, on="movieId", how="left")
df_val = df_val.merge(count_movie_rating, on="movieId", how="left")
df_val = df_val.merge(std_movie_rating, on="movieId", how="left")

df_val = df_val.merge(avg_user_rating, on="userId", how="left")
df_val = df_val.merge(count_user_rating, on="userId", how="left")
df_val = df_val.merge(std_user_rating, on="userId", how="left")

df_val["count_movie_rating"] = df_val["count_movie_rating"].fillna(0)
df_val["count_user_rating"] = df_val["count_user_rating"].fillna(0)
df_val["avg_movie_rating"] = df_val["avg_movie_rating"].fillna(0)
df_val["avg_user_rating"] = df_val["avg_user_rating"].fillna(0)
df_val["std_movie_rating"] = df_val["std_movie_rating"].fillna(0)
df_val["std_user_rating"] = df_val["std_user_rating"].fillna(0)



avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_test = df_test.merge(avg_movie_rating, on="movieId", how="left")
df_test = df_test.merge(count_movie_rating, on="movieId", how="left")
df_test = df_test.merge(std_movie_rating, on="movieId", how="left")

df_test = df_test.merge(avg_user_rating, on="userId", how="left")
df_test = df_test.merge(count_user_rating, on="userId", how="left")
df_test = df_test.merge(std_user_rating, on="userId", how="left")

df_test["count_movie_rating"] = df_test["count_movie_rating"].fillna(0)
df_test["count_user_rating"] = df_test["count_user_rating"].fillna(0)
df_test["avg_movie_rating"] = df_test["avg_movie_rating"].fillna(0)
df_test["avg_user_rating"] = df_test["avg_user_rating"].fillna(0)
df_test["std_movie_rating"] = df_test["std_movie_rating"].fillna(0)
df_test["std_user_rating"] = df_test["std_user_rating"].fillna(0)


print(np.isnan(df_train).any())
print(df2.head())

imdbId                False
BoxOffice             False
Metascore             False
Released              False
Runtime               False
                      ...  
count_movie_rating    False
std_movie_rating      False
avg_user_rating       False
count_user_rating     False
std_user_rating       False
Length: 1093, dtype: bool
   imdbId    BoxOffice  Metascore  Released  Runtime  imdbRating  imdbVotes  \
0  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
1  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
2  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
3  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
4  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   

   movieId  tmdbId  userId  ...  Ungwatsi  Urdu  Vietnamese  Washoe  Welsh  \
0      1.0   862.0     1.0  ...         0     0           0       0      0   
1      1.0   862.0     5.0  ...    

In [45]:
scaler = MinMaxScaler()

numerical_col = df_train.select_dtypes(include=['number']).columns

df_train[numerical_col] = scaler.fit_transform(df_train[numerical_col])
df_val[numerical_col] = scaler.transform(df_val[numerical_col])  
df_test[numerical_col] = scaler.transform(df_test[numerical_col]) 

print(df_train.shape)

(54776, 1093)


In [46]:
import torch
from torch.utils.data import TensorDataset, DataLoader

feature_cols = [col for col in df_train.columns if col != "rating"]

x_train = torch.tensor(df_train[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(df_train["rating"].values, dtype=torch.float32).unsqueeze(1)

x_val = torch.tensor(df_val[feature_cols].values, dtype=torch.float32)
y_val = torch.tensor(df_val["rating"].values, dtype=torch.float32).unsqueeze(1)

x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float32)
y_test = torch.tensor(df_test["rating"].values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

In [50]:
import torch.nn as nn
import torch.optim as optim
import torch

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(1092, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.model(x)
device = "cuda" 
model = NeuralNetwork().to(device)  #To change to the GPU

lossFunction = torch.nn.HuberLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)


In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# TRAINING FUNCTION
def train_loop(dataloader, model, lossFunction, optimizer):
    train_size = len(dataloader.dataset)    
    nbatches = len(dataloader)  

    model.train()
    loss_train = 0  
    all_preds = []
    all_targets = []

    for nbatch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        
        loss = lossFunction(logits, y)
        loss.backward()   
        optimizer.step()  
        optimizer.zero_grad()

        loss_train += loss.item()

        all_preds.extend(logits.detach().cpu().numpy())  
        all_targets.extend(y.cpu().numpy())  


    avg_loss = loss_train / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'TRAINING -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


# VALIDATION FUNCTION
def val_loop(dataloader, model, lossFunction):
    val_size = len(dataloader.dataset)
    nbatches = len(dataloader)

    model.eval()

    loss_val = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            logits = model(X)

            loss_val += lossFunction(logits, y).item()
            
            all_preds.extend(logits.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    avg_loss = loss_val / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'VALIDATION -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')

In [52]:
for i in range(50): 
    print(f"Iteration {i+1}/50 \n-----------------------------")
    train_loop(train_loader, model, lossFunction, optimizer)
    val_loop(val_loader, model, lossFunction)

Iteration 1/50 
-----------------------------
TRAINING -> Loss: 0.049663, MSE: 0.099712, RMSE: 0.315772, R²: -1.120909
VALIDATION -> Loss: 0.019124, MSE: 0.038265, RMSE: 0.195613, R²: 0.188712
Iteration 2/50 
-----------------------------
TRAINING -> Loss: 0.025902, MSE: 0.051784, RMSE: 0.227561, R²: -0.101470
VALIDATION -> Loss: 0.015297, MSE: 0.030611, RMSE: 0.174961, R²: 0.350976
Iteration 3/50 
-----------------------------
TRAINING -> Loss: 0.023384, MSE: 0.046781, RMSE: 0.216290, R²: 0.004945
VALIDATION -> Loss: 0.015854, MSE: 0.031722, RMSE: 0.178108, R²: 0.327418
Iteration 4/50 
-----------------------------
TRAINING -> Loss: 0.022214, MSE: 0.044449, RMSE: 0.210830, R²: 0.054544
VALIDATION -> Loss: 0.014447, MSE: 0.028911, RMSE: 0.170034, R²: 0.387019
Iteration 5/50 
-----------------------------
TRAINING -> Loss: 0.021357, MSE: 0.042712, RMSE: 0.206669, R²: 0.091497
VALIDATION -> Loss: 0.013339, MSE: 0.026689, RMSE: 0.163367, R²: 0.434143
Iteration 6/50 
----------------------

In [53]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import binarize
from scipy.stats import rankdata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x_train = x_train.to(device)
y_train = y_train.to(device)
model.to(device)
with torch.no_grad():
    y_train_pred = model(x_train)
y_train_np = y_train.cpu().numpy()
y_train_pred_np = y_train_pred.cpu().numpy()

# --- METRICS ---

# R^2 Score
ss_total = np.sum((y_train_np - np.mean(y_train_np)) ** 2)
ss_residual = np.sum((y_train_np - y_train_pred_np) ** 2)
r2_score = 1 - (ss_residual / ss_total) if ss_total != 0 else 0.0

# MAE
mae = np.mean(np.abs(y_train_np - y_train_pred_np))

# MSE
mse = np.mean((y_train_np - y_train_pred_np) ** 2)

# RMSE
rmse = np.sqrt(mse)

# ACCURACY AND RECALL
threshold = np.median(y_train_np)  

y_train_bin = binarize(y_train_np.reshape(-1, 1), threshold=threshold).flatten()
y_train_pred_bin = binarize(y_train_pred_np.reshape(-1, 1), threshold=threshold).flatten()

precision = precision_score(y_train_bin, y_train_pred_bin)
recall = recall_score(y_train_bin, y_train_pred_bin)

# NDCG 
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]  
    y_true_sorted = np.take(y_true, order[:k])
    
    gains = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(2, len(y_true_sorted) + 2))
    
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best_dcg = dcg_score(y_true, y_true, k)  #
    actual_dcg = dcg_score(y_true, y_score, k)
    
    return actual_dcg / best_dcg if best_dcg > 0 else 0

ndcg = ndcg_score(y_train_np, y_train_pred_np)

print(f"R^2 Score: {r2_score:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

R^2 Score: 0.4849
MAE: 0.1184
MSE: 0.0242
RMSE: 0.1556
Precisión: 0.6555
Recall: 0.9182
NDCG@10: 1.0000


# **FOURTH MODEL**
# Adding to the first model the data from IMDB and TMDB

In [54]:
imdb = pd.read_csv("ml-latest-small/movie_info_imdb.csv", usecols=lambda column: column != "origin_country")
imdb['imdbId'] = imdb['imdbId'].str.replace('tt', '', regex=False).astype(int)
columns_to_drop = ['Actors', 'Awards', 'DVD', 'Director', 'Genre', 'Title', 'Type', 'Website', 'Year', 'Poster', 'Production', 'Rated', 'Plot', 'Writer', 'Response', 'Ratings']
imdb = imdb.drop(columns=columns_to_drop)

imdb = imdb.astype({
    "BoxOffice": "string",  
    "Country": "string",
    "Language": "string",
    "Metascore": "float",
    "Released": "string",
    "Runtime": "string",
    "imdbRating": "float",
    "imdbVotes": "string"
})

imdb['BoxOffice'] = imdb['BoxOffice'].replace('[\$,]', '', regex=True).astype(float)
imdb['Released'] = imdb['Released'].str.extract(r'(\d{4})').astype(float)
imdb['Runtime'] = imdb['Runtime'].str.extract(r'(\d+)').astype(float)
imdb['imdbVotes'] = imdb['imdbVotes'].str.replace(',', '', regex=True).astype(float)


tmdb = pd.read_csv("ml-latest-small/movie_info_tmdb.csv", usecols=lambda column: column != "origin_country")
tmdb = tmdb.drop(columns=["title", "original_language"])
tmdb["release_date"] = pd.to_datetime(tmdb["release_date"], errors="coerce").dt.year

imdb = imdb.merge(links, on="imdbId", how="left")
imdb = imdb.merge(ratings, on="movieId", how="left")
df2 = imdb.merge(tmdb, on="tmdbId", how="left")

imdb = imdb.dropna()
print(imdb.columns)

print(imdb.describe)

Index(['imdbId', 'BoxOffice', 'Country', 'Language', 'Metascore', 'Released',
       'Runtime', 'imdbRating', 'imdbVotes', 'movieId', 'tmdbId', 'userId',
       'rating', 'timestamp'],
      dtype='object')
<bound method NDFrame.describe of          imdbId    BoxOffice                               Country  \
0        114709  223225679.0                         United States   
1        114709  223225679.0                         United States   
2        114709  223225679.0                         United States   
3        114709  223225679.0                         United States   
4        114709  223225679.0                         United States   
...         ...          ...                                   ...   
100799  4912910  220159104.0  United States, China, France, Norway   
100800  4912910  220159104.0  United States, China, France, Norway   
100801  7690670   20545116.0                         United States   
100803  7349662   49275340.0                  United States

In [55]:
df2 = imdb

df2 = pd.get_dummies(df2, columns=["Country"], dtype=float)

df2["Language"] = df2["Language"].str.split(",")
moviesExploded = df2.explode("Language")
moviesExploded["Language"] = moviesExploded["Language"].str.strip()
movies_dummies = pd.get_dummies(moviesExploded["Language"], dtype=int)
movies_dummies = moviesExploded[["movieId"]].join(movies_dummies).groupby("movieId").max()
df2 = df2.drop(columns=["Language"]).merge(movies_dummies, on="movieId")

columns_to_convert = [col for col in df2.columns]
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors="coerce")

df2 = df2.dropna()

print(df2.columns)
print(df2.head)

Index(['imdbId', 'BoxOffice', 'Metascore', 'Released', 'Runtime', 'imdbRating',
       'imdbVotes', 'movieId', 'tmdbId', 'userId',
       ...
       'Ungwatsi', 'Urdu', 'Vietnamese', 'Washoe', 'Welsh', 'Wolof', 'Xhosa',
       'Yiddish', 'Yoruba', 'Zulu'],
      dtype='object', length=1088)
<bound method NDFrame.head of         imdbId    BoxOffice  Metascore  Released  Runtime  imdbRating  \
0       114709  223225679.0       96.0    1995.0     81.0         8.3   
1       114709  223225679.0       96.0    1995.0     81.0         8.3   
2       114709  223225679.0       96.0    1995.0     81.0         8.3   
3       114709  223225679.0       96.0    1995.0     81.0         8.3   
4       114709  223225679.0       96.0    1995.0     81.0         8.3   
...        ...          ...        ...       ...      ...         ...   
91289  4912910  220159104.0       87.0    2018.0    147.0         7.7   
91290  4912910  220159104.0       87.0    2018.0    147.0         7.7   
91291  7690670   2054

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np

df_shuffle = df2.sample(frac=1, random_state=123).drop(columns=["timestamp"])

df_shuffle = df_shuffle.dropna()

df_train = df_shuffle.iloc[:int(len(df_shuffle) * 0.6), :]
df_val = df_shuffle.iloc[int(len(df_shuffle) * 0.6):int(len(df_shuffle) * 0.8), :]
df_test = df_shuffle.iloc[int(len(df_shuffle) * 0.8):, :]

scalers = {}

feature_cols = [col for col in df_shuffle.columns]
x_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_val, y_val = df_val[feature_cols].to_numpy(dtype=np.float32), df_val["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_test, y_test = df_test[feature_cols].to_numpy(dtype=np.float32), df_test["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

x_train, y_train = torch.tensor(x_train), torch.tensor(y_train).float()
x_val, y_val = torch.tensor(x_val), torch.tensor(y_val).float()
x_test, y_test = torch.tensor(x_test), torch.tensor(y_test).float()

(54776, 1087) (54776, 1)
(18259, 1087) (18259, 1)
(18259, 1087) (18259, 1)


In [57]:
import pandas as pd

print(df2.columns)

avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_train = df_train.merge(avg_movie_rating, on="movieId", how="left")
df_train = df_train.merge(count_movie_rating, on="movieId", how="left")
df_train = df_train.merge(std_movie_rating, on="movieId", how="left")

df_train = df_train.merge(avg_user_rating, on="userId", how="left")
df_train = df_train.merge(count_user_rating, on="userId", how="left")
df_train = df_train.merge(std_user_rating, on="userId", how="left")

df_train["count_movie_rating"] = df_train["count_movie_rating"].fillna(0)
df_train["count_user_rating"] = df_train["count_user_rating"].fillna(0)
df_train["avg_movie_rating"] = df_train["avg_movie_rating"].fillna(0)
df_train["avg_user_rating"] = df_train["avg_user_rating"].fillna(0)
df_train["std_movie_rating"] = df_train["std_movie_rating"].fillna(0)
df_train["std_user_rating"] = df_train["std_user_rating"].fillna(0)



avg_movie_rating = df_val.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_val.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_val.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_val.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_val.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_val.groupby("userId")["rating"].std().rename("std_user_rating")

df_val = df_val.merge(avg_movie_rating, on="movieId", how="left")
df_val = df_val.merge(count_movie_rating, on="movieId", how="left")
df_val = df_val.merge(std_movie_rating, on="movieId", how="left")

df_val = df_val.merge(avg_user_rating, on="userId", how="left")
df_val = df_val.merge(count_user_rating, on="userId", how="left")
df_val = df_val.merge(std_user_rating, on="userId", how="left")

df_val["count_movie_rating"] = df_val["count_movie_rating"].fillna(0)
df_val["count_user_rating"] = df_val["count_user_rating"].fillna(0)
df_val["avg_movie_rating"] = df_val["avg_movie_rating"].fillna(0)
df_val["avg_user_rating"] = df_val["avg_user_rating"].fillna(0)
df_val["std_movie_rating"] = df_val["std_movie_rating"].fillna(0)
df_val["std_user_rating"] = df_val["std_user_rating"].fillna(0)



avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_test = df_test.merge(avg_movie_rating, on="movieId", how="left")
df_test = df_test.merge(count_movie_rating, on="movieId", how="left")
df_test = df_test.merge(std_movie_rating, on="movieId", how="left")

df_test = df_test.merge(avg_user_rating, on="userId", how="left")
df_test = df_test.merge(count_user_rating, on="userId", how="left")
df_test = df_test.merge(std_user_rating, on="userId", how="left")

df_test["count_movie_rating"] = df_test["count_movie_rating"].fillna(0)
df_test["count_user_rating"] = df_test["count_user_rating"].fillna(0)
df_test["avg_movie_rating"] = df_test["avg_movie_rating"].fillna(0)
df_test["avg_user_rating"] = df_test["avg_user_rating"].fillna(0)
df_test["std_movie_rating"] = df_test["std_movie_rating"].fillna(0)
df_test["std_user_rating"] = df_test["std_user_rating"].fillna(0)


print(np.isnan(df_train).any())
print(df2.head())

Index(['imdbId', 'BoxOffice', 'Metascore', 'Released', 'Runtime', 'imdbRating',
       'imdbVotes', 'movieId', 'tmdbId', 'userId',
       ...
       'Ungwatsi', 'Urdu', 'Vietnamese', 'Washoe', 'Welsh', 'Wolof', 'Xhosa',
       'Yiddish', 'Yoruba', 'Zulu'],
      dtype='object', length=1088)
imdbId                False
BoxOffice             False
Metascore             False
Released              False
Runtime               False
                      ...  
count_movie_rating    False
std_movie_rating      False
avg_user_rating       False
count_user_rating     False
std_user_rating       False
Length: 1093, dtype: bool
   imdbId    BoxOffice  Metascore  Released  Runtime  imdbRating  imdbVotes  \
0  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
1  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
2  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
3  114709  223225679.0       96.0    1995.0     81.0      

In [58]:
scaler = MinMaxScaler()

numerical_col = df_train.select_dtypes(include=['number']).columns

df_train[numerical_col] = scaler.fit_transform(df_train[numerical_col])
df_val[numerical_col] = scaler.transform(df_val[numerical_col])  
df_test[numerical_col] = scaler.transform(df_test[numerical_col]) 

print(df_train.shape)

(54776, 1093)


In [59]:
import torch
from torch.utils.data import TensorDataset, DataLoader

feature_cols = [col for col in df_train.columns if col != "rating"]

x_train = torch.tensor(df_train[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(df_train["rating"].values, dtype=torch.float32).unsqueeze(1)

x_val = torch.tensor(df_val[feature_cols].values, dtype=torch.float32)
y_val = torch.tensor(df_val["rating"].values, dtype=torch.float32).unsqueeze(1)

x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float32)
y_test = torch.tensor(df_test["rating"].values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

In [60]:
import torch.nn as nn
import torch.optim as optim
import torch

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(1092, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.model(x)
device = "cuda" 
model = NeuralNetwork().to(device)  #To change to the GPU

lossFunction = torch.nn.HuberLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)

In [61]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# TRAINING FUNCTION
def train_loop(dataloader, model, lossFunction, optimizer):
    train_size = len(dataloader.dataset)    
    nbatches = len(dataloader)  

    model.train()
    loss_train = 0  
    all_preds = []
    all_targets = []

    for nbatch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        
        loss = lossFunction(logits, y)
        loss.backward()   
        optimizer.step()  
        optimizer.zero_grad()

        loss_train += loss.item()

        all_preds.extend(logits.detach().cpu().numpy())  
        all_targets.extend(y.cpu().numpy())  

    avg_loss = loss_train / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'TRAINING -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


# VALIDATION FUNCTION
def val_loop(dataloader, model, lossFunction):
    val_size = len(dataloader.dataset)
    nbatches = len(dataloader)

    model.eval()

    loss_val = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            logits = model(X)

            loss_val += lossFunction(logits, y).item()
            
            all_preds.extend(logits.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    avg_loss = loss_val / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'VALIDATION -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


In [62]:
for i in range(50): 
    print(f"Iteration {i+1}/50 \n-----------------------------")
    train_loop(train_loader, model, lossFunction, optimizer)
    val_loop(val_loader, model, lossFunction)

Iteration 1/50 
-----------------------------
TRAINING -> Loss: 0.112031, MSE: 0.226171, RMSE: 0.475574, R²: -3.810732
VALIDATION -> Loss: 0.029011, MSE: 0.058039, RMSE: 0.240912, R²: -0.230538
Iteration 2/50 
-----------------------------
TRAINING -> Loss: 0.040232, MSE: 0.080550, RMSE: 0.283813, R²: -0.713323
VALIDATION -> Loss: 0.016536, MSE: 0.033087, RMSE: 0.181899, R²: 0.298481
Iteration 3/50 
-----------------------------
TRAINING -> Loss: 0.035301, MSE: 0.070579, RMSE: 0.265667, R²: -0.501242
VALIDATION -> Loss: 0.015475, MSE: 0.030958, RMSE: 0.175948, R²: 0.343631
Iteration 4/50 
-----------------------------
TRAINING -> Loss: 0.033386, MSE: 0.066765, RMSE: 0.258390, R²: -0.420126
VALIDATION -> Loss: 0.014151, MSE: 0.028317, RMSE: 0.168276, R²: 0.399626
Iteration 5/50 
-----------------------------
TRAINING -> Loss: 0.031927, MSE: 0.063914, RMSE: 0.252812, R²: -0.359469
VALIDATION -> Loss: 0.013792, MSE: 0.027600, RMSE: 0.166131, R²: 0.414833
Iteration 6/50 
------------------

In [63]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import binarize
from scipy.stats import rankdata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x_train = x_train.to(device)
y_train = y_train.to(device)
model.to(device)
with torch.no_grad():
    y_train_pred = model(x_train)
y_train_np = y_train.cpu().numpy()
y_train_pred_np = y_train_pred.cpu().numpy()

# --- METRICS ---

# R^2 Score
ss_total = np.sum((y_train_np - np.mean(y_train_np)) ** 2)
ss_residual = np.sum((y_train_np - y_train_pred_np) ** 2)
r2_score = 1 - (ss_residual / ss_total) if ss_total != 0 else 0.0

# MAE
mae = np.mean(np.abs(y_train_np - y_train_pred_np))

# MSE
mse = np.mean((y_train_np - y_train_pred_np) ** 2)

# RMSE
rmse = np.sqrt(mse)

# ACCURACY AND RECALL
threshold = np.median(y_train_np)  

y_train_bin = binarize(y_train_np.reshape(-1, 1), threshold=threshold).flatten()
y_train_pred_bin = binarize(y_train_pred_np.reshape(-1, 1), threshold=threshold).flatten()

precision = precision_score(y_train_bin, y_train_pred_bin)
recall = recall_score(y_train_bin, y_train_pred_bin)

# NDCG 
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]  
    y_true_sorted = np.take(y_true, order[:k])
    
    gains = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(2, len(y_true_sorted) + 2))
    
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best_dcg = dcg_score(y_true, y_true, k)  #
    actual_dcg = dcg_score(y_true, y_score, k)
    
    return actual_dcg / best_dcg if best_dcg > 0 else 0

ndcg = ndcg_score(y_train_np, y_train_pred_np)

print(f"R^2 Score: {r2_score:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

R^2 Score: 0.4667
MAE: 0.1214
MSE: 0.0251
RMSE: 0.1583
Precisión: 0.6486
Recall: 0.9138
NDCG@10: 1.0000


# **FIFTH MODEL**
# Like the fourth one but using the 1M ratings dataset

In [76]:
imdb = pd.read_csv("ml-latest-small/movie_info_imdb.csv", usecols=lambda column: column != "origin_country")
imdb['imdbId'] = imdb['imdbId'].str.replace('tt', '', regex=False).astype(int)
columns_to_drop = ['Actors', 'Awards', 'DVD', 'Director', 'Genre', 'Title', 'Type', 'Website', 'Year', 'Poster', 'Production', 'Rated', 'Plot', 'Writer', 'Response', 'Ratings']
imdb = imdb.drop(columns=columns_to_drop)

imdb = imdb.astype({
    "BoxOffice": "string", 
    "Country": "string",
    "Language": "string",
    "Metascore": "float",
    "Released": "string",
    "Runtime": "string",
    "imdbRating": "float",
    "imdbVotes": "string"
})

imdb['BoxOffice'] = imdb['BoxOffice'].replace('[\$,]', '', regex=True).astype(float)
imdb['Released'] = imdb['Released'].str.extract(r'(\d{4})').astype(float)
imdb['Runtime'] = imdb['Runtime'].str.extract(r'(\d+)').astype(float)
imdb['imdbVotes'] = imdb['imdbVotes'].str.replace(',', '', regex=True).astype(float)

tmdb = pd.read_csv("ml-latest-small/movie_info_tmdb.csv", usecols=lambda column: column != "origin_country")
tmdb = tmdb.drop(columns=["title", "original_language"])
tmdb["release_date"] = pd.to_datetime(tmdb["release_date"], errors="coerce").dt.year

imdb = imdb.merge(links, on="imdbId", how="left")
imdb = imdb.merge(ratings, on="movieId", how="left")
df2 = imdb.merge(tmdb, on="tmdbId", how="left")

imdb = imdb.dropna()
print(imdb.columns)
print(imdb.describe)

Index(['imdbId', 'BoxOffice', 'Country', 'Language', 'Metascore', 'Released',
       'Runtime', 'imdbRating', 'imdbVotes', 'movieId', 'tmdbId', 'userId',
       'rating', 'timestamp'],
      dtype='object')
<bound method NDFrame.describe of          imdbId    BoxOffice                               Country  \
0        114709  223225679.0                         United States   
1        114709  223225679.0                         United States   
2        114709  223225679.0                         United States   
3        114709  223225679.0                         United States   
4        114709  223225679.0                         United States   
...         ...          ...                                   ...   
100799  4912910  220159104.0  United States, China, France, Norway   
100800  4912910  220159104.0  United States, China, France, Norway   
100801  7690670   20545116.0                         United States   
100803  7349662   49275340.0                  United States

In [77]:
df2 = imdb
df2 = pd.get_dummies(df2, columns=["Country"], dtype=float)

df2["Language"] = df2["Language"].str.split(",")
moviesExploded = df2.explode("Language")
moviesExploded["Language"] = moviesExploded["Language"].str.strip()
movies_dummies = pd.get_dummies(moviesExploded["Language"], dtype=int)
movies_dummies = moviesExploded[["movieId"]].join(movies_dummies).groupby("movieId").max()
df2 = df2.drop(columns=["Language"]).merge(movies_dummies, on="movieId")


columns_to_convert = [col for col in df2.columns]
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors="coerce")

df2 = df2.dropna()

print(df2.columns)
print(df2.head)

Index(['imdbId', 'BoxOffice', 'Metascore', 'Released', 'Runtime', 'imdbRating',
       'imdbVotes', 'movieId', 'tmdbId', 'userId',
       ...
       'Ungwatsi', 'Urdu', 'Vietnamese', 'Washoe', 'Welsh', 'Wolof', 'Xhosa',
       'Yiddish', 'Yoruba', 'Zulu'],
      dtype='object', length=1088)
<bound method NDFrame.head of         imdbId    BoxOffice  Metascore  Released  Runtime  imdbRating  \
0       114709  223225679.0       96.0    1995.0     81.0         8.3   
1       114709  223225679.0       96.0    1995.0     81.0         8.3   
2       114709  223225679.0       96.0    1995.0     81.0         8.3   
3       114709  223225679.0       96.0    1995.0     81.0         8.3   
4       114709  223225679.0       96.0    1995.0     81.0         8.3   
...        ...          ...        ...       ...      ...         ...   
91289  4912910  220159104.0       87.0    2018.0    147.0         7.7   
91290  4912910  220159104.0       87.0    2018.0    147.0         7.7   
91291  7690670   2054

In [78]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np

df_shuffle = df2.sample(frac=1, random_state=123).drop(columns=["timestamp"])
df_shuffle = df_shuffle.dropna()

df_train = df_shuffle.iloc[:int(len(df_shuffle) * 0.6), :]
df_val = df_shuffle.iloc[int(len(df_shuffle) * 0.6):int(len(df_shuffle) * 0.8), :]
df_test = df_shuffle.iloc[int(len(df_shuffle) * 0.8):, :]

scalers = {}

feature_cols = [col for col in df_shuffle.columns]
x_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_val, y_val = df_val[feature_cols].to_numpy(dtype=np.float32), df_val["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)
x_test, y_test = df_test[feature_cols].to_numpy(dtype=np.float32), df_test["rating"].to_numpy(dtype=np.float32).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

x_train, y_train = torch.tensor(x_train), torch.tensor(y_train).float()
x_val, y_val = torch.tensor(x_val), torch.tensor(y_val).float()
x_test, y_test = torch.tensor(x_test), torch.tensor(y_test).float()

(54776, 1087) (54776, 1)
(18259, 1087) (18259, 1)
(18259, 1087) (18259, 1)


In [79]:
import pandas as pd

print(df2.columns)

avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_train = df_train.merge(avg_movie_rating, on="movieId", how="left")
df_train = df_train.merge(count_movie_rating, on="movieId", how="left")
df_train = df_train.merge(std_movie_rating, on="movieId", how="left")

df_train = df_train.merge(avg_user_rating, on="userId", how="left")
df_train = df_train.merge(count_user_rating, on="userId", how="left")
df_train = df_train.merge(std_user_rating, on="userId", how="left")

df_train["count_movie_rating"] = df_train["count_movie_rating"].fillna(0)
df_train["count_user_rating"] = df_train["count_user_rating"].fillna(0)
df_train["avg_movie_rating"] = df_train["avg_movie_rating"].fillna(0)
df_train["avg_user_rating"] = df_train["avg_user_rating"].fillna(0)
df_train["std_movie_rating"] = df_train["std_movie_rating"].fillna(0)
df_train["std_user_rating"] = df_train["std_user_rating"].fillna(0)



avg_movie_rating = df_val.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_val.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_val.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_val.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_val.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_val.groupby("userId")["rating"].std().rename("std_user_rating")

df_val = df_val.merge(avg_movie_rating, on="movieId", how="left")
df_val = df_val.merge(count_movie_rating, on="movieId", how="left")
df_val = df_val.merge(std_movie_rating, on="movieId", how="left")

df_val = df_val.merge(avg_user_rating, on="userId", how="left")
df_val = df_val.merge(count_user_rating, on="userId", how="left")
df_val = df_val.merge(std_user_rating, on="userId", how="left")

df_val["count_movie_rating"] = df_val["count_movie_rating"].fillna(0)
df_val["count_user_rating"] = df_val["count_user_rating"].fillna(0)
df_val["avg_movie_rating"] = df_val["avg_movie_rating"].fillna(0)
df_val["avg_user_rating"] = df_val["avg_user_rating"].fillna(0)
df_val["std_movie_rating"] = df_val["std_movie_rating"].fillna(0)
df_val["std_user_rating"] = df_val["std_user_rating"].fillna(0)



avg_movie_rating = df_train.groupby("movieId")["rating"].mean().rename("avg_movie_rating")
count_movie_rating = df_train.groupby("movieId")["rating"].count().rename("count_movie_rating")
std_movie_rating = df_train.groupby("movieId")["rating"].std().rename("std_movie_rating")

avg_user_rating = df_train.groupby("userId")["rating"].mean().rename("avg_user_rating")
count_user_rating = df_train.groupby("userId")["rating"].count().rename("count_user_rating")
std_user_rating = df_train.groupby("userId")["rating"].std().rename("std_user_rating")

df_test = df_test.merge(avg_movie_rating, on="movieId", how="left")
df_test = df_test.merge(count_movie_rating, on="movieId", how="left")
df_test = df_test.merge(std_movie_rating, on="movieId", how="left")

df_test = df_test.merge(avg_user_rating, on="userId", how="left")
df_test = df_test.merge(count_user_rating, on="userId", how="left")
df_test = df_test.merge(std_user_rating, on="userId", how="left")

df_test["count_movie_rating"] = df_test["count_movie_rating"].fillna(0)
df_test["count_user_rating"] = df_test["count_user_rating"].fillna(0)
df_test["avg_movie_rating"] = df_test["avg_movie_rating"].fillna(0)
df_test["avg_user_rating"] = df_test["avg_user_rating"].fillna(0)
df_test["std_movie_rating"] = df_test["std_movie_rating"].fillna(0)
df_test["std_user_rating"] = df_test["std_user_rating"].fillna(0)


print(np.isnan(df_train).any())
print(df2.head())

Index(['imdbId', 'BoxOffice', 'Metascore', 'Released', 'Runtime', 'imdbRating',
       'imdbVotes', 'movieId', 'tmdbId', 'userId',
       ...
       'Ungwatsi', 'Urdu', 'Vietnamese', 'Washoe', 'Welsh', 'Wolof', 'Xhosa',
       'Yiddish', 'Yoruba', 'Zulu'],
      dtype='object', length=1088)
imdbId                False
BoxOffice             False
Metascore             False
Released              False
Runtime               False
                      ...  
count_movie_rating    False
std_movie_rating      False
avg_user_rating       False
count_user_rating     False
std_user_rating       False
Length: 1093, dtype: bool
   imdbId    BoxOffice  Metascore  Released  Runtime  imdbRating  imdbVotes  \
0  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
1  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
2  114709  223225679.0       96.0    1995.0     81.0         8.3  1112586.0   
3  114709  223225679.0       96.0    1995.0     81.0      

In [80]:
scaler = MinMaxScaler()

numerical_col = df_train.select_dtypes(include=['number']).columns

df_train[numerical_col] = scaler.fit_transform(df_train[numerical_col])
df_val[numerical_col] = scaler.transform(df_val[numerical_col])  # Usar transform en validación
df_test[numerical_col] = scaler.transform(df_test[numerical_col]) 

print(df_train.shape)

(54776, 1093)


In [81]:
import torch
from torch.utils.data import TensorDataset, DataLoader

feature_cols = [col for col in df_train.columns if col != "rating"]

x_train = torch.tensor(df_train[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(df_train["rating"].values, dtype=torch.float32).unsqueeze(1)

x_val = torch.tensor(df_val[feature_cols].values, dtype=torch.float32)
y_val = torch.tensor(df_val["rating"].values, dtype=torch.float32).unsqueeze(1)

x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float32)
y_test = torch.tensor(df_test["rating"].values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

In [82]:
import torch.nn as nn
import torch.optim as optim
import torch

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(1092, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.model(x)
device = "cuda" 
model = NeuralNetwork().to(device)  #To change to the GPU

lossFunction = torch.nn.HuberLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)

In [83]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# TRAINING FUNCTION
def train_loop(dataloader, model, lossFunction, optimizer):
    train_size = len(dataloader.dataset)    
    nbatches = len(dataloader)  

    model.train()
    loss_train = 0  
    all_preds = []
    all_targets = []

    for nbatch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        
        loss = lossFunction(logits, y)
        loss.backward()   
        optimizer.step()  
        optimizer.zero_grad()

        loss_train += loss.item()

        all_preds.extend(logits.detach().cpu().numpy())  
        all_targets.extend(y.cpu().numpy())  


    avg_loss = loss_train / nbatches

    
    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'TRAINING -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')


# VALIDATION FUNCTION
def val_loop(dataloader, model, lossFunction):
    val_size = len(dataloader.dataset)
    nbatches = len(dataloader)

    model.eval()

    loss_val = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            logits = model(X)

            loss_val += lossFunction(logits, y).item()
            
            all_preds.extend(logits.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

    avg_loss = loss_val / nbatches

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f'VALIDATION -> Loss: {avg_loss:.6f}, MSE: {mse:.6f}, RMSE: {rmse:.6f}, R²: {r2:.6f}')

In [84]:
for i in range(50): 
    print(f"Iteration {i+1}/50 \n-----------------------------")
    train_loop(train_loader, model, lossFunction, optimizer)
    val_loop(val_loader, model, lossFunction)

Iteration 1/50 
-----------------------------
TRAINING -> Loss: 0.051368, MSE: 0.103088, RMSE: 0.321073, R²: -1.192717
VALIDATION -> Loss: 0.018047, MSE: 0.036114, RMSE: 0.190037, R²: 0.234312
Iteration 2/50 
-----------------------------
TRAINING -> Loss: 0.027441, MSE: 0.054928, RMSE: 0.234367, R²: -0.168337
VALIDATION -> Loss: 0.016460, MSE: 0.032939, RMSE: 0.181491, R²: 0.301628
Iteration 3/50 
-----------------------------
TRAINING -> Loss: 0.024306, MSE: 0.048671, RMSE: 0.220615, R²: -0.035254
VALIDATION -> Loss: 0.015325, MSE: 0.030668, RMSE: 0.175122, R²: 0.349782
Iteration 4/50 
-----------------------------
TRAINING -> Loss: 0.022778, MSE: 0.045515, RMSE: 0.213342, R²: 0.031882
VALIDATION -> Loss: 0.014125, MSE: 0.028265, RMSE: 0.168123, R²: 0.400714
Iteration 5/50 
-----------------------------
TRAINING -> Loss: 0.022042, MSE: 0.044118, RMSE: 0.210042, R²: 0.061603
VALIDATION -> Loss: 0.013755, MSE: 0.027521, RMSE: 0.165896, R²: 0.416491
Iteration 6/50 
---------------------

In [85]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import binarize
from scipy.stats import rankdata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x_train = x_train.to(device)
y_train = y_train.to(device)
model.to(device)
with torch.no_grad():
    y_train_pred = model(x_train)
y_train_np = y_train.cpu().numpy()
y_train_pred_np = y_train_pred.cpu().numpy()

# --- METRICS ---

# R^2 Score
ss_total = np.sum((y_train_np - np.mean(y_train_np)) ** 2)
ss_residual = np.sum((y_train_np - y_train_pred_np) ** 2)
r2_score = 1 - (ss_residual / ss_total) if ss_total != 0 else 0.0

# MAE
mae = np.mean(np.abs(y_train_np - y_train_pred_np))

# MSE
mse = np.mean((y_train_np - y_train_pred_np) ** 2)

# RMSE
rmse = np.sqrt(mse)

# ACCURACY AND RECALL
threshold = np.median(y_train_np)  

y_train_bin = binarize(y_train_np.reshape(-1, 1), threshold=threshold).flatten()
y_train_pred_bin = binarize(y_train_pred_np.reshape(-1, 1), threshold=threshold).flatten()

precision = precision_score(y_train_bin, y_train_pred_bin)
recall = recall_score(y_train_bin, y_train_pred_bin)

# NDCG 
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]  
    y_true_sorted = np.take(y_true, order[:k])
    
    gains = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(2, len(y_true_sorted) + 2))
    
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best_dcg = dcg_score(y_true, y_true, k)  #
    actual_dcg = dcg_score(y_true, y_score, k)
    
    return actual_dcg / best_dcg if best_dcg > 0 else 0

ndcg = ndcg_score(y_train_np, y_train_pred_np)

print(f"R^2 Score: {r2_score:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

R^2 Score: 0.4953
MAE: 0.1158
MSE: 0.0237
RMSE: 0.1540
Precisión: 0.6180
Recall: 0.9458
NDCG@10: 1.0000
