In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import copy
import time
import os
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from itertools import product
from torch.utils.data import TensorDataset, DataLoader
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
#import dataset
df = pd.read_csv('dataset-ml-25m/dataset.csv')

In [4]:
# Dividi il dataset in feature e target
X = df.drop(['rating'], axis=1).to_numpy()
y = df['rating'].to_numpy()

# Dividi il dataset in training, validation e test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)


#count the numebr of x_train 
print("Number of train set: ", X_train.shape[0])
print("Numebr of test set: ", X_test.shape[0])
print("Number of validation set: ", X_val.shape[0])



Number of train set:  9946
Numebr of test set:  2764
Number of validation set:  1106


In [5]:
# Converti i dati in tensori PyTorch
X_train = np.array(X_train, dtype=np.float32)
X_val = np.array(X_val, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)

y_train = np.array(y_train, dtype=np.float32)
y_val = np.array(y_val, dtype=np.float32)
y_test = np.array(y_test, dtype=np.float32) 

val_dataloader = DataLoader(TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val)), batch_size=y_val.shape[0])
test_dataloader = DataLoader(TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test)), batch_size=y_test.shape[0])


In [6]:
def get_model(input_size, dept=3, hidden_size=64, dropout_prob=0.2):
    model = [ nn.Linear(input_size, hidden_size), nn.ReLU() ]
    for i in range(dept):
        model.append(nn.Linear(hidden_size, hidden_size))
        model.append(nn.ReLU())
        model.append(nn.Dropout(dropout_prob))
    model.append(nn.Linear(hidden_size, 1))
    return nn.Sequential(*model)

In [7]:
#hyperparameters
hidden_size = [128, 256, 512]
dropout_prob = [0.2, 0.3, 0.4]
dept = [3, 4, 5]
epochs = 200
batch_size = [8,16,32]
learning_rate = [0.001, 0.01]

#itertools 
params = product(hidden_size, dropout_prob, dept, batch_size, learning_rate)
combinations = len(hidden_size)*len(dropout_prob)*len(dept)*len(batch_size)*len(learning_rate)
print("Number of combinations: ", combinations)


Number of combinations:  162


In [8]:
def train(model, writer, train_dataloader, val_dataloader, device, hidden_size=3, dropout_prob=0.2, dept=2, epochs=100, batch_size=32, learning_rate=.001):
    # Definisci la loss function e l'ottimizzatore
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Definisci la lista di loss
    train_loss = []
    val_loss = []

    # early stopping
    best_model = None
    best_loss = np.inf
    patience = 10
    patience_counter = 0

    # Ciclo di training
    for epoch in range(epochs):
        # Ciclo di training
        
        epoch_start = time.time()
        epoch_loss = 0

        for x, y in train_dataloader:
            x, y = x.to(device), y.to(device)
            
            # Resetta i gradienti
            optimizer.zero_grad()
            # Fai la forward pass
            y_pred = model(x)
            # Calcola la loss
            loss = criterion(y_pred, y.unsqueeze(1))
            # Fai la backward pass
            loss.backward()
            # Aggiorna i pesi
            optimizer.step()
            # Aggiorna la loss
            epoch_loss += loss.item()

        # Salva la loss media
        train_loss.append(epoch_loss / len(train_dataloader))

        # Calcola la loss sul validation set
        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for x, y in val_dataloader:
                x, y = x.to(device), y.to(device)
                y_pred = model(x)
                loss = criterion(y_pred, y.unsqueeze(1))
                epoch_val_loss += loss.item()
        val_loss.append(epoch_val_loss / len(val_dataloader))

        # Salva i risultati su TensorBoard
        writer.add_scalar('Loss/train', train_loss[-1], epoch)
        writer.add_scalar('Loss/val', val_loss[-1], epoch)

        print(f'Epoch {epoch+1}/{epochs}, Train loss: {train_loss[-1]:.4f}, Val loss: {val_loss[-1]:.4f}, Time: {time.time()-epoch_start:.2f}s')

        # early stopping
        if val_loss[-1] < best_loss:
            best_loss = val_loss[-1]
            best_model = copy.deepcopy(model)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter == patience:
                break

    print("Training in {} epochs with best val loss: {}".format(epoch+1, best_loss))


    return best_model, train_loss, val_loss

In [9]:
def test_model(model, test_dataloader, device):
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for x, y in test_dataloader:
            x , y = x.to(device), y.to(device)
            y_pred.extend(model(x).squeeze(1).tolist())
            y_true.extend(y.tolist())
    return y_pred, y_true

In [11]:
#train model
best_model = None
best_loss = np.inf
best_config = None
iter = 0 

for hidden_size, dropout_prob, dept, batch_size, learning_rate in params:
    iter += 1
    print(f'\nIteration {iter}/{combinations}')
    print(f'hidden_size: {hidden_size}, dropout_prob: {dropout_prob}, dept: {dept}, batch_size: {batch_size}, learning_rate: {learning_rate}')

    log = f'hidden_size_{hidden_size}_dropout_prob_{dropout_prob}_dept_{dept}_batch_size_{batch_size}_learning_rate_{learning_rate}'

    if os.path.exists("results/nn/no_pca/"+log):
        print("Model already trained. Skipping...")
        continue

    writer = SummaryWriter("results/nn/no_pca/"+log)

    model = get_model(X_train.shape[1], dept=dept, hidden_size=hidden_size, dropout_prob=dropout_prob)
    
    train_dataloader = DataLoader(TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)), batch_size=batch_size, shuffle=True)

    config = {
        'hidden_size': hidden_size,
        'dropout_prob': dropout_prob,
        'dept': dept,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    model, train_loss, val_loss = train(model, writer, train_dataloader, val_dataloader, device, **config)

    # test model 
    y_pred, y_true = test_model(model, test_dataloader, device)
    test_loss = mean_squared_error(y_true, y_pred)
    print(f'Test loss: {test_loss:.4f} - Best Test loss: {best_loss:.4f}')

    # Salva i risultati su TensorBoard
    writer.add_hparams(config, {'hparam/loss': test_loss})
    writer.flush()

    #early stopping
    if test_loss < best_loss:
        best_loss = test_loss
        best_model = copy.deepcopy(model)
        best_config = config    

    writer.close()



Iteration 1/162
hidden_size: 128, dropout_prob: 0.2, dept: 3, batch_size: 8, learning_rate: 0.01
Epoch 1/100, Train loss: 0.2992, Val loss: 0.0263, Time: 6.61s
Epoch 2/100, Train loss: 0.0162, Val loss: 0.0131, Time: 7.28s
Epoch 3/100, Train loss: 0.0145, Val loss: 0.0136, Time: 7.27s
Epoch 4/100, Train loss: 0.0141, Val loss: 0.0180, Time: 6.99s
Epoch 5/100, Train loss: 0.0150, Val loss: 0.0080, Time: 6.82s
Epoch 6/100, Train loss: 0.0113, Val loss: 0.0160, Time: 6.98s
Epoch 7/100, Train loss: 0.0126, Val loss: 0.0087, Time: 6.69s
Epoch 8/100, Train loss: 0.0132, Val loss: 0.0115, Time: 6.73s
Epoch 9/100, Train loss: 0.0123, Val loss: 0.0095, Time: 6.77s
Epoch 10/100, Train loss: 0.0095, Val loss: 0.0085, Time: 6.93s
Epoch 11/100, Train loss: 0.0100, Val loss: 0.0107, Time: 6.77s
Epoch 12/100, Train loss: 0.0114, Val loss: 0.0097, Time: 6.73s
Epoch 13/100, Train loss: 0.0095, Val loss: 0.0080, Time: 6.79s
Epoch 14/100, Train loss: 0.0093, Val loss: 0.0122, Time: 7.06s
Epoch 15/100, T

In [12]:
print(f'Best config: {best_config}')
print(f'Best loss: {best_loss}')



Best config: {'hidden_size': 256, 'dropout_prob': 0.4, 'dept': 5, 'batch_size': 8, 'learning_rate': 0.001}
Best loss: 0.0050187974377254385


In [13]:
y_pred, y_true = test_model(best_model, test_dataloader, device)
r2 = r2_score(y_true, y_pred)
print(f'R2 score: {r2}')


R2 score: 0.9773587499655676
