In [32]:
#import
import os
import copy
import time
import torch
import random
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from itertools import product
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter


In [33]:
def fix_random(seed):
    torch.manual_seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
fix_random(42)

In [71]:
pca_t = False

In [72]:
device= torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [35]:
df= pd.read_csv('ml-25m/dataset.csv')

Data Preprocessing

In [73]:
X= df.drop(['rating'], axis=1).to_numpy()
y= df['rating'].to_numpy()

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Number of tran set:", X_train.shape[0])
print("Number of val set:", X_val.shape[0])
print("Number of test set:", X_test.shape[0])

Number of tran set: 8841
Number of val set: 2211
Number of test set: 2764


In [74]:
if pca_t == True:
    pca= PCA(n_components= 0.95)
    pca.fit(X_train)
    X_train= pca.transform(X_train)
    X_val= pca.transform(X_val)
    X_test= pca.transform(X_test)
    print("PCA is applied")
else:
    print("PCA is not applied")

PCA is not applied


In [75]:
X_train= np.array(X_train, dtype=np.float32)
X_val= np.array(X_val, dtype=np.float32)
X_test= np.array(X_test, dtype=np.float32)

y_train= np.array(y_train, dtype=np.float32)
y_val= np.array(y_val, dtype=np.float32)
y_test= np.array(y_test, dtype=np.float32)

val_dataloader= DataLoader(TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val)), batch_size= y_val.shape[0])
test_dataloader= DataLoader(TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test)), batch_size= y_test.shape[0])

In [76]:
def get_model(input_size, dept= 3, hidden_size= 64, dropout_prob= 0.2):
    model= [nn.Linear(input_size, hidden_size), nn.ReLU()]
    for i in range(dept):
        model.append(nn.Linear(hidden_size, hidden_size))
        model.append(nn.ReLU())
        model.append(nn.Dropout(dropout_prob))
    model.append(nn.Linear(hidden_size, 1))
    return nn.Sequential(*model)

In [77]:
#hyperparameters
hidden_size= [64, 128, 256, 512]
dropout_prob= [0.2, 0.3]
dept= [3, 4, 5]
epochs= [200]
batch_size= [8, 16, 32]
lr= [0.001, 0.01]

# itertools
params = product(hidden_size, dropout_prob, dept, batch_size, lr)
combinations = len(hidden_size) * len(dropout_prob) * len(dept) * len(batch_size) * len(lr)
print("Number of combinations:", combinations)


Number of combinations: 144


In [78]:
def train(model, writer, train_dataloader, val_dataloader, device, epochs= 100, hidden_size= 3, dropout_prob= 0.2, dept= 2, batch_size= 32, lr= 0.001):
    best_model= None
    best_loss= np.inf
    patience= 10
    p_counter= 0

    train_loss= []
    val_loss= []

    criterion= nn.MSELoss()
    optimizer= optim.Adam(model.parameters(), lr= lr)

    for epoch in epochs:
        epoch_start= time.time()
        epoch_loss= 0.0

        for x, y in train_dataloader:
            x= x.to(device)
            y= y.to(device)

            optimizer.zero_grad()
            y_pred= model(x)
            loss= criterion(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            epoch_loss+= loss.item()

        train_loss.append(epoch_loss/len(train_dataloader))

        model.eval()
        epoch_val_loss= 0.0
        with torch.no_grad():
            for x, y in val_dataloader:
                x= x.to(device)
                y= y.to(device)

                y_pred= model(x)
                loss= criterion(y_pred, y.unsqueeze(1))
                epoch_val_loss+= loss.item()
        val_loss.append(epoch_val_loss/len(val_dataloader))

        writer.add_scalar('Loss/train', train_loss[-1], epoch)
        writer.add_scalar('Loss/val', val_loss[-1], epoch)

        print(f'Epoch {epoch+1}/{epochs}, Loss: {train_loss[-1]:.4f}, Val Loss: {val_loss[-1]:.4f}, Time: {time.time()-epoch_start:.2f}s')

        if val_loss[-1] < best_loss:
            best_loss= val_loss[-1]
            best_model= copy.deepcopy(model)
            p_counter= 0
        else:
            p_counter+= 1
            if patience == p_counter:
                break
    
    print('Trauing in {} epochs with best val loss:{}'. format(len(train_loss), best_loss))

    return best_model, train_loss, val_loss

In [79]:
def test_model(model, test_dataloader, device):
    model.eval()
    y_pred= []
    y_true= []
    with torch.no_grad():
        for x, y in test_dataloader:
            x= x.to(device)
            y= y.to(device)

            y_pred.extend(model(x).squeeze(1).tolist())
            y_true.extend(y.tolist())
    return y_true, y_pred

In [80]:
best_model= None
best_loss= np.inf
best_combination= None
iteration= 0

for hidden_size, dropout_prob, dept, batch_size, lr in params:
    iteration+= 1
    print(f'Iteration {iteration}/{combinations}')
    print(f'hidden_size: {hidden_size}, dropout_prob: {dropout_prob}, dept: {dept}, batch_size: {batch_size}, lr: {lr}')

    log= f'hidden_size: {hidden_size}, dropout_prob: {dropout_prob}, dept: {dept}, batch_size: {batch_size}, lr: {lr}'

    if pca_t == True:
        if os.path.exists('risultati/nn/pca'+log):
            print('Model alredy trained. Skipping...')
            continue
        writer= SummaryWriter('risultati/nn/pca/'+log)
    else:
        if os.path.exists('risultati/nn/no_pca/'+log):
            print('Model alredy trained. Skipping...')
            continue
        writer= SummaryWriter('risultati/nn/no_pca'+log)

    model= get_model(X_train.shape[1], dept= dept, hidden_size= hidden_size, dropout_prob= dropout_prob)

    train_loader = DataLoader(TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)), batch_size= batch_size, shuffle= True)

    config= {'hidden_size': hidden_size, 
             'dropout_prob': dropout_prob, 
             'dept': dept, 
             'batch_size': batch_size, 
             'lr': lr}
    
    model, train_loss, val_loss = train(model, writer, train_loader, val_dataloader, device, epochs, **config)

    y_pred, y_true= test_model(model, test_dataloader, device)
    test_loss= mean_squared_error(y_true, y_pred)
    print('Test loss: {test_loss:.4f} - Best loss: {best_loss:.4f}')

    writer.add_hparams(config, {'hparam/test_loss': test_loss})
    writer.flush()

    if test_loss < best_loss:
        best_loss= test_loss
        best_model= model
        best_combination= config
    
    writer.close()


Iteration 1/144
hidden_size: 64, dropout_prob: 0.2, dept: 3, batch_size: 8, lr: 0.001


Epoch 201/[200], Loss: 0.4582, Val Loss: 0.0784, Time: 6.85s
Trauing in 1 epochs with best val loss:0.07842180132865906
Test loss: {test_loss:.4f} - Best loss: {best_loss:.4f}
Iteration 2/144
hidden_size: 64, dropout_prob: 0.2, dept: 3, batch_size: 8, lr: 0.01
Epoch 201/[200], Loss: 0.2423, Val Loss: 0.0320, Time: 4.05s
Trauing in 1 epochs with best val loss:0.03195612132549286
Test loss: {test_loss:.4f} - Best loss: {best_loss:.4f}
Iteration 3/144
hidden_size: 64, dropout_prob: 0.2, dept: 3, batch_size: 16, lr: 0.001
Epoch 201/[200], Loss: 0.7549, Val Loss: 0.0764, Time: 1.94s
Trauing in 1 epochs with best val loss:0.07644930481910706
Test loss: {test_loss:.4f} - Best loss: {best_loss:.4f}
Iteration 4/144
hidden_size: 64, dropout_prob: 0.2, dept: 3, batch_size: 16, lr: 0.01
Epoch 201/[200], Loss: 0.6060, Val Loss: 0.0875, Time: 2.48s
Trauing in 1 epochs with best val loss:0.08751649409532547
Test loss: {test_loss:.4f} - Best loss: {best_loss:.4f}
Iteration 5/144
hidden_size: 64, dropo

In [81]:
print('Best combination:', best_combination)
print('Best loss:', best_loss)

Best combination: {'hidden_size': 512, 'dropout_prob': 0.3, 'dept': 4, 'batch_size': 16, 'lr': 0.001}
Best loss: 0.01622953643687759


In [82]:
y_pred, y_true= test_model(best_model, test_dataloader, device)

mse= mean_squared_error(y_true, y_pred)
r2= r2_score(y_true, y_pred)

print('MSE:', mse)
print('R2:', r2)

MSE: 0.01622953643687759
R2: 0.9010470547932781
