In [1]:
!wget -N -P dataset https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/train_data.csv
!wget -N -P dataset https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/test_data.csv

--2022-04-24 19:32:39--  https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/train_data.csv
Resolving gitlab.dei.unipd.it (gitlab.dei.unipd.it)... 147.162.2.85
Connecting to gitlab.dei.unipd.it (gitlab.dei.unipd.it)|147.162.2.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3776 (3,7K) [text/plain]
Saving to: ‘dataset/train_data.csv’


Last-modified header missing -- time-stamps turned off.
2022-04-24 19:32:40 (98,6 MB/s) - ‘dataset/train_data.csv’ saved [3776/3776]

--2022-04-24 19:32:41--  https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/test_data.csv
Resolving gitlab.dei.unipd.it (gitlab.dei.unipd.it)... 147.162.2.85
Connecting to gitlab.dei.unipd.it (gitlab.dei.unipd.it)|147.162.2.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3788 (3,7K) [text/plain]
Saving to: ‘dataset/test_data.csv’


Last-modified header missing -- time-stamps turned off.
2022-04-24 19:

In [1]:
%load_ext autoreload
%autoreload 2

from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from nn_tools import *
import random
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import optuna
pd.options.plotting.backend = "plotly"


device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Training device in use: {device}")

seed = 2009440

random.seed(seed)
torch.manual_seed(seed)

Training device in use: cuda


<torch._C.Generator at 0x7f3fb7f2b9b0>

In [2]:
reg_train = pd.read_csv("./dataset/train_data.csv")
reg_test = pd.read_csv("./dataset/test_data.csv")

reg_layout = go.Layout(
    width=700, height=500,
    font={'color': '#000000', 'size': 14}, title='Regression Data',
    xaxis={'zerolinewidth': 1, 'zerolinecolor': 'white', },
    yaxis={'zerolinewidth': 1, 'zerolinecolor': 'white', }, legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01), margin=dict(l=20, r=20, t=50, b=20)
)
plot_fig = go.Figure(layout=reg_layout)

for df, label in zip([reg_train, reg_test], ["Training data", "Test data"]):
    plot_fig.add_trace(go.Scatter(
        x=df.input, y=df.label, name=label, mode='markers'))
plot_fig.show()

The training data is noisier. This means that overfitting will be extremely obvious in case it does happen. Not only that, but the gaps are large enough that they are the regions where minimizing loss will be crucial. If the model overfits, these regions will be the ones with larger loss.

### Preparing data for PyTorch (i.e. converting to tensors)

I modified the ```CsvDataset``` class to admit pandas dataframes directly (as long as they have the format ```['input'], ['label']```), which gives me more flexibility on wrangling the data beforehand, and not having to save the cleaned up version in a new CSV file.

In [3]:
composed_transform = transforms.Compose([ToTensor()])

train_dataset = pd_dataset(reg_train, transform=composed_transform)
test_dataset = pd_dataset(reg_test, transform=composed_transform)

train_dataloader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, num_workers=0)

test_dataloader = DataLoader(test_dataset, batch_size=len(
    test_dataset), shuffle=False, num_workers=0)


### Model class definition, and subsequent instantiation.

**Note:** I included the training loop as a class method (in the style of Keras/sklearn), in order to simplify the code for the grid search of hyper-parameters.

The neural network class is defined in ```nn_tools.py```

In [4]:
def objective(trial):
    params = {
        "N_h1" : trial.suggest_int("N_h1", 80, 150),
        "N_h2" : trial.suggest_int("N_h2", 160, 250),
        "N_h3" : trial.suggest_int("N_h3", 25, 100),
        "activation" : "Sigmoid", #trial.suggest_categorical("activation", ["ReLU", "Sigmoid", "LeakyReLU"]),
        "dropout" : trial.suggest_uniform("Dropout", 0.0, 0.7),
    }
    # Define the model
    model = reg_model(1,params, 1, device)
    model.to(device)
    # Define the loss function
    loss_fn = nn.MSELoss()
    
    # Define the optimizer
    optim = getattr(
        torch.optim, trial.suggest_categorical("optim", ["Adam", "SGD", "AdamW"])
            )(model.parameters(),
               lr=trial.suggest_loguniform("lr", 1e-5, 1e-2)
            )
    
    # Train and evaluate model
    val_loss = model.train_model(
        train_dataloader, test_dataloader,
        num_epochs=200, loss_fn=loss_fn, optimizer=optim, verbose=False
        )
    
    return val_loss



In [5]:
# Given that we are attempting 100 different models,
# We're setting verbosity to the lowest level.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna uses TPESampler to sample the parameters.
# This is a bayesian method that compromises between
# a grid search and a random search.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

for item in study.best_params:
    print(f"{item} : {study.best_params[item]}")

In [77]:
best_params = dict()
best_params['N_h1'] = study.best_params['N_h1']
best_params['N_h2'] = study.best_params['N_h2']
best_params['N_h3'] = study.best_params['N_h3']
best_params['activation'] = "LeakyReLU" #study.best_params['activation']
best_params['dropout'] = study.best_params['Dropout']

regression = reg_model(1, best_params, 1, device)


regression.to(device)

loss_fn = nn.MSELoss()
optimizer = getattr(
    torch.optim, study.best_params['optim'])(
        regression.parameters(), lr=study.best_params['lr']*10, weight_decay=1e-5)
num_epochs = 400


In [78]:
best_model_loss = regression.cross_validate(
    train_dataloader, test_dataloader, num_epochs, loss_fn, optimizer, k=5)

Fold 1/5...	Done.
Fold 2/5...	Done.
Fold 3/5...	Done.
Fold 4/5...	Done.
Fold 5/5...	Done.


In [70]:
regression.fold_loss

array([ 9.24412918,  9.13479996, 10.1115942 , 10.50296116, 10.73175144])

### Model training loop

In [71]:
regression.train_model(train_dataloader, test_dataloader,
                       num_epochs, loss_fn, optimizer)

Epoch: 400 >>> Training loss: 0.40018 | Validation loss: 0.22272

0.22271982

In [72]:
reg_df = pd.DataFrame(regression.history).melt(
    id_vars='epoch', var_name='loss_type', value_name='loss')

fig = px.line(reg_df, x='epoch', y='loss',
              color='loss_type', title='Training timeline', width=800, height=400)
fig.update_layout(paper_bgcolor='#FFFFFF', font={'color': 'black', 'size': 16}, legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.99
), margin=dict(l=20, r=20, t=50, b=20))


In [73]:
# Input vector
x_vec = torch.linspace(-5, 5, 100000)
x_vec = x_vec.to(device)
x_vec = x_vec.unsqueeze(-1)  # Adding a dimension to the input vector

# Network output
# eval() acts as switch for some specific layers/parts of the model that behave
# differently during training and inference (eval) time. For example, Dropout
# BatchNorm etc.
regression.eval()
with torch.no_grad():  # turn off gradients computation
    y_vec = regression(x_vec)

# Convert x_vec and y_vec to numpy one dimensional arrays
x_vec = x_vec.squeeze().cpu().numpy()
y_vec = y_vec.squeeze().cpu().numpy()


model_fig = go.Figure(layout=reg_layout)

for df, label in zip([reg_train, reg_test], ["Training data", "Test data"]):
    model_fig.add_trace(go.Scatter(
        x=df.input, y=df.label, name=label, mode='markers'))

model_fig.add_trace(
    go.Scatter(x=x_vec, y=y_vec, name='Model fit', mode='lines', line=dict(color='green')))

# model_fig.update_layout(title=f"Model output \n <br><sup>Epoch {num_epochs};\
#        validation loss = {regression.history['valid'][-1]:.3f}</sup>",
#        legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
#        margin=dict(l=20, r=20, t=50, b=20))
model_fig.show()

# Grid (random) search and K-Fold cross validation.

I write the grid-search process myself, varying the number of neurons in each layer (but keeping the structure intact - always 3 hidden layers).

All in all, I expect only the change in the number of neurons to help reduce the loss minimum. For that, I will sample the number of neurons for each hidden layer from a discrete uniform random variable $\mathcal{U}(1,200)$

The loss will, understandably, be kept constant, to ensure comparability.

The K-Fold cross validation uses the ```sklearn.model_selection.KFold``` method.

In [12]:
hyperparams = dict()
hyperparams['neurons'] = range(1, 300)
hyperparams['optimizer'] = ["Adam", "AdamW", "SGD"]
hyperparams['lr'] = [0.1, 0.01, 0.001]


In [13]:
params_df = reg_grid_search(
    train_dataloader, val_dataloader, hyperparams, 5, device, num_epochs=500)
params_df.head()

Network initialized
Network initialized
Network initialized
Network initialized
Network initialized
Best set of parameters in the grid:
    h1   h2   h3 optimizer  learning_rate  final_training_loss  final_val_loss
5  110  175  254     AdamW          0.001             0.250201        0.369841


Unnamed: 0,h1,h2,h3,optimizer,learning_rate,final_training_loss,final_val_loss
0,110,175,254,Adam,0.1,6.464693,8.215092
1,110,175,254,Adam,0.01,0.964706,2.77647
2,110,175,254,Adam,0.001,0.214311,0.42998
3,110,175,254,AdamW,0.1,12.157694,16.082638
4,110,175,254,AdamW,0.01,0.428292,0.884187


In [14]:
best_params = params_df[params_df['final_val_loss']
                        == params_df['final_val_loss'].min()]

best_row = best_params.iloc[0]
hidden_layers = (best_row.h1, best_row.h2, best_row.h3)
best_opt = getattr(optim, best_row.optimizer)
best_lr = best_row.learning_rate

best_params

Unnamed: 0,h1,h2,h3,optimizer,learning_rate,final_training_loss,final_val_loss
5,110,175,254,AdamW,0.001,0.250201,0.369841


In [15]:
(110, 175, 254, 'AdamW', 0.001)

best_model = reg_model(1, *hidden_layers, 1, device)
best_model.to(device)

best_opt = best_opt(best_model.parameters(), lr=best_lr, weight_decay=1e-5)
best_model.train_model(train_dataloader, val_dataloader,
                       1500, loss_fn, best_opt)

# Input vector
x_vec = torch.linspace(-5, 5, 1000)
x_vec = x_vec.to(device)
x_vec = x_vec.unsqueeze(-1)

with torch.no_grad():  # turn off gradients computation
    y_vec = best_model(x_vec)

# Convert x_vec and y_vec to numpy one dimensional arrays
x_vec = x_vec.squeeze().cpu().numpy()
y_vec = y_vec.squeeze().cpu().numpy()

Network initialized
Epoch: 1499 >>> Training loss: 0.21555 | Validation loss: 0.35441

In [16]:
test_loss = best_model.evaluate(test_dataloader, loss_fn, verbose=False)

best_model_fig = go.Figure(layout=reg_layout)

for df, label in zip([reg_train, reg_test], ["Training data", "Test data"]):
    best_model_fig.add_trace(go.Scatter(
        x=df.input, y=df.label, name=label, mode='markers'))

best_model_fig.add_trace(go.Scatter(
    x=x_vec, y=y_vec, name='Model fit', mode='lines', line=dict(color='green')))

best_model_fig.update_layout(title=f"Model output \n <br><sup>1500 training epochs;\
       Test loss = {test_loss:.3f}</sup>",
       legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
       margin=dict(l=20, r=20, t=50, b=20)
       )
best_model_fig.show()

In [17]:
import os

if not os.path.exists("images"):
    os.mkdir("images")

fig_list = [plot_fig, fig, model_fig, best_model_fig]

for i, figure in enumerate(fig_list):
    figure.write_image(f"images/fig{i}.pdf")