<a href="https://colab.research.google.com/github/jtfreitas/NN_DL/blob/main/HW1/regression_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Get required datasets and modules

In [1]:
!wget -N -P dataset https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/train_data.csv
!wget -N -P dataset https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/test_data.csv
!wget -O nn_tools.py https://raw.githubusercontent.com/jtfreitas/NN_DL/main/HW1/regression/nn_tools.py
!pip install -q optuna
!pip install -U kaleido

--2022-06-29 21:24:30--  https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/train_data.csv
Resolving gitlab.dei.unipd.it (gitlab.dei.unipd.it)... 147.162.2.85
Connecting to gitlab.dei.unipd.it (gitlab.dei.unipd.it)|147.162.2.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3776 (3.7K) [text/plain]
Saving to: ‘dataset/train_data.csv’


Last-modified header missing -- time-stamps turned off.
2022-06-29 21:24:30 (83.1 MB/s) - ‘dataset/train_data.csv’ saved [3776/3776]

--2022-06-29 21:24:30--  https://gitlab.dei.unipd.it/michieli/nnld-2021-22-lab-resources/-/raw/main/homework1/test_data.csv
Resolving gitlab.dei.unipd.it (gitlab.dei.unipd.it)... 147.162.2.85
Connecting to gitlab.dei.unipd.it (gitlab.dei.unipd.it)|147.162.2.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3788 (3.7K) [text/plain]
Saving to: ‘dataset/test_data.csv’


Last-modified header missing -- time-stamps turned off.
2022-06-29 21:

In [2]:
%load_ext autoreload
%autoreload 2

from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
import nn_tools as nt
import random
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import optuna
pd.options.plotting.backend = "plotly"


# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device in use: {device}")

# A single seed shared by Python's `random` module and PyTorch.
seed = 2009440

random.seed(seed)
torch.manual_seed(seed)

Training device in use: cuda


<torch._C.Generator at 0x7f6591c7ea90>

In [3]:
# Load the two CSV files fetched by the setup cell.
reg_train = pd.read_csv("./dataset/train_data.csv")
reg_test = pd.read_csv("./dataset/test_data.csv")

# Shared figure layout; it is reused for the model-output figure further down.
reg_layout = go.Layout(
    title='Regression Data',
    width=700,
    height=500,
    font=dict(color='#000000', size=20),
    xaxis=dict(zerolinewidth=1, zerolinecolor='white'),
    yaxis=dict(zerolinewidth=1, zerolinecolor='white'),
    legend={'yanchor': "top", 'y': 0.99, 'xanchor': "left", 'x': 0.01},
    margin={'l': 20, 'r': 20, 't': 50, 'b': 20},
)
plot_fig = go.Figure(layout=reg_layout)

# One marker trace per data split, read from the `input` / `label` columns.
for df, label in (
    (reg_train, "Training data"),
    (reg_test, "Test data"),
):
    scatter = go.Scatter(x=df.input, y=df.label, name=label, mode='markers')
    plot_fig.add_trace(scatter)
plot_fig.show()

The training data is noisier than the test data, so overfitting will be very obvious if it happens. In addition, the gaps in the data are large enough that they are the regions where minimizing the loss is most crucial: if the model overfits, these are the regions where the loss will be largest.

### Preparing data for PyTorch (i.e. converting to tensors)

I modified the ```CsvDataset``` class to accept pandas dataframes directly (as long as they have the format ```['input'], ['label']```). This gives me more flexibility in wrangling the data beforehand, and means I do not have to save the cleaned-up version to a new CSV file.

In [4]:
# Transform pipeline with a single step: the ToTensor transform from nn_tools.
composed_transform = transforms.Compose([nt.ToTensor()])

# Wrap both dataframes in the pandas-backed dataset class from nn_tools.
train_dataset = nt.pd_dataset(reg_train, transform=composed_transform)
test_dataset = nt.pd_dataset(reg_test, transform=composed_transform)

# Training: shuffled mini-batches of 4 samples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

# Evaluation: the whole test set as one unshuffled batch.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=len(test_dataset),
    shuffle=False,
    num_workers=0,
)


In [5]:
def objective(trial):
    """Optuna objective: build, train and score one candidate regression network.

    Samples the three hidden-layer widths, the activation function, the dropout
    probability, the optimizer class and the learning rate from ``trial``,
    trains the resulting ``nt.reg_model`` for 200 epochs and returns whatever
    ``train_model`` reports.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        The value returned by ``model.train_model``; the study minimises it.
    """
    # The keys below are what ``nt.reg_model`` receives. The Optuna parameter
    # names (note the capitalised "Dropout") are the keys that later show up
    # in ``study.best_params``, so they are kept unchanged.
    params = {
        "N_h1": trial.suggest_int("N_h1", 80, 150),
        "N_h2": trial.suggest_int("N_h2", 160, 250),
        "N_h3": trial.suggest_int("N_h3", 25, 100),
        "activation": trial.suggest_categorical(
            "activation", ["ReLU", "Sigmoid", "LeakyReLU"]),
        # suggest_float replaces the deprecated suggest_uniform.
        "dropout": trial.suggest_float("Dropout", 0.0, 0.7),
    }

    # Define the model
    model = nt.reg_model(1, params, 1, device)
    model.to(device)

    # Define the loss function
    loss_fn = nn.MSELoss()

    # Define the optimizer. The local is called `optimizer` so that it does not
    # shadow the `optim` module imported at the top of the notebook.
    optimizer_name = trial.suggest_categorical(
        "optim", ["Adam", "SGD", "AdamW"])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    optimizer = getattr(torch.optim, optimizer_name)(
        model.parameters(), lr=learning_rate)

    # Train and evaluate model.
    # NOTE(review): test_dataloader is passed as the second (presumably
    # validation) loader, so hyperparameters are selected on the test set.
    # A held-out split of the training data would avoid this leakage.
    val_loss = model.train_model(
        train_dataloader, test_dataloader,
        num_epochs=200, loss_fn=loss_fn, optimizer=optimizer, verbose=False
    )

    return val_loss



In [6]:
# With 50 trials to run, keep Optuna's logging at the lowest useful level
# (warnings only).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna samples hyperparameters with TPE (Tree-structured Parzen Estimator),
# a Bayesian-optimisation method that uses the results of earlier trials to
# propose promising values. The sampler is seeded with the notebook-wide
# `seed` so that its random draws are repeatable.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=50)

# Read the best parameters once and print them one per line.
for name, value in study.best_params.items():
    print(f"{name} : {value}")

N_h1 : 138
N_h2 : 190
N_h3 : 81
activation : ReLU
Dropout : 0.026742459024788553
optim : Adam
lr : 0.0005230081699854111


In [7]:
# Copy the architecture-related entries of the best trial into the dict handed
# to nt.reg_model. The study stores the dropout value under "Dropout", while
# the dict passed to the model uses the lowercase key.
tuned = study.best_params
best_params = {key: tuned[key]
               for key in ('N_h1', 'N_h2', 'N_h3', 'activation')}
best_params['dropout'] = tuned['Dropout']

regression = nt.reg_model(1, best_params, 1, device)
regression.to(device)

loss_fn = nn.MSELoss()

# Instantiate the winning optimizer class by name with the tuned learning rate
# and a weight decay of 1e-5.
optimizer_cls = getattr(torch.optim, tuned['optim'])
optimizer = optimizer_cls(
    regression.parameters(),
    lr=tuned['lr'],
    weight_decay=1e-5,
)

num_epochs = 400


In [8]:
# Number of folds; also shown in the subtitle of the training-timeline plot.
k = 5

# Run the model's own cross-validation routine with the objects configured
# in the previous cell.
best_model_loss = regression.cross_validate(
    train_dataloader,
    test_dataloader,
    num_epochs,
    loss_fn,
    optimizer,
    k,
)

Fold 1/5...	Done.
Fold 2/5...	Done.
Fold 3/5...	Done.
Fold 4/5...	Done.
Fold 5/5...	Done.


In [9]:
# Reshape the recorded history to long format (one row per epoch and loss
# series) so that Plotly Express can colour the lines by series.
history_df = pd.DataFrame(regression.history)
reg_df = history_df.melt(
    id_vars='epoch',
    var_name='Loss type',
    value_name='Loss',
)

plot_title = f"Training timeline \n <br><sup> Average on {k} folds </sup>"

fig = px.line(
    reg_df,
    x='epoch', y='Loss', color='Loss type',
    title=plot_title, width=800, height=400,
)

# White background, larger font, legend anchored to the top-right corner.
legend_top_right = {'yanchor': "top", 'y': 0.99, 'xanchor': "right", 'x': 0.99}
fig.update_layout(
    paper_bgcolor='#FFFFFF',
    font=dict(color='black', size=18),
    legend=legend_top_right,
    margin={'l': 20, 'r': 20, 't': 50, 'b': 20},
)


In [10]:
# Input vector
x_vec = torch.linspace(-5, 5, 100000).unsqueeze(-1).to(device)
x_vec = x_vec.to(device)
x_vec = x_vec.unsqueeze(-1)  # Adding a dimension to the input vector

# Network output
# eval() acts as switch for some specific layers/parts of the model that behave
# differently during training and inference (eval) time. For example, Dropout
# BatchNorm etc.
regression.eval()
with torch.no_grad():  # turn off gradients computation
    y_vec = regression(x_vec)


# Convert x_vec and y_vec to numpy one dimensional arrays
x_vec = x_vec.squeeze().cpu().numpy()
y_vec = y_vec.squeeze().cpu().numpy()


test_data = torch.from_numpy(test_dataloader.dataset.data[:,0]).float().to(device).unsqueeze(-1)
test_labels = torch.from_numpy(test_dataloader.dataset.data[:,1]).float().to(device)

regression.eval()
with torch.no_grad():  # turn off gradients computation
    y_hat = regression(test_data).squeeze()


test_loss = loss_fn(y_hat, test_labels)


model_fig = go.Figure(layout=reg_layout)

for df, label in zip([reg_train, reg_test], ["Training data", "Test data"]):
    model_fig.add_trace(go.Scatter(
        x=df.input, y=df.label, name=label, mode='markers'))

model_fig.add_trace(
    go.Scatter(x=x_vec, y=y_vec, name='Model inference', mode='lines', line=dict(color='green')))

model_fig.update_layout(title=f"Model output \n <br><sup>Epoch {num_epochs};\
       Loss on test set = {test_loss:.3f}</sup>",
       legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
       font=dict(size=18),
       margin=dict(l=20, r=20, t=50, b=20))
model_fig.show()

Save the images

In [11]:
import os

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so re-running the cell is safe and there is no
# check-then-create race.
os.makedirs("images", exist_ok=True)

# Figures in the order they appear in the notebook: raw data, training
# timeline, model output.
fig_list = [plot_fig, fig, model_fig]

# Static export to images/fig0.pdf, images/fig1.pdf, images/fig2.pdf.
for i, figure in enumerate(fig_list):
    figure.write_image(f"images/fig{i}.pdf")