In [213]:
%reset

In [214]:
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import optuna
import pandas as pd
import numpy as np
import optuna

from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor
from optuna.trial import TrialState
from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_squared_error
from optuna.trial import TrialState
from scipy import stats

In [215]:
# Gate for the held-out test evaluation: a later cell raises ValueError when this is False,
# which stops a top-to-bottom run before the test CSVs are read.
OUTPUT_TEST = True

In [216]:
# Run configuration.
OUTPUT_TRUE_TEST = False  # NOTE(review): not referenced in the visible cells — confirm it is still needed
EPOCHS = 10  # epochs per Optuna trial and for the final training loop
BATCH_SIZE = 64  # batch size passed to every DataLoader
LOSS_FN = nn.MSELoss()  # mean-squared-error criterion

In [217]:
# Load the pre-cleaned training / validation features and their label files.
# Paths are relative to the notebook's working directory.
X_train = pd.read_csv("../data/cleaned/training.csv")
y_train = pd.read_csv("../data/cleaned/training_labels.csv")
X_val = pd.read_csv("../data/cleaned/validation.csv")
y_val = pd.read_csv("../data/cleaned/validation_labels.csv")

In [218]:
# Swap '[' / ']' for '(' / ')' in every feature name.
# One callable-based rename per frame replaces the per-column loop, which called
# DataFrame.rename once for every matching column and only renamed X_val columns
# whose names also occurred in X_train.
X_train = X_train.rename(columns=lambda name: name.replace('[', '(').replace(']', ')'))
X_val = X_val.rename(columns=lambda name: name.replace('[', '(').replace(']', ')'))

In [219]:
# Hold out 10% of the training rows as a "verification" split (fixed random_state=42).
# Note: this rebinds X_train / y_train, so re-running only this cell splits the
# already-reduced training set again.
X_train, X_verif, y_train, y_verif = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [220]:
# Discard the row labels carried over from the split so every frame is indexed 0..n-1.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_verif = X_verif.reset_index(drop=True)
y_verif = y_verif.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
# Fixed: the result used to be bound to `y_test`, leaving `y_val` untouched and
# creating a `y_test` variable that actually held validation labels.
y_val = y_val.reset_index(drop=True)

In [221]:
#TODO: Add sanity check for NN

In [222]:
class CustomDataset(Dataset):
    """Map-style dataset that serves (features, target) rows as float32 tensors.

    Both frames are converted to tensors once, at construction time, instead of
    building a pandas row and a new tensor on every ``__getitem__`` call. The
    tensors are a snapshot: changes made to the DataFrames afterwards are not
    reflected in the samples.

    Args:
        features_dataframe: DataFrame of numeric feature columns, one row per sample.
        target_dataframe: DataFrame of numeric target columns, with the same
            number of rows as ``features_dataframe``.

    Raises:
        ValueError: if the two frames have different numbers of rows.
    """

    def __init__(self, features_dataframe, target_dataframe):
        if len(features_dataframe) != len(target_dataframe):
            raise ValueError(
                "features and target must have the same number of rows, got "
                f"{len(features_dataframe)} and {len(target_dataframe)}"
            )
        # The original frames stay available under their existing attribute names.
        self.features = features_dataframe
        self.target = target_dataframe
        # torch.tensor copies, so the tensors never alias the DataFrame buffers.
        self._feature_tensor = torch.tensor(features_dataframe.to_numpy(dtype="float32"))
        self._target_tensor = torch.tensor(target_dataframe.to_numpy(dtype="float32"))

    def __len__(self):
        """Return the number of samples (rows of the feature frame)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` tensors for positional index ``idx``."""
        return self._feature_tensor[idx], self._target_tensor[idx]

In [223]:
# Wrap each (features, labels) pair in a CustomDataset: training, verification, validation.
train_dataset = CustomDataset(features_dataframe=X_train, target_dataframe=y_train)
verif_dataset = CustomDataset(features_dataframe=X_verif, target_dataframe=y_verif)
val_dataset = CustomDataset(features_dataframe=X_val, target_dataframe=y_val)

In [224]:
# Create data loaders.
# No `shuffle` argument is passed, so every loader (including the training one)
# uses DataLoader's default ordering.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
verif_dataloader = DataLoader(verif_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [225]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [226]:
# Reuse the criterion from the configuration cell instead of creating a second,
# independent MSELoss, so changing LOSS_FN affects tuning, training and evaluation.
loss_fn = LOSS_FN

In [227]:
def define_model(trial, n_features=2808):
    """Build a feed-forward regressor whose shape is sampled from an Optuna trial.

    Each hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear`` maps
    to a single output value.

    Sampled hyperparameters:
        n_layers      int in [1, 3]
        n_units_l{i}  int in [n_features, 2 * n_features]
        dropout_l{i}  float in [0.2, 0.5]

    Args:
        trial: object exposing ``suggest_int(name, low, high)`` and
            ``suggest_float(name, low, high)`` (an Optuna trial).
        n_features: width of the input layer. Defaults to 2808, the value that
            was previously hard-coded in a local named ``input`` (which also
            shadowed the builtin).

    Returns:
        nn.Sequential: the assembled model.
    """
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []

    in_features = n_features
    for i in range(n_layers):
        # The width range is tied to the input width, not to the previous layer.
        out_features = trial.suggest_int(f"n_units_l{i}", n_features, 2 * n_features)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float(f"dropout_l{i}", 0.2, 0.5)
        layers.append(nn.Dropout(p))

        in_features = out_features
    layers.append(nn.Linear(in_features, 1))

    return nn.Sequential(*layers)

In [228]:
def objective(trial):
    """Optuna objective: train a sampled model and score it on the verification split.

    The model comes from ``define_model(trial)``; the optimizer class
    (Adam / RMSprop / SGD) and the learning rate (log-uniform in [1e-5, 1e-1])
    are sampled as well. After every epoch the score

        abs(verif_rmse - train_rmse) + 2 * train_rmse

    is reported to the trial, so the pruner can stop it early. The variable
    holding the score is called ``accuracy``, but it is an error-like quantity.

    Relies on the notebook-level names ``device``, ``loss_fn``, ``EPOCHS``,
    ``train_dataset``, ``train_dataloader``, ``verif_dataset`` and
    ``verif_dataloader``.

    Returns:
        float: the score of the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop, or when the
            model emits non-finite predictions on the verification split.
    """
    # Generate the model.
    model = define_model(trial).to(device)
    print(model)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    train_size = len(train_dataset)
    verif_size = len(verif_dataset)
    num_batches = len(verif_dataloader)

    # Training of the model.
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}\n-------------------------------")
        model.train()
        train_sq_error = 0.0
        for X, y in train_dataloader:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            loss = loss_fn(pred, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Accumulate a plain float: detach() drops the autograd link and
            # .item() works on any device (the old `.numpy()` call required CPU).
            train_sq_error += torch.sum((pred.detach() - y) ** 2).item()

        train_rmse = float(np.sqrt(train_sq_error / train_size))

        # Validation of the model.
        model.eval()
        test_loss, avg_error, verif_sq_error = 0.0, 0.0, 0.0
        with torch.no_grad():
            for X, y in verif_dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)

                # Explicit divergence check. Previously a bare `except:` around
                # sklearn's mean_squared_error doubled as the NaN detector and
                # would also have swallowed unrelated errors.
                if not torch.isfinite(pred).all():
                    print("WARNING: Unstable MSE")
                    num_nan_entries = torch.sum(torch.isnan(pred)).item()
                    print("Prediction contains {num} NaN entries".format(num=num_nan_entries))
                    print("Pruning Trial")
                    raise optuna.exceptions.TrialPruned()

                batch_sq_error = (pred - y) ** 2
                test_loss += loss_fn(pred, y).item()
                # Per-batch RMSE, computed with torch so it works on any device.
                avg_error += torch.sqrt(torch.mean(batch_sq_error)).item()
                verif_sq_error += torch.sum(batch_sq_error).item()

        test_loss /= num_batches #Output metric to gauge how model is doing as training happens
        avg_error /= num_batches #Output metric to gauge how model is doing as training happens

        verif_rmse = float(np.sqrt(verif_sq_error / verif_size))

        accuracy = abs(verif_rmse - train_rmse) + 2 * train_rmse #Metric for optuna to determine pruning and optimal hyperparameters

        print(f"Test Error: \nAvg RMSE: {avg_error}, Avg loss: {test_loss:>8f}")
        trial.report(accuracy, epoch)
        print(f"Optuna accuracy: {accuracy}\n")
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

In [229]:
# Run the hyperparameter search with a successive-halving pruner.
# No `direction` is passed to create_study, so Optuna's default applies.
study = optuna.create_study(pruner=optuna.pruners.SuccessiveHalvingPruner())
study.optimize(objective, n_trials=2)

# Split the finished trials by final state for the summary below.
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
# `trial` stays bound at notebook level; a later cell reads `trial.params` from it.
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-11-30 18:23:56,053][0m A new study created in memory with name: no-name-5e03ecf9-e0ce-4446-b079-f979e9c5cc03[0m


Sequential(
  (0): Linear(in_features=2808, out_features=8, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.49234418479137676, inplace=False)
  (3): Linear(in_features=8, out_features=7, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.381199014716318, inplace=False)
  (6): Linear(in_features=7, out_features=1, bias=True)
)
Epoch 1
-------------------------------
Test Error: 
Avg RMSE: 13.106254471672905, Avg loss: 171.919505
Optuna accuracy: 139.4290771484375

Epoch 2
-------------------------------
Test Error: 
Avg RMSE: 10.330003632439507, Avg loss: 106.827569
Optuna accuracy: 26.2977237701416

Epoch 3
-------------------------------
Test Error: 
Avg RMSE: 8.363110542297363, Avg loss: 70.031101
Optuna accuracy: 21.396831512451172

Epoch 4
-------------------------------
Test Error: 
Avg RMSE: 6.186588128407796, Avg loss: 38.334899
Optuna accuracy: 16.691936492919922

Epoch 5
-------------------------------
Test Error: 
Avg RMSE: 5.362303680843777, Avg loss: 28.824481
Optuna accuracy: 12.7989

[32m[I 2023-11-30 18:24:03,162][0m Trial 0 finished with value: 9.033117294311523 and parameters: {'n_layers': 2, 'n_units_l0': 8, 'dropout_l0': 0.49234418479137676, 'n_units_l1': 7, 'dropout_l1': 0.381199014716318, 'optimizer': 'RMSprop', 'lr': 0.07657462938391439}. Best is trial 0 with value: 9.033117294311523.[0m


Test Error: 
Avg RMSE: 4.3789621988932295, Avg loss: 19.454231
Optuna accuracy: 9.033117294311523

Sequential(
  (0): Linear(in_features=2808, out_features=10, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.4591783975221717, inplace=False)
  (3): Linear(in_features=10, out_features=10, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.32622011638240467, inplace=False)
  (6): Linear(in_features=10, out_features=1, bias=True)
)
Epoch 1
-------------------------------
Test Error: 
Avg RMSE: 11.75962405734592, Avg loss: 139.246997
Optuna accuracy: 36.007286071777344

Epoch 2
-------------------------------
Test Error: 
Avg RMSE: 10.587979740566677, Avg loss: 112.911240
Optuna accuracy: 22.93268585205078

Epoch 3
-------------------------------
Test Error: 
Avg RMSE: 10.244612375895182, Avg loss: 105.721160
Optuna accuracy: 20.358694076538086

Epoch 4
-------------------------------
Test Error: 
Avg RMSE: 9.746447245279947, Avg loss: 95.665655
Optuna accuracy: 19.28675651550293

Epoch 5
------------

[32m[I 2023-11-30 18:24:10,533][0m Trial 1 finished with value: 14.312532424926758 and parameters: {'n_layers': 2, 'n_units_l0': 10, 'dropout_l0': 0.4591783975221717, 'n_units_l1': 10, 'dropout_l1': 0.32622011638240467, 'optimizer': 'Adam', 'lr': 0.0024146102516713337}. Best is trial 0 with value: 9.033117294311523.[0m


Test Error: 
Avg RMSE: 5.980509069230822, Avg loss: 35.895510
Optuna accuracy: 14.312532424926758

Study statistics: 
  Number of finished trials:  2
  Number of pruned trials:  0
  Number of complete trials:  2
Best trial:
  Value:  9.033117294311523
  Params: 
    n_layers: 2
    n_units_l0: 8
    dropout_l0: 0.49234418479137676
    n_units_l1: 7
    dropout_l1: 0.381199014716318
    optimizer: RMSprop
    lr: 0.07657462938391439


In [230]:
class NeuralNetwork(nn.Module):
    """Feed-forward regressor rebuilt from a hyperparameter dictionary.

    For every hidden layer ``i`` the module list receives, in order,
    ``Linear -> Dropout -> BatchNorm1d``; ``forward`` applies ReLU after each of
    those modules, then a final ``Linear`` produces one output value.

    NOTE(review): this differs from ``define_model`` in this notebook, which
    builds ``Linear -> ReLU -> Dropout`` blocks without batch normalisation, so
    the hyperparameters were tuned on a different architecture — confirm this
    is intended.

    Args:
        params: mapping with ``'n_layers'`` and, for each layer ``i``,
            ``'n_units_l{i}'`` and ``'dropout_l{i}'``. Other keys are ignored.
        n_features: width of the input layer. Defaults to 2808, the value that
            was previously hard-coded.
    """

    def __init__(self, params, n_features=2808):
        super().__init__()

        n_layers = params['n_layers']
        layer_units = [params[f'n_units_l{i}'] for i in range(n_layers)]
        layer_dropouts = [params[f'dropout_l{i}'] for i in range(n_layers)]

        # Define layers based on the provided parameters
        self.layers = nn.ModuleList()

        in_features = n_features
        for out_features, dropout in zip(layer_units, layer_dropouts):
            self.layers.append(nn.Linear(in_features=in_features, out_features=out_features))
            self.layers.append(nn.Dropout(dropout))
            self.layers.append(nn.BatchNorm1d(out_features))
            in_features = out_features

        # Output layer. `in_features` is the last hidden width, or `n_features`
        # when there are no hidden layers (the old `layer_units[-1]` raised
        # IndexError in that case).
        self.output_layer = nn.Linear(in_features=in_features, out_features=1)

    def forward(self, x):
        """Run the hidden stack, applying ReLU after every module, then the output layer."""
        for layer in self.layers:
            x = torch.relu(layer(x))

        # Output layer (no activation)
        return self.output_layer(x)

In [231]:
# Rebuild the network from the best trial's hyperparameters.
params = trial.params
# Move the model to the selected device: the batches are moved there during
# training/evaluation, so a CPU-resident model would fail on a GPU/MPS host.
model = NeuralNetwork(params=params).to(device)
print(model)

NeuralNetwork(
  (layers): ModuleList(
    (0): Linear(in_features=2808, out_features=8, bias=True)
    (1): Dropout(p=0.49234418479137676, inplace=False)
    (2): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=8, out_features=7, bias=True)
    (4): Dropout(p=0.381199014716318, inplace=False)
    (5): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_layer): Linear(in_features=7, out_features=1, bias=True)
)


In [232]:
# Map the tuned optimizer name onto its torch.optim class and instantiate it
# with the tuned learning rate.
op_name = params['optimizer']
optimizer_classes = {
    'Adam': torch.optim.Adam,
    'RMSprop': torch.optim.RMSprop,
    'SGD': torch.optim.SGD,
}
if op_name not in optimizer_classes:
    raise ValueError("Optimizer name not found. Ensure it is added to the list above.")
optimizer = optimizer_classes[op_name](model.parameters(), lr=params['lr'])

In [233]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader`` and print the loss periodically.

    Batches are moved to the device the model's parameters live on, so the
    function no longer depends on a notebook-level ``device`` variable matching
    the model's placement.

    Args:
        dataloader: iterable of ``(X, y)`` batches that exposes ``.dataset``.
        model: ``nn.Module`` to train; it is put into training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer bound to ``model``'s parameters.

    Returns:
        None
    """
    size = len(dataloader.dataset)
    first_param = next(model.parameters(), None)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if first_param is not None:
            X, y = X.to(first_param.device), y.to(first_param.device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Log every 9th batch. `current` assumes full-sized batches, so it is
        # approximate for a short final batch.
        if batch % 9 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [234]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    preds = []
    true = []
    model.eval()
    test_loss, error = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            preds.append(list(pred.numpy()))
            true.append(list(y.numpy()))
            test_loss += loss_fn(pred, y).item()
            error += mean_squared_error(pred, y, squared=False)
    test_loss /= num_batches
    error /= num_batches
    print(f"Test Error: \nAvg RMSE: {error}, Avg loss: {test_loss:>8f} \n")
    return preds, true

In [235]:
# Train the final model for EPOCHS epochs; after each epoch, print metrics on the
# validation loader (the values returned by test() are discarded here).
for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(val_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 345.594543  [   64/ 5150]
loss: 32.515858  [  640/ 5150]
loss: 23.879959  [ 1216/ 5150]
loss: 14.749730  [ 1792/ 5150]
loss: 19.936762  [ 2368/ 5150]
loss: 11.156807  [ 2944/ 5150]
loss: 23.845108  [ 3520/ 5150]
loss: 22.476679  [ 4096/ 5150]
loss: 27.426199  [ 4672/ 5150]
Test Error: 
Avg RMSE: 4.2601378957430525, Avg loss: 18.829528 

Epoch 2
-------------------------------
loss: 24.125467  [   64/ 5150]
loss: 15.041159  [  640/ 5150]
loss: 19.356709  [ 1216/ 5150]
loss: 13.431541  [ 1792/ 5150]
loss: 19.943216  [ 2368/ 5150]
loss: 10.170897  [ 2944/ 5150]
loss: 22.797131  [ 3520/ 5150]
loss: 21.633530  [ 4096/ 5150]
loss: 27.035475  [ 4672/ 5150]
Test Error: 
Avg RMSE: 4.225183506806691, Avg loss: 18.482717 

Epoch 3
-------------------------------
loss: 24.526281  [   64/ 5150]
loss: 15.641088  [  640/ 5150]
loss: 18.050133  [ 1216/ 5150]
loss: 12.800880  [ 1792/ 5150]
loss: 19.714518  [ 2368/ 5150]
loss: 11.350188  [ 2944/ 5150]
loss: 

In [236]:
# Evaluate the trained model on the training loader and keep the per-batch
# predictions and labels for export.
train_preds, train_true = test(train_dataloader, model, loss_fn)

Test Error: 
Avg RMSE: 4.198883683593185, Avg loss: 18.096174 



In [237]:
# Collapse the per-batch lists returned by test() into flat 1-D arrays.
train_preds, train_true = (
    np.ravel(np.concatenate(batches)) for batches in (train_preds, train_true)
)

In [238]:
# Deliberate stop: when OUTPUT_TEST is False, raising here halts a top-to-bottom
# run before any test-set file is read.
if not OUTPUT_TEST:
    raise ValueError("OUTPUT_TEST set to False. If you would like to output final test values set to True and continue running from here")

In [239]:
# Load the held-out test features and labels.
X_test = pd.read_csv("../data/cleaned/test.csv")
y_test = pd.read_csv("../data/cleaned/test_labels.csv")

In [240]:
# Swap '[' / ']' for '(' / ')' in every test feature name.
# A single callable-based rename replaces the per-column loop, which called
# DataFrame.rename once for every matching column.
X_test = X_test.rename(columns=lambda name: name.replace('[', '(').replace(']', ')'))

In [241]:
# Wrap the test frames in a dataset and a loader with the shared batch size.
test_dataset = CustomDataset(features_dataframe=X_test, target_dataframe=y_test)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [242]:
# Evaluate the trained model on the test loader and keep the per-batch
# predictions and labels.
test_pred, test_true = test(test_dataloader, model, loss_fn)

Test Error: 
Avg RMSE: 4.530055443445842, Avg loss: 20.935884 



In [243]:
# Collapse the per-batch lists returned by test() into flat 1-D arrays.
test_pred, test_true = (
    np.ravel(np.concatenate(batches)) for batches in (test_pred, test_true)
)

In [244]:
# RMSE over all test samples (not the mean of per-batch RMSEs printed by test()).
# Take the square root explicitly: the `squared=False` argument is deprecated and
# removed in recent scikit-learn releases.
error = np.sqrt(mean_squared_error(test_true, test_pred))
print("RMSE:", error)

RMSE: 4.6317825


In [245]:
#Save test and train true vals and predictions to csv

# Every file goes to the same lower-case directory. The train predictions used
# to be written under '../Data/Predictions/NN/', which is a different directory
# on case-sensitive filesystems from the one the read-back cell loads.
for pred_filepath, values in (
    ('../data/predictions/NN/test_pred_nn.csv', test_pred),
    ('../data/predictions/NN/test_true_nn.csv', test_true),
    ('../data/predictions/NN/train_pred_nn.csv', train_preds),
    ('../data/predictions/NN/train_true_nn.csv', train_true),
):
    pred_data = pd.DataFrame(values)
    pred_data.to_csv(pred_filepath, index=False, header=False)


In [246]:
#Save inputs to csv

# Write the training and test feature matrices without index or header row.
# X_train here is the post-split training frame.
pred_data = pd.DataFrame(X_train)
pred_filepath = '../data/predictions/NN/train_input_nn.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)
true_data = pd.DataFrame(X_test)
true_filepath = '../data/predictions/NN/test_input_nn.csv'
true_data.to_csv(true_filepath, index=False, header=False)

In [247]:
#Read in values from csv and calculate RMSE and r values

test_pred_data = np.genfromtxt('../data/predictions/NN/test_pred_nn.csv', delimiter=',', filling_values=np.nan)
test_true_data = np.genfromtxt('../data/predictions/NN/test_true_nn.csv', delimiter=',', filling_values=np.nan)
train_pred_data = np.genfromtxt('../data/predictions/NN/train_pred_nn.csv', delimiter=',', filling_values=np.nan)
train_true_data = np.genfromtxt('../data/predictions/NN/train_true_nn.csv', delimiter=',', filling_values=np.nan)

# Take the square root explicitly: the `squared=False` argument is deprecated and
# removed in recent scikit-learn releases.
test_rmse = np.sqrt(mean_squared_error(test_true_data, test_pred_data))
test_r = stats.pearsonr(test_true_data, test_pred_data)

train_rmse = np.sqrt(mean_squared_error(train_true_data, train_pred_data))
# NOTE(review): train_r is computed but never printed below — confirm whether it should be reported.
train_r = stats.pearsonr(train_true_data, train_pred_data)

print("Train:")
print(train_rmse)
print('Test:')
print(test_rmse)
print(test_r)

Train:
4.252383160967171
Test:
4.631782652544545
PearsonRResult(statistic=0.3516642696602043, pvalue=2.8666309231772304e-22)
