In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from utility_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset
from joblib import Parallel, delayed

from utility_functions import SquaredHingeLoss
from MLP import MLPModel, mlp_training

In [2]:
# PATHs (edit `dataset` to point at a different sub-folder of training_data/)
dataset = 'detailed'

# training data
fold_path = f'training_data/{dataset}/folds.csv'
inputs_path = f'training_data/{dataset}/inputs.csv'
outputs_path = f'training_data/{dataset}/outputs.csv'
evaluation_path = f'training_data/{dataset}/evaluation.csv'

# raw dfs
fold = 1  # fold whose sequences are selected below
fold_df = pd.read_csv(fold_path)
inputs_df = pd.read_csv(inputs_path)
outputs_df = pd.read_csv(outputs_path)
evaluation_df = pd.read_csv(evaluation_path)

# fold dfs
# Filter by the configured `fold` (previously a hard-coded 1 that ignored the
# variable above) and compute the sequenceID membership only once.
# The `fold1` in the names is kept because later cells refer to them.
fold_sequence_ids = fold_df.loc[fold_df['fold'] == fold, 'sequenceID']
inputs_fold1_df = inputs_df[inputs_df['sequenceID'].isin(fold_sequence_ids)]
outputs_fold1_df = outputs_df[outputs_df['sequenceID'].isin(fold_sequence_ids)]


# feature engineering transformation
def identity(x):
    """Return the feature values unchanged."""
    return x


def log(x):
    """Element-wise natural logarithm of the feature values."""
    return np.log(x)


def loglog(x):
    """Element-wise log(log(x)); finite only where x > 1."""
    return np.log(np.log(x))

In [3]:
# Column(s) of inputs_df used as model features.
chosen_feature = ['length']
# One transform per entry of chosen_feature, in the same order:
# get_input_tensor applies f_engineering[i] to the i-th selected column.
f_engineering  = [loglog]

In [4]:
def get_input_tensor(inputs_df, chosen_feature, f_engineer):
    """Build the model input tensor from selected, transformed feature columns.

    Args:
        inputs_df: DataFrame containing every column named in `chosen_feature`.
        chosen_feature: list of column names to extract, in order.
        f_engineer: sequence of callables; `f_engineer[i]` is applied to the
            i-th selected column (array in, array of the same length out).

    Returns:
        A float32 `torch.Tensor` with one row per row of `inputs_df` and one
        column per entry of `chosen_feature`.
    """
    # Request an explicit float copy. With the default `to_numpy()`, an integer
    # column yields an integer array, and writing log-transformed values back
    # into it silently truncates them to whole numbers. The copy also keeps the
    # source DataFrame untouched and guarantees the array is writeable.
    inputs = inputs_df[chosen_feature].to_numpy(dtype=float, copy=True)
    for i, transform in enumerate(f_engineer):
        inputs[:, i] = transform(inputs[:, i])
    return torch.Tensor(inputs)

In [5]:
# normalize
def normalize_data(tensor):
    """Standardize each column of `tensor` to zero mean and unit std.

    Args:
        tensor: 2-D float tensor; statistics are taken along dim 0 (per column).

    Returns:
        A tensor of the same shape with `(tensor - mean) / std` per column.
        Columns whose std is zero (constant feature) or undefined (a single
        row) are only mean-centred, so the result never contains NaN/inf
        caused by dividing by a degenerate std.
    """
    # Calculate mean and standard deviation along the feature dimension
    mean = torch.mean(tensor, dim=0)
    std = torch.std(tensor, dim=0)

    # A zero or NaN std would turn the whole column into NaN/inf; divide by 1
    # instead so such columns simply become all zeros after centring.
    usable = torch.isfinite(std) & (std > 0)
    safe_std = torch.where(usable, std, torch.ones_like(std))

    # Normalize the tensor
    return (tensor - mean) / safe_std

In [6]:
def mlp_training(inputs_df, outputs_df, hidden_layers, hidden_size, chosen_feature, f_engineer, normalize, batch_size, margin, n_ites, verbose):
    """Train an `MLPModel` on interval targets using `SquaredHingeLoss`.

    NOTE(review): this definition shadows the `mlp_training` imported from
    `MLP` in the import cell; whichever was executed last is the one in use.

    Args:
        inputs_df: DataFrame containing the columns named in `chosen_feature`.
        outputs_df: DataFrame with 'min.log.lambda' and 'max.log.lambda'
            columns, row-aligned with `inputs_df`.
        hidden_layers: forwarded to `MLPModel` as its second argument.
        hidden_size: forwarded to `MLPModel` as its third argument.
        chosen_feature: list of feature column names.
        f_engineer: sequence of callables; the i-th one transforms the i-th
            selected column.
        normalize: when equal to 1, inputs are standardized with
            `normalize_data`. The statistics are not returned, so callers
            must apply the same normalization themselves at prediction time.
        batch_size: batch size given to the `DataLoader`.
        margin: margin given to `SquaredHingeLoss`.
        n_ites: maximum number of epochs.
        verbose: when equal to 1, print the loss after every epoch.

    Returns:
        `(model, loss)` where `loss` is a Python float: the loss over the full
        training set after the last executed epoch (or of the untrained model
        when `n_ites` is 0).
    """
    # inputs: explicit float copy so that transformed values written back into
    # the array are not truncated when the source column has an integer dtype.
    inputs = inputs_df[chosen_feature].to_numpy(dtype=float, copy=True)

    # feature engineering
    for i, transform in enumerate(f_engineer):
        inputs[:, i] = transform(inputs[:, i])
    inputs = torch.Tensor(inputs)

    # normalize input
    if normalize == 1:
        inputs = normalize_data(inputs)

    # outputs: one (low, high) pair per row
    targets_low  = torch.Tensor(outputs_df['min.log.lambda'].to_numpy().reshape(-1,1))
    targets_high = torch.Tensor(outputs_df['max.log.lambda'].to_numpy().reshape(-1,1))
    outputs = torch.cat((targets_low, targets_high), dim=1)

    # prepare training dataset (batches are served in row order, no shuffling)
    dataset    = TensorDataset(inputs, outputs)
    dataloader = DataLoader(dataset, batch_size, shuffle=False)

    # Instantiate model, loss function and optimizer
    model = MLPModel(inputs.shape[1], hidden_layers, hidden_size)
    criterion = SquaredHingeLoss(margin)
    optimizer = optim.Adam(model.parameters())

    # Initialize early stopping parameters
    best_loss = float('inf')
    patience = 5  # Number of epochs to wait before early stopping
    num_bad_epochs = 0

    def full_data_loss():
        # Monitoring only: no gradients are needed, so skip graph construction
        # (otherwise `best_loss` would keep an autograd graph alive).
        with torch.no_grad():
            return criterion(model(inputs), outputs)

    val_loss = None

    # Training loop
    for epoch in range(n_ites):
        for features, labels in dataloader:
            optimizer.zero_grad()
            loss = criterion(model(features), labels)
            loss.backward()
            optimizer.step()

        # Loss used for early stopping. It is computed on the same data the
        # model is trained on, not on a held-out validation set.
        val_loss = full_data_loss()
        if verbose==1:
            print(f"{epoch}, loss: {val_loss}")

        # Check for early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            num_bad_epochs = 0
        else:
            num_bad_epochs += 1
            if num_bad_epochs >= patience:
                if verbose==1:
                    print(f"Stopping early at epoch {epoch}, loss: {val_loss}")
                break

    # With n_ites == 0 the loop body never runs; report the untrained model's
    # loss instead of failing on an unbound variable.
    if val_loss is None:
        val_loss = full_data_loss()

    return model, val_loss.item()

In [7]:
def get_criterion(model, inputs_df, outputs_df, chosen_feature, f_engineering):
    """Sum, over all rows, the smaller of the two margins to the target interval.

    For each row the margins are `prediction - min.log.lambda` and
    `max.log.lambda - prediction`; any infinite margin is replaced by 0 before
    the row-wise minimum is taken.

    NOTE(review): because an infinite margin becomes 0, a row with an
    unbounded side can contribute at most 0 to the sum -- confirm that this
    is the intended scoring.

    Args:
        model: callable mapping the input tensor to one prediction per row.
        inputs_df: DataFrame holding the feature columns.
        outputs_df: DataFrame with 'min.log.lambda' and 'max.log.lambda'.
        chosen_feature: feature column names passed to `get_input_tensor`.
        f_engineering: per-column transforms passed to `get_input_tensor`.

    Returns:
        The summed criterion as a numpy scalar.
    """
    features = get_input_tensor(inputs_df, chosen_feature, f_engineering)
    predicted = model(features).detach().numpy().reshape(-1)

    lower_bound = outputs_df['min.log.lambda'].to_numpy()
    upper_bound = outputs_df['max.log.lambda'].to_numpy()

    # Signed distance of the prediction from each end of the interval.
    lower_gap = predicted - lower_bound
    upper_gap = upper_bound - predicted

    # Infinite gaps (open-ended intervals) are zeroed out.
    np.putmask(lower_gap, np.isinf(lower_gap), 0)
    np.putmask(upper_gap, np.isinf(upper_gap), 0)

    # Row-wise minimum, matching Python's min(lower, upper) element by element.
    smallest_gap = np.where(upper_gap < lower_gap, upper_gap, lower_gap)
    return np.sum(smallest_gap)

In [8]:
batch_size_candidates = [1, 100, 600]
margin_candidates = [0, 0.5, 1, 1.5]

# Seed applied at the start of every run so the grid is repeatable.
# Assumes MLPModel draws its initial weights from torch's global RNG -- TODO confirm.
SEED = 0

def process_params(batch_size, margin):
    """Train one model for a (batch_size, margin) pair and score it on fold 1.

    Both training and scoring use the notebook-level `chosen_feature` and
    `f_engineering`, so the model is evaluated on the same features it was
    trained on.

    Returns:
        `[batch_size, margin, criterion, total_loss]`, with `criterion`
        rounded to 3 decimals.
    """
    torch.manual_seed(SEED)
    model, total_loss = mlp_training(
        inputs_fold1_df,
        outputs_fold1_df,
        hidden_layers=0,
        hidden_size=0,
        chosen_feature=chosen_feature,
        f_engineer=f_engineering,
        normalize=0,  # get_criterion feeds un-normalized inputs, so train on them too
        batch_size=batch_size,
        margin=margin,
        n_ites=500,
        verbose=0,
    )
    criterion = round(get_criterion(model, inputs_fold1_df, outputs_fold1_df, chosen_feature, f_engineering), 3)
    return [batch_size, margin, criterion, total_loss]

# One job per (batch_size, margin) combination, spread over all available cores.
rows = Parallel(n_jobs=-1)(delayed(process_params)(batch_size, margin)
                           for batch_size in batch_size_candidates
                           for margin in margin_candidates)
df = pd.DataFrame(rows, columns=['batch_size', 'margin', 'criterion', 'total_loss'])
print(df.sort_values(by='criterion'))

    batch_size  margin  criterion  total_loss
3            1     1.5     -8.816    0.805206
2            1     1.0     -5.861    0.462787
0            1     0.0     -5.278    0.129165
1            1     0.5     -4.721    0.253927
7          100     1.5     -3.262    0.798821
4          100     0.0     -1.732    0.129008
6          100     1.0     -1.291    0.459517
5          100     0.5     -0.715    0.252645
9          600     0.5     -0.144    0.269373
8          600     0.0     -0.137    0.142875
10         600     1.0     -0.135    0.488784
11         600     1.5      0.052    0.829082


In [9]:
# batch_size_candidates = [1, 10, 200, 400]
# margin_candidates = [0, 1, 2]
# n_layer_candidates = [1, 2, 3]
# n_neurons_candidates = [4, 8, 16, 32]

# def process_params(batch_size, margin, n_layer, n_neurons):
#     model, total_loss = mlp_training(inputs_fold1_df, outputs_fold1_df, n_layer, n_neurons, ['length'], [loglog], 1, batch_size, margin, 500, 0)
#     criterion = round(get_criterion(model, inputs_fold1_df, outputs_fold1_df, chosen_feature, f_engineering), 3)
#     return [batch_size, margin, n_layer, n_neurons, criterion, total_loss]

# rows = Parallel(n_jobs=-1)(delayed(process_params)(batch_size, margin, n_layer, n_neurons) 
#                            for batch_size in batch_size_candidates 
#                            for margin in margin_candidates 
#                            for n_layer in n_layer_candidates 
#                            for n_neurons in n_neurons_candidates)
# df = pd.DataFrame(rows, columns=['batch_size', 'margin', 'n_layer', 'n_neurons', 'criterion', 'total_loss'])
# print(df.sort_values(by='criterion'))