In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from utility_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset

from linear import LinearModel

In [2]:
# PATHs (edit `dataset` to point at a different training set)
dataset = 'detailed'

# training data
fold_path = f'training_data/{dataset}/folds.csv'
inputs_path = f'training_data/{dataset}/inputs.csv'
outputs_path = f'training_data/{dataset}/outputs.csv'
evaluation_path = f'training_data/{dataset}/evaluation.csv'

# raw dfs
fold = 1  # fold whose sequences are selected below
fold_df = pd.read_csv(fold_path)
inputs_df = pd.read_csv(inputs_path)
outputs_df = pd.read_csv(outputs_path)
evaluation_df = pd.read_csv(evaluation_path)

# fold dfs: keep only the rows whose sequenceID is assigned to `fold`.
# The selection now follows the `fold` setting instead of a hard-coded 1.
# NOTE: the variable names still say "fold1" because the cells below refer to
# them by that name; they hold whichever fold is configured above.
fold_sequence_ids = fold_df.loc[fold_df['fold'] == fold, 'sequenceID']
inputs_fold1_df = inputs_df[inputs_df['sequenceID'].isin(fold_sequence_ids)]
outputs_fold1_df = outputs_df[outputs_df['sequenceID'].isin(fold_sequence_ids)]


# feature engineering transformations
def identity(x):
    """Return x unchanged."""
    return x


def log(x):
    """Return the natural logarithm of x (computed with NumPy)."""
    return np.log(x)


def loglog(x):
    """Return log(log(x)), i.e. the natural logarithm applied twice."""
    return np.log(np.log(x))

In [3]:
# Feature selection: the columns of the inputs table that are fed to the model,
# paired by position with the transformation applied to each column.
chosen_feature = ["length"]
f_engineering = [loglog]

In [4]:
def get_input_tensor(inputs_df, chosen_feature, f_engineer):
    """Build the model input tensor from selected, transformed feature columns.

    Args:
        inputs_df: DataFrame containing the columns named in ``chosen_feature``.
        chosen_feature: list of column names; column i of the frame selection
            becomes column i of the tensor.
        f_engineer: list of callables; ``f_engineer[i]`` is applied to column i.
            Columns beyond ``len(f_engineer)`` are left untransformed.

    Returns:
        A float32 ``torch.Tensor`` of shape (n_rows, len(chosen_feature)).
    """
    # Request a float64 *copy*: with an integer column the default to_numpy()
    # yields an int array, and writing the transformed values back in place
    # would silently truncate them to integers. The explicit copy also keeps
    # the in-place writes away from any (possibly read-only) pandas buffer.
    inputs = inputs_df[chosen_feature].to_numpy(dtype=np.float64, copy=True)
    for i, transform in enumerate(f_engineer):
        inputs[:, i] = transform(inputs[:, i])
    return torch.tensor(inputs, dtype=torch.float32)

In [5]:
def linear_training(inputs_df, outputs_df, chosen_feature, f_engineer, batch_size, margin, n_ites, verbose):
    """Fit a LinearModel to interval targets with a squared hinge loss.

    Args:
        inputs_df: DataFrame holding the feature columns.
        outputs_df: DataFrame with 'min.log.lambda' and 'max.log.lambda'
            columns (lower and upper target bound per row).
        chosen_feature: list of feature column names to use.
        f_engineer: list of callables; ``f_engineer[i]`` transforms feature i.
        batch_size: mini-batch size passed to the DataLoader.
        margin: margin passed to SquaredHingeLoss.
        n_ites: maximum number of epochs.
        verbose: if 1, print the loss after every epoch and on early stop.

    Returns:
        (model, loss): the trained model and, as a Python float, the loss over
        the full training data after the last epoch that ran. The model is the
        one from the last epoch, not necessarily the best-scoring epoch.
    """
    # inputs: take a float64 copy so that transformed values written back in
    # place are not truncated when the source column has an integer dtype.
    features = inputs_df[chosen_feature].to_numpy(dtype=np.float64, copy=True)
    for col, transform in enumerate(f_engineer):
        features[:, col] = transform(features[:, col])
    inputs = torch.tensor(features, dtype=torch.float32)

    # outputs: one (lower bound, upper bound) pair per row
    targets_low  = torch.Tensor(outputs_df['min.log.lambda'].to_numpy().reshape(-1, 1))
    targets_high = torch.Tensor(outputs_df['max.log.lambda'].to_numpy().reshape(-1, 1))
    outputs = torch.cat((targets_low, targets_high), dim=1)

    # prepare training dataset
    dataset    = TensorDataset(inputs, outputs)
    dataloader = DataLoader(dataset, batch_size, shuffle=False)

    # Instantiate model, loss function and optimizer
    model = LinearModel(inputs.shape[1])
    criterion = SquaredHingeLoss(margin)
    optimizer = optim.Adam(model.parameters())

    def full_data_loss():
        # Evaluation only: skip autograd bookkeeping and return a plain float
        # so no computation graph is kept alive between epochs.
        with torch.no_grad():
            return criterion(model(inputs), outputs).item()

    # Initialize early stopping parameters
    best_loss = float('inf')
    patience = 5  # Number of epochs to wait before early stopping
    num_bad_epochs = 0
    val_loss = None

    # Training loop
    for i in range(n_ites):
        for batch_input, batch_output in dataloader:
            optimizer.zero_grad()
            loss = criterion(model(batch_input), batch_output)
            loss.backward()
            optimizer.step()

        # "Validation" loss is measured on the full training data; there is
        # no held-out split in this function.
        val_loss = full_data_loss()

        if verbose == 1:
            print(f"{i}, loss: {val_loss}")

        # Check for early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            num_bad_epochs = 0
        else:
            num_bad_epochs += 1
            if num_bad_epochs >= patience:
                if verbose == 1:
                    print(f"Stopping early at epoch {i}, loss: {val_loss}")
                break

    # n_ites <= 0: no epoch ran, so report the loss of the untrained model
    # instead of failing on an unbound variable.
    if val_loss is None:
        val_loss = full_data_loss()

    return model, val_loss

In [6]:
def get_criterion(model, inputs_fold1_df, outputs_fold1_df, chosen_feature, f_engineering):
    """Score a model by how its predictions sit relative to the target bounds.

    The score is mean(prediction - lower bound) + mean(upper bound - prediction),
    where any infinite difference is replaced by 0 before averaging.

    Args:
        model: callable mapping the input tensor to predictions.
        inputs_fold1_df: DataFrame holding the feature columns.
        outputs_fold1_df: DataFrame with 'min.log.lambda' and 'max.log.lambda'.
        chosen_feature: list of feature column names.
        f_engineering: list of per-feature transformations.

    Returns:
        The scalar score described above.
    """
    features = get_input_tensor(inputs_fold1_df, chosen_feature, f_engineering)
    predicted = model(features).detach().numpy()

    lower = outputs_fold1_df['min.log.lambda'].to_numpy().reshape(-1, 1)
    upper = outputs_fold1_df['max.log.lambda'].to_numpy().reshape(-1, 1)

    dist_above_lower = predicted - lower
    dist_below_upper = upper - predicted

    # Infinite differences count as 0 in the averages.
    dist_above_lower = np.where(np.isinf(dist_above_lower), 0, dist_above_lower)
    dist_below_upper = np.where(np.isinf(dist_below_upper), 0, dist_below_upper)

    return dist_above_lower.mean() + dist_below_upper.mean()

In [7]:
def plot_solution(model, batch_size):
    """Plot the target bounds against the transformed feature, with the model fit.

    Reads the notebook-level ``inputs_fold1_df``, ``outputs_fold1_df``,
    ``chosen_feature`` and ``f_engineering``; only the first feature and its
    transformation are plotted. The figure is saved to
    'test_figure/<batch_size>.png'.

    Args:
        model: callable mapping an (n, 1) float tensor to predictions whose
            first column is plotted.
        batch_size: used for the plot title and the output file name.
    """
    import os  # local import: only needed to create the output directory

    feature_name = chosen_feature[0]
    transform = f_engineering[0]

    # Use the configured transformation for the scatter too, so the points and
    # the fitted line always share the same x-axis.
    x = transform(inputs_fold1_df[feature_name].to_numpy(dtype=float))
    y_start = outputs_fold1_df['min.log.lambda'].to_numpy()
    y_end = outputs_fold1_df['max.log.lambda'].to_numpy()

    # A fresh figure per call, so repeated calls do not draw over each other.
    fig, ax = plt.subplots()

    # scatter
    ax.scatter(x, y_start, color='r', s=2, label='min.log.lambda')
    ax.scatter(x, y_end, color='b', s=2, label='max.log.lambda')

    # solution: model output at the transformed min and max feature values
    min_input = transform(float(inputs_fold1_df[feature_name].min()))
    max_input = transform(float(inputs_fold1_df[feature_name].max()))
    endpoints = torch.tensor([[min_input], [max_input]], dtype=torch.float32)
    ax.plot([min_input, max_input], model(endpoints).detach().numpy()[:, 0])

    # Set labels and title
    ax.set_xlabel(feature_name)
    ax.set_ylabel('target')

    # legend
    ax.legend()

    # title
    score = round(get_criterion(model, inputs_fold1_df, outputs_fold1_df, chosen_feature, f_engineering), 2)
    ax.set_title('batch_size=' + str(batch_size) + "\ncriterion=" + str(score))

    # Save plot (create the output directory on first use)
    ax.grid(True)
    os.makedirs('test_figure', exist_ok=True)
    fig.savefig('test_figure/' + str(batch_size) + '.png')

In [8]:
# HYPER: grid search over batch size and hinge margin.
# Training and scoring both use the configured chosen_feature / f_engineering,
# so changing the feature configuration cannot make them disagree.
# NOTE(review): no random seed is set here, so results presumably vary between
# runs if LinearModel initialises its weights randomly -- confirm.
rows = []
for batch_size in [1, 10, 20, 40, 100, 200, 400]:
    for margin in [0, 1, 2]:
        model, total_loss = linear_training(
            inputs_fold1_df, outputs_fold1_df,
            chosen_feature, f_engineering,
            batch_size, margin,
            n_ites=500,  # upper bound on epochs; early stopping may end sooner
            verbose=0,   # silent
        )
        criterion = round(get_criterion(model, inputs_fold1_df, outputs_fold1_df, chosen_feature, f_engineering), 3)
        row = [batch_size, margin, criterion, total_loss]
        rows.append(row)

In [9]:
# Collect the grid-search results and show them ordered by criterion (ascending).
result_columns = ['batch_size', 'margin', 'criterion', 'total_loss']
df = pd.DataFrame(rows, columns=result_columns)
print(df.sort_values('criterion'))

    batch_size  margin  criterion  total_loss
16         200       1      2.074    0.465162
15         200       0      2.085    0.131569
13         100       1      2.134    0.459517
12         100       0      2.144    0.129008
10          40       1      2.173    0.458289
19         400       1      2.182    0.458272
7           20       1      2.183    0.458286
9           40       0      2.185    0.128354
4           10       1      2.188    0.458336
6           20       0      2.196    0.128333
18         400       0      2.197    0.128329
3           10       0      2.199    0.128340
17         200       2      2.238    1.352934
0            1       0      2.248    0.129165
1            1       1      2.263    0.462787
14         100       2      2.272    1.349849
11          40       2      2.294    1.349180
20         400       2      2.298    1.349169
8           20       2      2.299    1.349205
5           10       2      2.305    1.349325
2            1       2      2.386 