In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from utility_functions import SquaredHingeLoss, get_acc, add_row_to_csv

In [2]:
# dataset: name used to build the 'training_data/<dataset>/' input paths
# and the 'acc_rate_csvs/<dataset>.csv' results path
dataset = "systematic"

In [3]:
# get dataframes: read the four CSV files of the selected dataset, in the
# order folds -> inputs -> outputs -> evaluation
fold_split_df, inputs_df, outputs_df, evaluation_df = [
    pd.read_csv('training_data/' + dataset + csv_suffix)
    for csv_suffix in ('/folds.csv', '/inputs.csv', '/outputs.csv', '/evaluation.csv')
]

In [4]:
# number of folds: count of distinct (non-missing) labels in the 'fold' column
n_fold = len(fold_split_df['fold'].dropna().unique())

In [5]:
# feature engineering transformation
def identity(x):
    """Return `x` unchanged."""
    return x


def log(x):
    """Return the natural logarithm of `x` (element-wise for arrays)."""
    return np.log(x)


def loglog(x):
    """Return log(log(x)), i.e. the natural logarithm applied twice."""
    inner = np.log(x)
    return np.log(inner)

In [6]:
# get df from test fold
def get_fold_dfs(test_fold, fold_split_df, inputs_df, outputs_df, evaluation_df):
    """Split the input, output and evaluation frames into train/test parts.

    Sequences whose 'fold' value in `fold_split_df` equals `test_fold` form
    the test part; every other sequence forms the training part. Rows are
    matched through the 'sequenceID' column.

    Returns a 5-tuple:
        (train_inputs, train_outputs, train_eval, test_inputs, test_eval)
    Only train_inputs and train_outputs are sorted by 'sequenceID'; the
    other three frames keep the row order of their source frame.
    """
    fold_column = fold_split_df['fold']
    held_out_ids = fold_split_df.loc[fold_column == test_fold, 'sequenceID']
    training_ids = fold_split_df.loc[fold_column != test_fold, 'sequenceID']

    def rows_for(frame, ids):
        # keep only the rows whose sequenceID belongs to `ids`
        return frame[frame['sequenceID'].isin(ids)]

    train_inputs_df = rows_for(inputs_df, training_ids).sort_values(by='sequenceID')
    train_outputs_df = rows_for(outputs_df, training_ids).sort_values(by='sequenceID')
    train_eval_df = rows_for(evaluation_df, training_ids)
    test_inputs_df = rows_for(inputs_df, held_out_ids)
    test_eval_df = rows_for(evaluation_df, held_out_ids)
    return train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df

In [7]:
# Define the linear model
class LinearModel(nn.Module):
    """A single affine layer mapping `input_size` features to one output value."""

    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return its output."""
        prediction = self.linear(x)
        return prediction

In [8]:
# train model
def get_trained_model(inputs_tensor, outputs_tensor, max_epochs=10000, patience=500):
    """Fit a LinearModel on the full training set with Adam and early stopping.

    Parameters
    ----------
    inputs_tensor : torch.Tensor
        2-D feature tensor; its second dimension sets the model's input size.
    outputs_tensor : torch.Tensor
        Targets passed unchanged to SquaredHingeLoss (imported from
        utility_functions; its exact expectations are not visible here).
    max_epochs : int, optional
        Upper bound on the number of full-batch optimisation steps.
    patience : int, optional
        Stop once this many consecutive epochs failed to improve on the
        lowest loss seen so far.

    Returns
    -------
    LinearModel
        The model with the weights from the last optimisation step (not
        necessarily the weights that produced the lowest loss).
    """
    model = LinearModel(inputs_tensor.shape[1])
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())  # Adam with its default settings

    best_loss = float('inf')
    epochs_without_improvement = 0
    for _ in range(max_epochs):
        loss = criterion(model(inputs_tensor), outputs_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the loss as a plain Python float so that the best value does
        # not keep an autograd tensor (and its graph) alive between epochs.
        # Note the loss was evaluated before this epoch's parameter update.
        current_loss = loss.item()
        if current_loss < best_loss:
            best_loss = current_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break
    return model

In [9]:
# main
# For every test fold and every feature set: train a linear model on the
# remaining folds, predict on the test fold, score the predictions with
# get_acc and append one row to 'acc_rate_csvs/<dataset>.csv'.

def _build_feature_tensor(df, feature_names, transforms):
    """Return a float32 tensor of `feature_names` columns with `transforms[i]` applied to column i."""
    # Force a writable float copy. Without dtype=float, a selection made only
    # of integer columns (possibly the ['length'] case -- depends on how the
    # CSV is parsed) yields an integer array, and the log values written back
    # into it below would be silently truncated to integers.
    features = df[feature_names].to_numpy(dtype=float, copy=True)
    for col, transform in enumerate(transforms):
        features[:, col] = transform(features[:, col])
    return torch.tensor(features, dtype=torch.float32)


# (columns to use, transformation applied to each of those columns)
feature_sets = [
    (['length'],                                   [loglog]),
    (['length', 'sd'],                             [loglog, log]),
    (['length', 'sd', 'range_value', 'sum_diff'],  [loglog, log, log, log]),
]

# Fix the RNG so that the random weight initialisation of every model, and
# therefore the reported accuracies, are reproducible across runs.
torch.manual_seed(0)

# NOTE(review): assumes folds are labelled 1..n_fold -- verify against folds.csv
for test_fold in range(1, n_fold+1):
    # fold dataframes only depend on the fold, so build them once per fold
    train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df = get_fold_dfs(
        test_fold, fold_split_df, inputs_df, outputs_df, evaluation_df)

    # output tensor: one row per training sequence, columns = (lower, upper) target
    outputs_tensor = torch.tensor(
        train_outputs_df[['min.log.lambda', 'max.log.lambda']].to_numpy(dtype=float),
        dtype=torch.float32)

    for chosen_features, feature_engineer in feature_sets:
        # input tensors (train rows are sorted by sequenceID, like the outputs)
        train_input_tensor = _build_feature_tensor(train_inputs_df, chosen_features, feature_engineer)
        test_input_tensor  = _build_feature_tensor(test_inputs_df, chosen_features, feature_engineer)

        # trained model
        trained_model = get_trained_model(train_input_tensor, outputs_tensor)

        # predictions for the test fold, flattened to one value per sequence
        with torch.no_grad():
            lldas = trained_model(test_input_tensor).numpy().reshape(-1)

        # get acc: predictions are paired with sequenceIDs in test_inputs_df row order
        lldas_df = pd.DataFrame(list(zip(test_inputs_df['sequenceID'], lldas)), columns=['sequenceID', 'llda'])
        acc = get_acc(test_eval_df, lldas_df)
        add_row_to_csv('acc_rate_csvs/' + dataset + '.csv',
                       ['method', 'fold', 'feature engineer', 'acc'],
                       ['linear.'+str(len(chosen_features)), test_fold, 'yes', acc])