In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
from utility_functions import SquaredHingeLoss, get_acc, add_row_to_csv

In [2]:
# Sequence IDs shared by the detailed and the systematic input tables.
# NOTE(review): the detailed table is read from inputs.csv here while the
# systematic one (and the later per-dataset load) uses inputs_old.csv —
# confirm that this mix of files is intended.
list1 = set(pd.read_csv("training_data/detailed/inputs.csv")['sequenceID'])
list2 = set(pd.read_csv("training_data/systematic/inputs_old.csv")['sequenceID'])
common_seqID_list = list1 & list2

In [3]:
# Name of the dataset to run on; it is spliced into the training_data/ and
# acc_rate_csvs/ file paths used by the rest of the notebook.
dataset = "detailed"

In [4]:
# Load the four tables of the selected dataset.
fold_split_df = pd.read_csv(f'training_data/{dataset}/folds.csv')
inputs_df     = pd.read_csv(f'training_data/{dataset}/inputs_old.csv')
outputs_df    = pd.read_csv(f'training_data/{dataset}/outputs.csv')
evaluation_df = pd.read_csv(f'training_data/{dataset}/evaluation.csv')

# Keep only the rows whose sequenceID is in common_seqID_list.
fold_split_df = fold_split_df.loc[fold_split_df['sequenceID'].isin(common_seqID_list)]
inputs_df     = inputs_df.loc[inputs_df['sequenceID'].isin(common_seqID_list)]
outputs_df    = outputs_df.loc[outputs_df['sequenceID'].isin(common_seqID_list)]
evaluation_df = evaluation_df.loc[evaluation_df['sequenceID'].isin(common_seqID_list)]

In [5]:
# Count of distinct (non-missing) fold labels in the fold assignment table.
n_fold = len(fold_split_df['fold'].dropna().unique())

In [6]:
# get df from test fold
def get_fold_dfs(test_fold, fold_split_df, inputs_df, outputs_df, evaluation_df):
    """Split the input, output and evaluation tables into train and test parts.

    Sequences whose ``fold`` equals ``test_fold`` form the test set; every
    other sequence in ``fold_split_df`` forms the training set.

    Args:
        test_fold: value of the ``fold`` column identifying the held-out fold.
        fold_split_df: DataFrame with ``sequenceID`` and ``fold`` columns.
        inputs_df: DataFrame with a ``sequenceID`` column.
        outputs_df: DataFrame with a ``sequenceID`` column.
        evaluation_df: DataFrame with a ``sequenceID`` column.

    Returns:
        Tuple ``(train_inputs_df, train_outputs_df, train_eval_df,
        test_inputs_df, test_eval_df)``. The train inputs and train outputs
        are sorted by ``sequenceID`` so their rows can be paired positionally;
        the other three frames keep their original row order.
    """
    # Build the fold mask once instead of re-filtering fold_split_df per table.
    is_test_fold = fold_split_df['fold'] == test_fold
    test_ids = fold_split_df.loc[is_test_fold, 'sequenceID']
    # Training data is everything NOT in the held-out fold; selecting the test
    # fold here would make train and test identical.
    train_ids = fold_split_df.loc[~is_test_fold, 'sequenceID']

    train_inputs_df  = inputs_df[inputs_df['sequenceID'].isin(train_ids)]
    train_outputs_df = outputs_df[outputs_df['sequenceID'].isin(train_ids)]
    train_eval_df    = evaluation_df[evaluation_df['sequenceID'].isin(train_ids)]
    test_inputs_df   = inputs_df[inputs_df['sequenceID'].isin(test_ids)]
    test_eval_df     = evaluation_df[evaluation_df['sequenceID'].isin(test_ids)]
    return train_inputs_df.sort_values(by='sequenceID'), train_outputs_df.sort_values(by='sequenceID'), train_eval_df, test_inputs_df, test_eval_df

In [7]:
class MLPModel(nn.Module):
    """Fully connected ReLU network that maps each input row to one output value.

    The network is an input layer, ``hidden_layers - 1`` square hidden layers
    of width ``hidden_size``, and a linear output layer of width 1.
    """

    def __init__(self, input_size, hidden_layers, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size

        # The input layer counts as the first hidden layer, hence the "- 1".
        n_extra_layers = hidden_layers - 1
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(n_extra_layers)]
        )
        self.output_layer = nn.Linear(hidden_size, 1)

        self.initialize_parameters()

    def initialize_parameters(self):
        """Redraw every weight and bias from a normal distribution N(0, 2^2)."""
        for tensor in self.parameters():
            nn.init.normal_(tensor, mean=0, std=2)

    def forward(self, x):
        """Run ``x`` through all ReLU layers and return the linear output."""
        for relu_layer in [self.input_layer, *self.hidden]:
            x = torch.relu(relu_layer(x))
        return self.output_layer(x)

In [8]:
# train model
def get_trained_model(inputs_tensor, outputs_tensor, hidden_layers, hidden_size,
                      patience=2000, max_epochs=100000):
    """Fit an MLPModel on the full training tensor with Adam and early stopping.

    Each epoch is one full-batch gradient step. Training stops once the
    training loss has failed to improve on its best value for ``patience``
    consecutive epochs, or after ``max_epochs`` epochs, whichever comes first.

    Args:
        inputs_tensor: 2-D tensor; ``shape[1]`` is used as the model input size.
        outputs_tensor: targets handed unchanged to ``SquaredHingeLoss``.
        hidden_layers: forwarded to ``MLPModel``.
        hidden_size: forwarded to ``MLPModel``.
        patience: number of non-improving epochs tolerated before stopping.
        max_epochs: hard upper bound on the number of epochs.

    Returns:
        The model with the weights of the last optimisation step (no
        checkpoint of the best-loss weights is kept).
    """
    model = MLPModel(inputs_tensor.shape[1], hidden_layers, hidden_size)
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())

    best_loss = float('inf')
    wait = 0
    for epoch in range(max_epochs):
        loss = criterion(model(inputs_tensor), outputs_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the best loss as a plain float so no loss tensor is kept alive
        # between epochs.
        current_loss = loss.item()
        if current_loss < best_loss:
            best_loss = current_loss
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break
    return model

In [9]:
# Cross-validation results for this dataset, used to pick the architecture.
cv_df = pd.read_csv(f'acc_rate_csvs/{dataset}_cv.csv')

In [10]:
def chose_best_architect(cv_df, test_fold, n_features):
    """Return ``(n_layer, layer_size)`` of the row with the highest ``val_acc``.

    Only rows of ``cv_df`` whose ``fold`` equals ``test_fold`` and whose
    ``n_features`` equals ``n_features`` are considered. Both returned values
    are cast to ``int``.
    """
    same_fold = cv_df['fold'] == test_fold
    same_features = cv_df['n_features'] == n_features
    ranked = cv_df.loc[same_fold & same_features].sort_values(by='val_acc', ascending=False)
    winner = ranked.iloc[0]
    return int(winner['n_layer']), int(winner['layer_size'])

In [11]:
def normalize(tensor):
    """Standardize ``tensor`` along dim 0 to zero mean and unit std.

    The mean and (unbiased) standard deviation are computed over dim 0 of
    ``tensor`` itself. A slice whose std is zero, or undefined (NaN) because
    there is only a single row, is divided by 1 instead, so it comes out as
    all zeros rather than NaN or inf.

    Args:
        tensor: floating-point tensor; dim 0 is the sample dimension.

    Returns:
        A new tensor of the same shape; the input is not modified.
    """
    mean = torch.mean(tensor, dim=0)
    std = torch.std(tensor, dim=0)
    # torch.std yields NaN when only one sample is available; treat that like
    # a constant feature so the division below stays finite.
    degenerate = (std == 0) | torch.isnan(std)
    safe_std = torch.where(degenerate, torch.ones_like(std), std)
    return (tensor - mean) / safe_std

In [12]:
# main: for every fold, train a model, predict on the test frame and log the accuracy
for test_fold in range(1, n_fold+1):
    # train/test frames for this fold, exactly as returned by get_fold_dfs
    fold_dfs = get_fold_dfs(test_fold, fold_split_df, inputs_df, outputs_df, evaluation_df)
    train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df = fold_dfs

    # feature tensors: drop the first column, convert to float tensors, standardize
    train_input_tensor = normalize(torch.Tensor(train_inputs_df.iloc[:, 1:].to_numpy()))
    # NOTE(review): the test features are standardized with their own mean/std,
    # not with the training statistics — confirm this is intended.
    test_input_tensor = normalize(torch.Tensor(test_inputs_df.iloc[:, 1:].to_numpy()))

    # target tensor: one (min.log.lambda, max.log.lambda) pair per training row
    targets_low  = torch.Tensor(train_outputs_df['min.log.lambda'].to_numpy()).reshape(-1, 1)
    targets_high = torch.Tensor(train_outputs_df['max.log.lambda'].to_numpy()).reshape(-1, 1)
    outputs_tensor = torch.cat((targets_low, targets_high), dim=1)

    # architecture comes from the cv table rows with n_features == 117
    hidden_layers, hidden_size = chose_best_architect(cv_df, test_fold, 117)
    trained_model = get_trained_model(train_input_tensor, outputs_tensor, hidden_layers, hidden_size)

    # predictions for the test rows, flattened to one value per sequence
    with torch.no_grad():
        lldas = trained_model(test_input_tensor).reshape(-1).numpy()

    # pair each prediction with its sequenceID and score it
    lldas_df = pd.DataFrame(list(zip(test_inputs_df['sequenceID'], lldas)), columns=['sequenceID', 'llda'])
    acc = get_acc(test_eval_df, lldas_df)

    # append the result for this fold to the accuracy log
    add_row_to_csv(
        f'acc_rate_csvs/{dataset}.csv',
        ['method', 'fold', 'feature engineer', 'acc'],
        ['mlp.117', test_fold, 'no', acc],
    )