In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from opart_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset
from opart_functions import get_acc_rate, get_err_df, gen_data_dict, tune_lldas

In [None]:
# ---- raw inputs: signal sequences and their labels ----
seqs_path = 'raw_data/genome/signals.csv'
labels_path = 'raw_data/genome/labels.csv'

# ---- training inputs: feature table and per-fold targets ----
# NOTE(review): both fold feature paths point to the same CSV; presumably the
# feature table is shared across folds -- confirm this is intentional.
features_fold1_path = 'training_data/genome/seq_features.csv'
features_fold2_path = 'training_data/genome/seq_features.csv'
target_fold1_path = 'training_data/genome/target_fold1.csv'
target_fold2_path = 'training_data/genome/target_fold2.csv'

# ---- per-fold error tables (one error count per log_lambda) ----
err_fold1_path = 'training_data/genome/errors_fold1.csv'
err_fold2_path = 'training_data/genome/errors_fold2.csv'

# ---- outputs ----
acc_rate_path = 'acc_rate/genome.csv'            # accuracy-rate CSV
output_df_path = 'record_dataframe/genome/'      # directory for dataframe CSV dumps

In [None]:
# Build the sequence and label dictionaries from the raw CSV files
# (sequences first, then labels).
seqs_dict, labels_dict = (
    gen_data_dict(csv_path) for csv_path in (seqs_path, labels_path)
)

# Error-count tables, one per fold (error count for each log_lambda).
err_fold1_df, err_fold2_df = (
    pd.read_csv(csv_path) for csv_path in (err_fold1_path, err_fold2_path)
)

# Feature tables, one per fold.
features_df_fold1, features_df_fold2 = (
    pd.read_csv(csv_path) for csv_path in (features_fold1_path, features_fold2_path)
)

# Target tables, one per fold.
target_df_fold1, target_df_fold2 = (
    pd.read_csv(csv_path) for csv_path in (target_fold1_path, target_fold2_path)
)

In [None]:
# Pull columns 1 and 2 of each target table into separate (n, 1) tensors.
# Presumably column 1 is the lower bound and column 2 the upper bound of the
# target interval (going by the variable names) -- verify against the CSV schema.
targets_low_1, targets_high_1 = (
    torch.Tensor(target_df_fold1.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)
targets_low_2, targets_high_2 = (
    torch.Tensor(target_df_fold2.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)

# Stack the two columns side by side -> one (n, 2) tensor per fold.
target_fold1 = torch.cat([targets_low_1, targets_high_1], dim=1)
target_fold2 = torch.cat([targets_low_2, targets_high_2], dim=1)

In [None]:
class MLPModel(nn.Module):
    """Fully connected network with a single scalar output per input row.

    With ``hidden_layers == 0`` the model is a single linear map
    ``input_size -> 1``. Otherwise it is ``input_size -> hidden_size``,
    followed by ``hidden_layers - 1`` layers of ``hidden_size -> hidden_size``
    and a final ``hidden_size -> 1`` layer, with ReLU after every layer
    except the last.
    """

    # Executed once, when the class body is evaluated (not on every
    # instantiation): seeds torch's global RNG.
    torch.manual_seed(123)

    def __init__(self, input_size, hidden_layers, hidden_size):
        """Create the layers.

        Args:
            input_size: number of input features.
            hidden_layers: number of hidden layers; 0 selects the plain linear model.
            hidden_size: width of each hidden layer (unused when hidden_layers == 0).
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size

        if self.hidden_layers == 0:
            # Plain linear model
            self.linear_model = nn.Linear(input_size, 1)
            return

        # Layers are created in input -> hidden -> output order.
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers - 1)
        )
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a batch of feature rows to one output value per row."""
        if self.hidden_layers == 0:
            return self.linear_model(x)

        activation = torch.relu(self.input_layer(x))
        for hidden_layer in self.hidden:
            activation = torch.relu(hidden_layer(activation))
        return self.output_layer(activation)

In [None]:
def investigate_model(input_size, hidden_layers, hidden_size, batch_size, feature, targets, test_fold, seqs_dict, labels_dict, err_df, n_ites=1, eval_every=1):
    """Train an MLPModel and record an accuracy rate while training progresses.

    The global torch seed is reset to 123 on every call, so repeated calls
    with the same arguments start from the same RNG state.

    Args:
        input_size: number of input features passed to MLPModel.
        hidden_layers: number of hidden layers (0 gives the plain linear model).
        hidden_size: width of each hidden layer.
        batch_size: mini-batch size for the shuffled DataLoader.
        feature: tensor of input features; also used as the prediction input
            after each epoch.
        targets: tensor of training targets handed to SquaredHingeLoss.
        test_fold: fold identifier forwarded to get_err_df.
        seqs_dict: sequence dictionary forwarded to get_err_df.
        labels_dict: label dictionary forwarded to get_err_df.
        err_df: error-count dataframe forwarded to get_err_df.
        n_ites: the training loop runs ``n_ites + 1`` epochs (indices 0..n_ites).
        eval_every: evaluate and print on epochs whose index is a multiple of
            this value. The default of 1 evaluates after every epoch.

    Returns:
        List with one value returned by get_acc_rate per evaluated epoch.

    Raises:
        ValueError: if ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    torch.manual_seed(123)

    # prepare training dataset
    dataset    = TensorDataset(feature, targets)
    dataloader = DataLoader(dataset, batch_size, shuffle=True)

    # Instantiate model, loss function and optimizer
    model = MLPModel(input_size, hidden_layers, hidden_size)
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    rates = []
    for i in range(n_ites + 1):
        total_loss = 0
        for inputs, batch_targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Skip the prediction pass on epochs that are not evaluated.
        if i % eval_every != 0:
            continue

        # Predict on the full feature tensor without tracking gradients.
        with torch.no_grad():
            lldas = model(feature).numpy().reshape(-1)
        lldas = tune_lldas(lldas)

        df = get_err_df(lldas, test_fold, seqs_dict, labels_dict, err_df)
        rate = get_acc_rate(df)
        rates.append(rate)
        # epoch index, mean batch loss over the epoch, accuracy rate
        print(i, total_loss/len(dataloader), rate)

    return rates

In [None]:
# Selected feature columns, in the order they appear in the design matrix.
chosen_feature = ['std_deviation', 'length', 'sum_diff', 'range_value', 'abs_skewness']
X = features_df_fold1.iloc[:, 1:][chosen_feature].to_numpy()

# Column positions (within chosen_feature) that get log(log(.)) instead of a
# single log: 'length' and 'sum_diff'.
double_log_positions = (1, 2)

transformed_columns = []
for position in range(len(chosen_feature)):
    column = np.log(X[:, position])
    if position in double_log_positions:
        column = np.log(column)
    transformed_columns.append(column.reshape(-1, 1))
X0, X1, X2, X3, X4 = transformed_columns

# Assemble the matrix, then standardise every column to zero mean / unit std.
X = np.concatenate(transformed_columns, axis=1)
mean = X.mean(axis=0)
std_dev = X.std(axis=0)
X = torch.Tensor((X - mean) / std_dev)

In [None]:
# Linear model trained on fold-1 targets, evaluated against fold 2 (batch size 1).
rates_fold2 = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=1,
    feature=X,
    targets=target_fold1,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=100,
)

In [None]:
# Linear model trained on fold-2 targets, evaluated against fold 1 (batch size 1).
rates_fold1 = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=1,
    feature=X,
    targets=target_fold2,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=100,
)

In [None]:
# Fold 1 for both training targets and evaluation, batch size 1.
rates_fold1_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=1,
    feature=X,
    targets=target_fold1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=100,
)

In [None]:
# Fold 1 for both training targets and evaluation, batch size 2.
rates_fold1_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=2,
    feature=X,
    targets=target_fold1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=300,
)

In [None]:
# train_fold1, test_fold1, batch 3
# batch_size is 3 so the run matches this cell's label; with batch_size=2 the
# call is identical to the "batch 2" run and, because investigate_model
# re-seeds on every call, would only reproduce that run's result.
rates_fold1_train = investigate_model(X.shape[1], 0, 0, 3, X, target_fold1, 1, seqs_dict, labels_dict, err_fold1_df, 300)

In [None]:
# Fold 1 for both training targets and evaluation, batch size 4.
rates_fold1_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=4,
    feature=X,
    targets=target_fold1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=100,
)

In [None]:
# Fold 1 for both training targets and evaluation, batch size 8.
rates_fold1_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=8,
    feature=X,
    targets=target_fold1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=200,
)

In [None]:
# Fold 2 for both training targets and evaluation, batch size 1.
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=1,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=100,
)

In [None]:
# Fold 2 for both training targets and evaluation, batch size 2.
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=2,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=100,
)

In [None]:
# Fold 2 for both training targets and evaluation, batch size 4.
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=4,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=100,
)

In [None]:
# Fold 2 for both training targets and evaluation, batch size 16.
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=16,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=100,
)