In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import numpy as np
import pandas as pd
from opart_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset
from opart_functions import get_acc_rate, get_err_df, gen_data_dict, tune_lldas

In [2]:
# Training data: feature table(s) and one target table per fold.
# NOTE(review): both feature paths point at the same CSV — presumably the
# features are shared across folds; confirm this is intentional.
features_fold1_path = 'training_data/genome/seq_features.csv'
features_fold2_path = 'training_data/genome/seq_features.csv'  
target_fold1_path = 'training_data/genome/target_fold1.csv'
target_fold2_path = 'training_data/genome/target_fold2.csv'

# Raw sequences and their labels (loaded with gen_data_dict).
seqs_path   = 'raw_data/genome/signals.csv'
labels_path = 'raw_data/genome/labels.csv'

# Error table for each log_lambda, one file per fold.
err_fold1_path = 'training_data/genome/errors_fold1.csv'
err_fold2_path = 'training_data/genome/errors_fold2.csv'

# Destination CSV for accuracy rates.
acc_rate_path = 'acc_rate/genome.csv'

# Directory prefix under which result dataframes are written as CSV.
output_df_path = 'record_dataframe/genome/'

In [3]:
# Sequence and label lookups built from the raw CSV files.
seqs_dict, labels_dict = gen_data_dict(seqs_path), gen_data_dict(labels_path)

# Per-fold tables, each read in the order fold 1 then fold 2:
# error counts for each log_lambda ...
err_fold1_df, err_fold2_df = [
    pd.read_csv(csv_path) for csv_path in (err_fold1_path, err_fold2_path)
]

# ... input features ...
features_df_fold1, features_df_fold2 = [
    pd.read_csv(csv_path) for csv_path in (features_fold1_path, features_fold2_path)
]

# ... and regression targets.
target_df_fold1, target_df_fold2 = [
    pd.read_csv(csv_path) for csv_path in (target_fold1_path, target_fold2_path)
]

In [4]:
def _interval_bounds(target_df):
    """Return columns 1 and 2 of ``target_df`` as two (n_rows, 1) float tensors.

    Column 0 is skipped; presumably it is an identifier column — TODO confirm.
    """
    lower = torch.Tensor(target_df.iloc[:, 1:2].to_numpy())
    upper = torch.Tensor(target_df.iloc[:, 2:3].to_numpy())
    return lower, upper


targets_low_1, targets_high_1 = _interval_bounds(target_df_fold1)
targets_low_2, targets_high_2 = _interval_bounds(target_df_fold2)

# One row per sample: [low, high].
target_fold1 = torch.cat([targets_low_1, targets_high_1], dim=1)
target_fold2 = torch.cat([targets_low_2, targets_high_2], dim=1)

In [5]:
# Define the MLP model
class MLPModel(nn.Module):
    """Fully connected network with ReLU activations and a single scalar output.

    With ``hidden_layers == 0`` the model is a plain linear map
    ``input_size -> 1``. Otherwise it is
    ``input_size -> hidden_size`` followed by ``hidden_layers - 1`` layers of
    ``hidden_size -> hidden_size`` (each followed by ReLU) and a final
    ``hidden_size -> 1`` layer with no activation.

    Args:
        input_size: Number of input features.
        hidden_layers: Number of hidden layers; 0 selects the linear model.
        hidden_size: Width of every hidden layer (ignored when
            ``hidden_layers == 0``).
        init_value: Constant written into every weight and bias. The default
            (0.5) reproduces the original behaviour. Pass ``None`` to keep
            PyTorch's default random initialisation instead.

    Note:
        A constant initialisation makes every unit of a hidden layer start
        identical and receive identical gradients, so the units stay equal
        during training and the layer behaves as if it had width 1. Use
        ``init_value=None`` when the hidden width is meant to matter.
    """

    def __init__(self, input_size, hidden_layers, hidden_size, init_value=0.5):
        super().__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size
        self.init_value = init_value

        if self.hidden_layers == 0:
            self.linear_model = nn.Linear(input_size, 1)
        else:
            self.input_layer = nn.Linear(input_size, hidden_size)
            # The input layer counts as the first hidden layer, hence "- 1".
            self.hidden = nn.ModuleList(
                [nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers - 1)]
            )
            self.output_layer = nn.Linear(hidden_size, 1)

        self.initialize_parameters()

    def initialize_parameters(self):
        """Overwrite all weights and biases with ``self.init_value``.

        Does nothing when ``init_value`` is ``None``, leaving the values that
        ``nn.Linear`` assigned at construction time.
        """
        if self.init_value is None:
            return
        for param in self.parameters():
            init.constant_(param, self.init_value)

    def forward(self, x):
        """Map ``x`` of shape (..., input_size) to predictions of shape (..., 1)."""
        if self.hidden_layers == 0:
            return self.linear_model(x)

        x = torch.relu(self.input_layer(x))
        for layer in self.hidden:
            x = torch.relu(layer(x))
        return self.output_layer(x)

In [6]:
def investigate_model(input_size, hidden_layers, hidden_size, batch_size, feature, targets, test_fold, seqs_dict, labels_dict, err_df, n_ites=1, eval_every=1):
    """Train an ``MLPModel`` and record its accuracy rate as training proceeds.

    The global torch seed is reset to 123 on every call, and the data loader
    does not shuffle, so repeated calls with the same arguments are
    reproducible.

    Args:
        input_size: Number of input features per sample.
        hidden_layers: Number of hidden layers (0 gives a linear model).
        hidden_size: Width of each hidden layer.
        batch_size: Mini-batch size handed to the ``DataLoader``.
        feature: Input tensor; used both for training and for the predictions
            that are evaluated.
        targets: Target tensor paired row-by-row with ``feature``.
        test_fold: Fold identifier forwarded to ``get_err_df``.
        seqs_dict: Sequence dictionary forwarded to ``get_err_df``.
        labels_dict: Label dictionary forwarded to ``get_err_df``.
        err_df: Error table forwarded to ``get_err_df``.
        n_ites: Last pass index. Passes are numbered ``0..n_ites`` inclusive,
            so ``n_ites + 1`` passes over the data are run.
        eval_every: Evaluate and print on every pass whose index is a multiple
            of this value. The default of 1 evaluates after every pass.

    Returns:
        List of accuracy rates, one per evaluated pass, in pass order.

    Raises:
        ValueError: If ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    torch.manual_seed(123)

    # Prepare the training dataset (default DataLoader order: no shuffling).
    dataset    = TensorDataset(feature, targets)
    dataloader = DataLoader(dataset, batch_size)

    # Instantiate model, loss function and optimizer.
    model = MLPModel(input_size, hidden_layers, hidden_size)
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    rates = []
    for i in range(n_ites + 1):
        total_loss = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if i % eval_every != 0:
            continue

        # Predict for every row of `feature` without tracking gradients.
        # presumably these are predicted log-lambda values — TODO confirm
        with torch.no_grad():
            lldas = model(feature).numpy().reshape(-1)
        lldas = tune_lldas(lldas)

        df = get_err_df(lldas, test_fold, seqs_dict, labels_dict, err_df)
        rate = get_acc_rate(df)
        rates.append(rate)
        # Pass index, mean loss per batch over this pass, accuracy rate.
        print(i, total_loss/len(dataloader), rate)

    return rates

In [7]:
# Feature columns, in the order they enter the model.
chosen_feature = ['std_deviation', 'length', 'sum_diff', 'range_value', 'abs_skewness']
X = features_df_fold1.iloc[:, 1:][chosen_feature].to_numpy()


def _log_column(values):
    """Natural log of a 1-D array, returned as a column vector."""
    return np.log(values).reshape(-1, 1)


def _log_log_column(values):
    """Log applied twice to a 1-D array, returned as a column vector.

    Only finite for inputs greater than 1 — assumes the data satisfy this.
    """
    return np.log(np.log(values)).reshape(-1, 1)


# One transform per column: log for std_deviation, range_value and
# abs_skewness; log-log for length and sum_diff.
X0 = _log_column(X[:, 0])
X1 = _log_log_column(X[:, 1])
X2 = _log_log_column(X[:, 2])
X3 = _log_column(X[:, 3])
X4 = _log_column(X[:, 4])

# Reassemble the columns, then standardise each one to zero mean / unit variance.
X = np.hstack((X0, X1, X2, X3, X4))
mean = X.mean(axis=0)
std_dev = X.std(axis=0)
X = torch.Tensor((X - mean) / std_dev)

In [8]:
# Linear model (no hidden layers): train on fold-2 targets, test on fold 1
batch_size = 1
rates_fold1 = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=20,
)

0 2.6106668896338965 70.34574468085107
1 1.750301941467994 75.93085106382979
2 1.4002043309624648 79.25531914893617
3 1.2817521505981664 80.71808510638297
4 1.2339965331524902 80.98404255319149
5 1.2152120441692391 80.71808510638297
6 1.2090856607559497 80.58510638297872
7 1.2063345377845227 80.71808510638297
8 1.205349632886509 80.58510638297872
9 1.2051810882988563 80.58510638297872
10 1.2045974557458707 80.85106382978724
11 1.2052642586791322 80.58510638297872
12 1.2045526792412777 80.71808510638297
13 1.2045702969357737 80.85106382978724
14 1.2052218727787816 80.71808510638297
15 1.2044591433496663 80.85106382978724
16 1.204451192008827 80.85106382978724
17 1.2050861298110944 80.71808510638297
18 1.2043105521130932 80.85106382978724
19 1.20429711878817 80.85106382978724
20 1.2042668101050515 80.71808510638297


In [9]:
# Linear model (no hidden layers): train on fold-2 targets, test on fold 2, batch size 1
batch_size = 1
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=0,
    hidden_size=0,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=20,
)

0 2.6106668896338965 63.26923076923077
1 1.750301941467994 70.96153846153847
2 1.4002043309624648 76.34615384615384
3 1.2817521505981664 79.03846153846153
4 1.2339965331524902 79.8076923076923
5 1.2152120441692391 79.8076923076923
6 1.2090856607559497 79.03846153846153
7 1.2063345377845227 79.42307692307692
8 1.205349632886509 79.23076923076923
9 1.2051810882988563 79.23076923076923
10 1.2045974557458707 79.23076923076923
11 1.2052642586791322 79.23076923076923
12 1.2045526792412777 79.23076923076923
13 1.2045702969357737 79.23076923076923
14 1.2052218727787816 79.23076923076923
15 1.2044591433496663 79.23076923076923
16 1.204451192008827 79.23076923076923
17 1.2050861298110944 79.23076923076923
18 1.2043105521130932 79.23076923076923
19 1.20429711878817 79.23076923076923
20 1.2042668101050515 79.03846153846153


In [11]:
# MLP with 2 hidden layers of width 16: train on fold-2 targets, test on fold 1
batch_size = 1
rates_fold1 = investigate_model(
    input_size=X.shape[1],
    hidden_layers=2,
    hidden_size=16,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=100,
)

0 4370.904777812034 60.37234042553192
1 1239.9685860243028 60.37234042553192
2 473.9986102781991 63.962765957446805
3 224.0980851924448 64.09574468085107
4 120.09973370889766 67.81914893617021
5 69.70814097883628 67.81914893617021
6 42.74087296585275 67.95212765957447
7 27.22994566407082 68.88297872340425
8 17.862534931212547 68.48404255319149
9 11.99405924945993 69.14893617021276
10 8.223657928785524 69.41489361702128
11 5.782185261167636 70.07978723404256
12 4.204014212908404 70.61170212765957
13 3.166317033022191 70.61170212765957
14 2.4843900448996217 71.67553191489361
15 2.052734138571475 72.34042553191489
16 1.781351990122844 73.80319148936171
17 1.6125317924815517 74.06914893617021
18 1.5051454062292862 75.0
19 1.4307364147073605 75.39893617021276
20 1.374279949881238 75.39893617021276
21 1.34680724921986 76.59574468085107
22 1.331515154997073 76.86170212765957
23 1.3211521869220886 76.72872340425532
24 1.3117199237325396 76.59574468085107
25 1.304582190834049 76.72872340425532


In [14]:
# MLP with 2 hidden layers of width 16: train on fold-2 targets, test on fold 2, batch size 1
batch_size = 1
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=2,
    hidden_size=16,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=50,
)

0 4370.904777812034 27.884615384615383
1 1239.9685860243028 27.884615384615383
2 473.9986102781991 39.03846153846154
3 224.0980851924448 40.19230769230769
4 120.09973370889766 56.34615384615385
5 69.70814097883628 57.30769230769231
6 42.74087296585275 58.26923076923077
7 27.22994566407082 61.53846153846154
8 17.862534931212547 62.5
9 11.99405924945993 63.26923076923077
10 8.223657928785524 64.42307692307692
11 5.782185261167636 66.15384615384616
12 4.204014212908404 66.92307692307692
13 3.166317033022191 68.26923076923077
14 2.4843900448996217 69.8076923076923
15 2.052734138571475 70.96153846153847
16 1.781351990122844 72.5
17 1.6125317924815517 74.03846153846153
18 1.5051454062292862 74.42307692307692
19 1.4307364147073605 75.76923076923077
20 1.374279949881238 76.34615384615384
21 1.34680724921986 75.96153846153847
22 1.331515154997073 76.15384615384616
23 1.3211521869220886 76.15384615384616
24 1.3117199237325396 75.96153846153847
25 1.304582190834049 76.34615384615384
26 1.29790156

In [15]:
# MLP with 1 hidden layer of width 8: train on fold-2 targets, test on fold 1
batch_size = 1
rates_fold1 = investigate_model(
    input_size=X.shape[1],
    hidden_layers=1,
    hidden_size=8,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=100,
)

0 23.442506821442183 64.36170212765957
1 8.81400370057537 69.41489361702128
2 3.8433412979895505 71.27659574468085
3 2.1479744518407706 75.26595744680851
4 1.5500484322795034 77.52659574468085
5 1.3512789674070174 79.7872340425532
6 1.293065024993087 80.71808510638297
7 1.272657644738519 80.05319148936171
8 1.2648096946772802 79.65425531914893
9 1.2614717148514982 79.92021276595744
10 1.2578839894889269 79.52127659574468
11 1.2535935243305278 79.65425531914893
12 1.2491545511892215 79.92021276595744
13 1.2450693000788244 80.31914893617021
14 1.2407217425785815 80.31914893617021
15 1.2374924437831254 79.7872340425532
16 1.2338458440890132 79.65425531914893
17 1.2309909267133985 79.12234042553192
18 1.2283610465822898 78.45744680851064
19 1.2231128996743534 78.72340425531915
20 1.216316051281409 78.05851063829788
21 1.2152587148475351 78.32446808510639
22 1.2138637805396475 78.45744680851064
23 1.2131646958153839 78.45744680851064
24 1.2111834903506207 78.59042553191489
25 1.210697620425

In [16]:
# MLP with 1 hidden layer of width 8: train on fold-2 targets, test on fold 2, batch size 1
batch_size = 1
rates_fold2_train = investigate_model(
    input_size=X.shape[1],
    hidden_layers=1,
    hidden_size=8,
    batch_size=batch_size,
    feature=X,
    targets=target_fold2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=50,
)

0 23.442506821442183 42.11538461538461
1 8.81400370057537 59.23076923076923
2 3.8433412979895505 63.84615384615385
3 2.1479744518407706 68.84615384615384
4 1.5500484322795034 72.11538461538461
5 1.3512789674070174 75.38461538461539
6 1.293065024993087 75.96153846153847
7 1.272657644738519 78.07692307692308
8 1.2648096946772802 78.26923076923077
9 1.2614717148514982 78.65384615384616
10 1.2578839894889269 78.46153846153847
11 1.2535935243305278 78.46153846153847
12 1.2491545511892215 78.84615384615384
13 1.2450693000788244 79.03846153846153
14 1.2407217425785815 79.03846153846153
15 1.2374924437831254 78.84615384615384
16 1.2338458440890132 78.65384615384616
17 1.2309909267133985 78.65384615384616
18 1.2283610465822898 79.03846153846153
19 1.2231128996743534 77.88461538461539
20 1.216316051281409 77.6923076923077
21 1.2152587148475351 77.88461538461539
22 1.2138637805396475 77.6923076923077
23 1.2131646958153839 78.07692307692308
24 1.2111834903506207 77.6923076923077
25 1.2106976204250