In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from opart_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset
from opart_functions import get_acc_rate, get_err_df, gen_data_dict

# Fix the seed of torch's global random number generator so repeated runs of the
# notebook start from the same random state.
torch.manual_seed(123)

<torch._C.Generator at 0x1a27e4c7c70>

In [2]:
# PATHs (edit these paths depending on dataset)
# training data: one features CSV and one target CSV per fold
# NOTE(review): both fold feature paths point at the same file (seq_features.csv),
# while the target and error files are fold-specific. Presumably the feature table
# is shared across folds -- confirm this is intentional and not a copy-paste slip.
features_fold1_path = 'training_data/genome/seq_features.csv'
features_fold2_path = 'training_data/genome/seq_features.csv'  
target_fold1_path = 'training_data/genome/target_fold1.csv'
target_fold2_path = 'training_data/genome/target_fold2.csv'

# sequences and labels (raw inputs, passed to gen_data_dict)
seqs_path   = 'raw_data/genome/signals.csv'
labels_path = 'raw_data/genome/labels.csv'

# err for each log_lambda (one CSV per fold, read with pd.read_csv)
err_fold1_path = 'training_data/genome/errors_fold1.csv'
err_fold2_path = 'training_data/genome/errors_fold2.csv'

# writing accuracy rate path (not referenced by the cells visible here)
acc_rate_path = 'acc_rate/genome.csv'

# path to write df to csv (directory prefix; not referenced by the cells visible here)
output_df_path = 'record_dataframe/genome/'

In [3]:
# tuning lldas
def tune_lldas(lldas):
    """Snap predicted values onto the half-unit grid restricted to [-5, 5].

    Each value is rounded to the nearest multiple of 0.5 (via ``np.round`` on
    the doubled value), limited to the closed interval [-5.0, 5.0], and any
    resulting negative zero is rewritten as positive zero.

    Parameters
    ----------
    lldas : numpy.ndarray
        Array of raw predictions (presumably log-lambda values -- the name
        suggests so; confirm against the caller). Not modified in place.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape holding the snapped values.
    """
    # round to the nearest half by rounding the doubled value, then halving
    half_steps = np.round(lldas * 2) / 2
    # keep everything inside the [-5, 5] window
    snapped = np.clip(half_steps, -5.0, 5.0)
    # rounding small negatives yields -0.0; normalise it to a plain 0.0
    zero_mask = np.isclose(snapped, -0.0)
    snapped[zero_mask] = 0.0
    return snapped

In [4]:
# Define the linear model
class LinearModel(nn.Module):
    """Single affine layer mapping ``input_size`` features to one output value."""

    def __init__(self, input_size=1):
        """Create the layer.

        Parameters
        ----------
        input_size : int, optional
            Number of input features per sample (default 1).
        """
        super().__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        return self.linear(x)

In [5]:
def investigate_model(feature, targets, test_fold, seqs_dict, labels_dict, err_df, n_ites=1):
    """Train a fresh ``LinearModel`` and record an accuracy rate after every epoch.

    For each of ``n_ites`` epochs the model is trained with Adam (lr 0.001) on
    single-sample, shuffled batches using ``SquaredHingeLoss``. After the epoch
    the model is evaluated on the full ``feature`` tensor, the predictions are
    post-processed by ``tune_lldas``, and the result is scored through
    ``get_err_df`` / ``get_acc_rate``. Every 10th epoch a progress line
    ``epoch, mean batch loss, rate`` is printed.

    Parameters
    ----------
    feature : torch.Tensor
        Model inputs; used both for training and for the per-epoch evaluation.
    targets : torch.Tensor
        Training targets paired row-by-row with ``feature``.
    test_fold, seqs_dict, labels_dict, err_df
        Forwarded unchanged to ``get_err_df`` (their exact meaning is defined
        by that helper, which is not shown here).
    n_ites : int, optional
        Number of training epochs (default 1).

    Returns
    -------
    numpy.ndarray
        Array of length ``n_ites`` with the value of ``get_acc_rate`` after
        each epoch.
    """
    # one (feature, target) pair per optimisation step, reshuffled every epoch
    loader = DataLoader(TensorDataset(feature, targets), batch_size=1, shuffle=True)

    # model, loss and optimiser (created in this order, as the model init draws random numbers)
    net = LinearModel()
    loss_fn = SquaredHingeLoss()
    adam = optim.Adam(net.parameters(), lr=0.001)

    n_batches = len(loader)
    rates = np.zeros(n_ites)

    for epoch in range(n_ites):
        # --- one pass over the training data ---
        epoch_loss = 0
        for batch_x, batch_y in loader:
            adam.zero_grad()
            batch_loss = loss_fn(net(batch_x), batch_y)
            batch_loss.backward()
            adam.step()
            epoch_loss += batch_loss.item()

        # --- evaluate on the whole feature tensor without tracking gradients ---
        with torch.no_grad():
            raw_predictions = net(feature).numpy().reshape(-1)
        lldas = tune_lldas(raw_predictions)

        rate = get_acc_rate(get_err_df(lldas, test_fold, seqs_dict, labels_dict, err_df))
        rates[epoch] = rate

        # progress report every 10 epochs
        if epoch % 10 == 0:
            print(epoch, epoch_loss / n_batches, rate)

    return rates

In [6]:
# sequence and label dictionaries built by gen_data_dict from the raw CSV paths
seqs_dict, labels_dict = gen_data_dict(seqs_path), gen_data_dict(labels_path)

# per-fold dataframes of error counts for each log_lambda
err_fold1_df, err_fold2_df = (
    pd.read_csv(csv_path) for csv_path in (err_fold1_path, err_fold2_path)
)

# per-fold feature dataframes
features_df_fold1, features_df_fold2 = (
    pd.read_csv(csv_path) for csv_path in (features_fold1_path, features_fold2_path)
)

# per-fold target dataframes
target_df_fold1, target_df_fold2 = (
    pd.read_csv(csv_path) for csv_path in (target_fold1_path, target_fold2_path)
)

In [7]:
# feature: log10(log(length)) of every row, as a single-column float tensor
seq_lengths = features_df_fold1['length'].to_numpy()
feature = torch.Tensor(np.log10(np.log(seq_lengths)).reshape(-1, 1))

# target: columns 1 and 2 of the fold-1 target table, placed side by side
targets_low = torch.Tensor(target_df_fold1.iloc[:, 1:2].to_numpy())
targets_high = torch.Tensor(target_df_fold1.iloc[:, 2:3].to_numpy())
targets = torch.cat([targets_low, targets_high], dim=1)

# train for 600 epochs and keep the per-epoch accuracy rates for fold 1
rates_fold1 = investigate_model(
    feature,
    targets,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=600,
)

0 1.4558656744058793 73.40425531914893
10 1.1789587444899494 73.5372340425532
20 1.1724315304851567 73.5372340425532
30 1.1665644250990874 73.5372340425532
40 1.160835208608326 73.5372340425532
50 1.1561399145159232 73.5372340425532
60 1.1509236434153107 73.5372340425532
70 1.1462360385858361 73.5372340425532
80 1.1420744472802642 73.5372340425532
90 1.138689607403807 73.5372340425532
100 1.134138211605346 73.5372340425532
110 1.1307993332960262 73.80319148936171
120 1.127133716547244 74.06914893617021
130 1.1235664027566024 74.46808510638297
140 1.1205962569673813 74.46808510638297
150 1.1176178903424927 74.46808510638297
160 1.1143917660536153 74.60106382978724
170 1.1115625733401138 75.39893617021276
180 1.1085348319257755 76.06382978723404
190 1.1062683385446472 76.99468085106383
200 1.1038130673983801 77.26063829787235
210 1.1015885615374705 77.26063829787235
220 1.0992741227522236 77.26063829787235
230 1.0972639925682215 77.65957446808511
240 1.09529998222501 77.3936170212766
250

In [9]:
# feature: log10(log(length)) of every row, as a single-column float tensor
seq_lengths = features_df_fold2['length'].to_numpy()
feature = torch.Tensor(np.log10(np.log(seq_lengths)).reshape(-1, 1))

# target: columns 1 and 2 of the fold-2 target table, placed side by side
targets_low = torch.Tensor(target_df_fold2.iloc[:, 1:2].to_numpy())
targets_high = torch.Tensor(target_df_fold2.iloc[:, 2:3].to_numpy())
targets = torch.cat([targets_low, targets_high], dim=1)

# train for 600 epochs and keep the per-epoch accuracy rates for fold 2
rates_fold2 = investigate_model(
    feature,
    targets,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=600,
)

0 2.1947634105996814 79.7872340425532
10 1.458996225005118 82.7127659574468
20 1.4504048722254934 82.7127659574468
30 1.4414159822404244 82.7127659574468
40 1.4323317488553318 83.24468085106383
50 1.4256494070560173 83.24468085106383
60 1.4186962546554898 83.37765957446808
70 1.4117151903189218 81.78191489361703
80 1.4054578807556233 80.98404255319149
90 1.3998278624736116 80.98404255319149
100 1.3938875863607556 80.85106382978724
110 1.3881591434181368 80.98404255319149
120 1.3838611328328052 81.25
130 1.3797946108050814 81.25
140 1.3761154726414748 81.11702127659575
150 1.3726494588484437 81.25
160 1.3695833095739656 81.38297872340425
170 1.3666615826824735 81.38297872340425
180 1.3632155270725024 81.38297872340425
190 1.36071176676106 81.51595744680851
200 1.3581257731679497 81.51595744680851
210 1.3557019466576736 81.51595744680851
220 1.3527316430893428 81.78191489361703
230 1.3511934355701822 81.78191489361703
240 1.3499093999340386 81.78191489361703
250 1.348667060972732 81.7819