In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from opart_functions import SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset
from opart_functions import get_acc_rate, get_err_df, gen_data_dict, tune_lldas

In [None]:
# PATHs: every path below is derived from DATASET_NAME, so switching to another
# dataset stored under the same directory layout only needs this one edit.
DATASET_NAME = 'genome'

# training data
# NOTE(review): both folds point at the same features file; only the target
# files differ per fold -- confirm this is intended.
features_fold1_path = f'training_data/{DATASET_NAME}/seq_features.csv'
features_fold2_path = f'training_data/{DATASET_NAME}/seq_features.csv'
target_fold1_path = f'training_data/{DATASET_NAME}/target_fold1.csv'
target_fold2_path = f'training_data/{DATASET_NAME}/target_fold2.csv'

# sequences and labels
seqs_path   = f'raw_data/{DATASET_NAME}/signals.csv'
labels_path = f'raw_data/{DATASET_NAME}/labels.csv'

# err for each log_lambda
err_fold1_path = f'training_data/{DATASET_NAME}/errors_fold1.csv'
err_fold2_path = f'training_data/{DATASET_NAME}/errors_fold2.csv'

# writing accuracy rate path
acc_rate_path = f'acc_rate/{DATASET_NAME}.csv'

# path to write df to csv
output_df_path = f'record_dataframe/{DATASET_NAME}/'

In [None]:
# Linear model definition
class LinearModel(nn.Module):
    """A single fully connected layer mapping `input_size` features to one output."""

    # Executed once, when the class body runs (not per instance): seeds
    # torch's global random number generator.
    torch.manual_seed(123)

    def __init__(self, input_size=1):
        """Create the layer; `input_size` is the number of input features."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        prediction = self.linear(x)
        return prediction


In [None]:
def investigate_model(feature, targets, batch_size, test_fold, seqs_dict, labels_dict, err_df, n_ites=1, eval_every=10):
    """Train a `LinearModel` with squared hinge loss and track its accuracy rate.

    Args:
        feature: tensor of model inputs. It is paired with `targets` for
            training and also fed to the model as a whole at evaluation time.
        targets: tensor paired with `feature` in the `TensorDataset` and
            passed to the loss as the labels.
        batch_size: batch size forwarded to the `DataLoader`.
        test_fold: forwarded unchanged to `get_err_df`.
        seqs_dict: forwarded unchanged to `get_err_df`.
        labels_dict: forwarded unchanged to `get_err_df`.
        err_df: forwarded unchanged to `get_err_df`.
        n_ites: index of the last epoch. Epochs are numbered 0..n_ites
            inclusive, so `n_ites + 1` passes over the data are made.
        eval_every: the model is evaluated on every epoch whose index is a
            multiple of this value (default 10).

    Returns:
        List of the values returned by `get_acc_rate`, one per evaluated epoch,
        in epoch order.

    Raises:
        ValueError: if `eval_every` is not positive.
    """
    if eval_every <= 0:
        raise ValueError("eval_every must be a positive integer")

    # Reseed torch's global RNG on every call.
    torch.manual_seed(123)

    # prepare training dataset
    dataset    = TensorDataset(feature, targets)
    dataloader = DataLoader(dataset, batch_size, shuffle=True)

    # Instantiate model, loss function and optimizer
    model = LinearModel()
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    rates = []
    for i in range(n_ites + 1):
        total_loss = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Predictions are only consumed on evaluation epochs, so the forward
        # pass over the full feature tensor and the tuning step live inside
        # this branch instead of running after every epoch.
        if i % eval_every == 0:
            with torch.no_grad():
                lldas = model(feature).numpy().reshape(-1)
            lldas = tune_lldas(lldas)

            df = get_err_df(lldas, test_fold, seqs_dict, labels_dict, err_df)
            rate = get_acc_rate(df)
            rates.append(rate)
            # epoch index, mean batch loss of this epoch, accuracy rate
            print(i, total_loss/len(dataloader), rate)

    return rates

In [None]:
# sequence and label dictionaries, built from the raw csv files
seqs_dict, labels_dict = (gen_data_dict(path) for path in (seqs_path, labels_path))

# per-fold dataframes of error counts for each log_lambda
err_fold1_df, err_fold2_df = map(pd.read_csv, (err_fold1_path, err_fold2_path))

# per-fold feature dataframes
features_df_fold1, features_df_fold2 = map(pd.read_csv, (features_fold1_path, features_fold2_path))

# per-fold target dataframes
target_df_fold1, target_df_fold2 = map(pd.read_csv, (target_fold1_path, target_fold2_path))

In [None]:
# feature: log10(ln(x)) of the 'length' column, shaped as one column and
# wrapped in a torch tensor
feature = torch.Tensor(
    np.log10(np.log(features_df_fold2['length'].to_numpy())).reshape(-1, 1)
)

In [None]:
# targets: columns 1 and 2 of each fold's target dataframe, each kept as its
# own single-column tensor
targets_low_1, targets_high_1 = (
    torch.Tensor(target_df_fold1.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)
targets_low_2, targets_high_2 = (
    torch.Tensor(target_df_fold2.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)

# two-column (low, high) target tensor per fold
target_fold1 = torch.cat([targets_low_1, targets_high_1], dim=1)
target_fold2 = torch.cat([targets_low_2, targets_high_2], dim=1)

In [None]:
# train on fold1 targets, evaluate on fold2, batch size 1
rates_fold1 = investigate_model(
    feature=feature,
    targets=target_fold1,
    batch_size=1,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=300,
)

In [None]:
# train on fold2 targets, evaluate on fold1, batch size 1
rates_fold2 = investigate_model(
    feature=feature,
    targets=target_fold2,
    batch_size=1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=300,
)

In [None]:
# train fold1, test fold1, batch 1
# `rates_fold1_train` is rebound by the batch-2 run in the next cell, so the
# batch-1 result is also kept under its own name to stay available afterwards.
rates_fold1_train_batch1 = rates_fold1_train = investigate_model(
    feature=feature,
    targets=target_fold1,
    batch_size=1,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=500,
)

In [None]:
# train on fold1 targets, evaluate on fold1, batch size 2
rates_fold1_train = investigate_model(
    feature=feature,
    targets=target_fold1,
    batch_size=2,
    test_fold=1,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold1_df,
    n_ites=500,
)

In [None]:
# train fold2, test fold2, batch 1
# `rates_fold2_train` is rebound by the batch-2 run in the next cell, so the
# batch-1 result is also kept under its own name to stay available afterwards.
rates_fold2_train_batch1 = rates_fold2_train = investigate_model(
    feature=feature,
    targets=target_fold2,
    batch_size=1,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=500,
)

In [None]:
# train on fold2 targets, evaluate on fold2, batch size 2
rates_fold2_train = investigate_model(
    feature=feature,
    targets=target_fold2,
    batch_size=2,
    test_fold=2,
    seqs_dict=seqs_dict,
    labels_dict=labels_dict,
    err_df=err_fold2_df,
    n_ites=500,
)