In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipynb.fs.full.utility_functions import gen_data_dict, get_data, SquaredHingeLoss
from torch.utils.data import DataLoader, TensorDataset

# Single seed shared by every RNG used in this notebook so runs are repeatable.
SEED = 123

np.set_printoptions(precision=3)  # print numpy arrays with 3 decimal places
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1b5ff5bf390>

In [2]:
# Feature: one value per row of the 'count' column, transformed as log10(ln(count))
# and laid out as a single-column matrix.
# NOTE(review): a count <= 1 would yield -inf/NaN here -- presumably every count
# is > 1; confirm against the CSV.
counts = pd.read_csv('1_training_data/seq_features.csv')['count'].to_numpy()
data = np.log10(np.log(counts)).reshape(-1, 1)
X = torch.FloatTensor(data)

In [3]:
# Targets: one table per fold. Columns 1 and 2 of each table are read as the
# low / high ends of the target interval (presumably column 0 is an identifier --
# TODO confirm against the CSV header).
target_df_1 = pd.read_csv('1_training_data/target_lambda_fold1_base_10.csv')
target_df_2 = pd.read_csv('1_training_data/target_lambda_fold2_base_10.csv')

# Each slice keeps its column dimension, so every tensor is (rows, 1).
targets_low_1, targets_high_1 = (
    torch.FloatTensor(target_df_1.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)
targets_low_2, targets_high_2 = (
    torch.FloatTensor(target_df_2.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)

# Side by side: column 0 = low, column 1 = high.
y1 = torch.cat([targets_low_1, targets_high_1], dim=1)
y2 = torch.cat([targets_low_2, targets_high_2], dim=1)

In [4]:
# Linear model: a single affine layer with one output
class LinearModel(nn.Module):
    """Affine map from ``input_size`` features to one scalar output per sample."""

    def __init__(self, input_size=1):
        """Create the layer.

        Args:
            input_size: number of input features per sample (default 1).
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.linear(x)
        return prediction

In [5]:
def train_model(X, y, lr, epochs):
    """Fit a fresh ``LinearModel`` on ``(X, y)`` using Adam and ``SquaredHingeLoss``.

    Samples are fed one at a time (batch size 1) in a reshuffled order every
    epoch. On every 100th epoch the mean per-batch loss of that epoch is printed.

    Args:
        X: feature tensor; the first dimension indexes samples.
        y: target tensor aligned with ``X`` along the first dimension.
        lr: learning rate passed to ``optim.Adam``.
        epochs: number of full passes over the data.

    Returns:
        The trained ``LinearModel`` instance.
    """
    # Data pipeline
    loader = DataLoader(TensorDataset(X, y), batch_size=1, shuffle=True)

    # Model, loss and optimizer
    net = LinearModel()
    loss_fn = SquaredHingeLoss()
    adam = optim.Adam(net.parameters(), lr)

    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for batch_x, batch_y in loader:
            adam.zero_grad()
            batch_loss = loss_fn(net(batch_x), batch_y)
            batch_loss.backward()
            adam.step()
            running_loss += batch_loss.item()

        # Periodic progress report: average loss over this epoch's batches
        if epoch_no % 100 == 0:
            mean_loss = running_loss / len(loader)
            print(f'Epoch [{epoch_no}/{epochs}], Loss: {mean_loss:.3f}')
    return net

In [6]:
def get_df_stat(ldas1, ldas2, err_fold1_df, err_fold2_df, seqs, labels):
    """Build a per-sequence table of chosen lambdas, label counts and errors.

    For every sequence index ``i`` the function rounds the two predicted
    lambdas to the nearest multiple of 0.5, looks up the error count stored in
    the matching column of each fold's error table, and counts the labels of
    each fold via ``get_data``.

    Args:
        ldas1: iterable of predicted lambdas for fold 1, one per sequence.
        ldas2: iterable of predicted lambdas for fold 2, one per sequence.
        err_fold1_df: DataFrame whose row ``i`` belongs to sequence ``i`` and
            whose column names are the string form of the rounded lambda
            values (e.g. ``'3.5'``).
        err_fold2_df: same layout as ``err_fold1_df``, for fold 2.
        seqs: indexable collection of sequences; ``seqs[i][0]`` is used as the
            sequence identifier.
        labels: label data forwarded unchanged to ``get_data``.

    Returns:
        pandas.DataFrame with columns ``sequenceID``, ``lda_fold1``,
        ``lda_fold2``, ``fold_1_total_labels``, ``fold_2_total_labels``,
        ``fold1_err`` and ``fold2_err``; one row per sequence.

    Raises:
        KeyError: if a rounded lambda has no matching column in the
            corresponding error table.
    """
    header = ['sequenceID', 'lda_fold1', 'lda_fold2', 'fold_1_total_labels', 'fold_2_total_labels', 'fold1_err', 'fold2_err']

    # Round every lambda to the nearest 0.5 once, before the loop. Rounding is
    # idempotent, so this matches the previous per-iteration re-rounding of the
    # whole list while avoiding the quadratic amount of work.
    ldas1 = [round(num*2)/2 for num in ldas1]
    ldas2 = [round(num*2)/2 for num in ldas2]

    rows = []
    for i in range(len(seqs)):
        # Total labels per fold = negative + positive label starts.
        # (meaning of the discarded return values is not visible here)
        _, neg_start_1, _, pos_start_1, _, neg_start_2, _, pos_start_2, _ = get_data(i, seqs, labels)
        fold1_total_labels = len(neg_start_1) + len(pos_start_1)
        fold2_total_labels = len(neg_start_2) + len(pos_start_2)

        # Error count for this sequence at its rounded lambda
        fold1_err = err_fold1_df.iloc[i][str(ldas1[i])]
        fold2_err = err_fold2_df.iloc[i][str(ldas2[i])]

        rows.append([seqs[i][0], ldas1[i], ldas2[i], fold1_total_labels, fold2_total_labels, fold1_err, fold2_err])

    return pd.DataFrame(rows, columns=header)

In [7]:
def try_model(full_X, X1, X2, y1, y2, err_fold1_df, err_fold2_df, seqs, labels, lr=0.001, epochs=500):
    """Train one linear model per fold and tabulate the resulting errors.

    Two models are trained independently with ``train_model`` (``X1``/``y1``
    and ``X2``/``y2``). Each then predicts a lambda for every row of
    ``full_X``, and the predictions are passed to ``get_df_stat``.

    Args:
        full_X: feature tensor for all sequences; both models predict on it.
        X1, y1: training features and targets for the first model.
        X2, y2: training features and targets for the second model.
        err_fold1_df, err_fold2_df: per-fold error tables forwarded to
            ``get_df_stat``.
        seqs, labels: sequence and label data forwarded to ``get_df_stat``.
        lr: learning rate used for both models (default 0.001, the value
            previously hard-coded).
        epochs: number of training epochs for both models (default 500, the
            value previously hard-coded).

    Returns:
        The DataFrame produced by ``get_df_stat``.
    """
    model1 = train_model(X1, y1, lr, epochs)
    model2 = train_model(X2, y2, lr, epochs)

    # Inference only: no autograd graph needed, and .numpy() requires tensors
    # that do not track gradients.
    with torch.no_grad():
        ldas1 = model1(full_X).numpy().reshape(-1)
        ldas2 = model2(full_X).numpy().reshape(-1)

    return get_df_stat(ldas1, ldas2, err_fold1_df, err_fold2_df, seqs, labels)

In [8]:
# Per-fold error tables. get_df_stat indexes them by row position (one row per
# sequence) and by a column named after the rounded lambda value.
err_fold1_df = pd.read_csv('1_training_data/errors_fold1_base_10.csv')
err_fold2_df = pd.read_csv('1_training_data/errors_fold2_base_10.csv')

# Sequences and labels, loaded with the project helper gen_data_dict.
# NOTE(review): the returned structure is not visible here; downstream code
# indexes seqs[i][0] for the sequence ID -- confirm against utility_functions.
seqs   = gen_data_dict('0_sequences_labels/signals.gz')
labels = gen_data_dict('0_sequences_labels/labels.gz')

In [9]:
# Train both fold models and collect per-sequence statistics. The same X is
# passed as the prediction set and as the training features of both models;
# only the targets (y1 / y2) differ between the two.
# NOTE(review): the recorded output of this cell reports 600 epochs while
# try_model trains for 500 -- the output looks stale; re-run to refresh it.
df = try_model(X, X, X, y1, y2, err_fold1_df, err_fold2_df, seqs, labels)

Epoch [100/600], Loss: 1.135
Epoch [200/600], Loss: 1.104
Epoch [300/600], Loss: 1.084
Epoch [400/600], Loss: 1.073
Epoch [500/600], Loss: 1.071
Epoch [600/600], Loss: 1.068
Epoch [100/600], Loss: 1.394
Epoch [200/600], Loss: 1.358
Epoch [300/600], Loss: 1.342
Epoch [400/600], Loss: 1.337
Epoch [500/600], Loss: 1.334
Epoch [600/600], Loss: 1.333


In [10]:
# Totals over all sequences, grouped per fold: number of labels and number of errors
total_label_fold1, err1 = df['fold_1_total_labels'].sum(), df['fold1_err'].sum()
total_label_fold2, err2 = df['fold_2_total_labels'].sum(), df['fold2_err'].sum()

# Accuracy = fraction of labels that were not counted as errors
rate1 = (total_label_fold1 - err1)/total_label_fold1
rate2 = (total_label_fold2 - err2)/total_label_fold2

# Report both accuracies as percentages
print("fold1.test: %5.2f \t fold2.test: %5.2f" % (rate1*100, rate2*100))

fold1.test: 78.19 	 fold2.test: 75.58
