In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from ipynb.fs.full.utility_functions import gen_data_dict, get_data, error_count, opart, SquaredHingeLoss, show_error_rate, write_to_csv

# Seed both RNGs used in this notebook (NumPy and PyTorch) with the same value.
SEED = 4
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x280ca8afcf0>

In [2]:
def normalize_data(data):
    """Standardize each column of ``data`` to zero mean and unit standard deviation.

    Args:
        data: 2-D tensor; statistics are computed along ``dim=0`` (one mean/std
            per column).

    Returns:
        Tensor of the same shape as ``data`` holding ``(data - mean) / std``.
        Columns whose standard deviation is zero (constant column) or undefined
        (NaN, e.g. a single-row input) are only centered, so they come out as
        zeros instead of NaN/inf.
    """
    mean = data.mean(dim=0)
    std  = data.std(dim=0)

    # `std > 0` is False for both 0 and NaN, so those columns fall back to a
    # divisor of 1 and the division below can no longer produce NaN.
    safe_std = torch.where(std > 0, std, torch.ones_like(std))

    return (data - mean) / safe_std

In [3]:
# data (inputs)
# Read the per-sequence feature table, drop its first column, keep only the
# selected feature columns, and standardize them for the networks.
chosen_feature = ['std_deviation', 'count', 'sum_diff', 'range_value', 'abs_skewness']
features_df = pd.read_csv('1_genome/1_training_data/seq_features.csv').iloc[:, 1:]
data = torch.FloatTensor(features_df[chosen_feature].to_numpy())
X = normalize_data(data)

In [4]:
# data (targets)
# One target table per fold. Column 1 is stored as the "low" limit and
# column 2 as the "high" limit; each is sliced as a single-column 2-D block.
target_df_1 = pd.read_csv('1_genome/1_training_data/target_lambda_fold1_base_e.csv')
target_df_2 = pd.read_csv('1_genome/1_training_data/target_lambda_fold2_base_e.csv')

targets_low_1, targets_high_1 = (
    torch.FloatTensor(target_df_1.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)
targets_low_2, targets_high_2 = (
    torch.FloatTensor(target_df_2.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)

# Side-by-side (low, high) pairs: one row per sequence, two columns.
y1 = torch.cat((targets_low_1, targets_high_1), dim=1)
y2 = torch.cat((targets_low_2, targets_high_2), dim=1)

In [5]:
class MyModel(nn.Module):
    """Fully connected ReLU network with a single scalar output per input row.

    Layout: ``Linear(input_size, hidden_size)`` -> ReLU ->
    ``hidden_layers`` x (``Linear(hidden_size, hidden_size)`` -> ReLU) ->
    ``Linear(hidden_size, 1)``.
    """

    def __init__(self, input_size, hidden_layers, hidden_size):
        """Build the layers.

        Args:
            input_size: number of input features.
            hidden_layers: number of hidden-to-hidden linear layers placed
                after the input layer.
            hidden_size: width of the input layer's output and of every
                hidden layer.
        """
        super().__init__()
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size

        # Input projection
        self.input_layer = nn.Linear(input_size, hidden_size)

        # Stack of equally sized hidden layers
        self.hidden = nn.ModuleList()
        for _ in range(hidden_layers):
            self.hidden.append(nn.Linear(hidden_size, hidden_size))

        # Scalar output head
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension has size 1)."""
        activations = torch.relu(self.input_layer(x))
        for layer in self.hidden:
            activations = torch.relu(layer(activations))
        # No activation on the output layer.
        return self.output_layer(activations)

    def __str__(self):
        """Return the standard module description followed by the two size settings."""
        summary = 'Hidden Layers: {}\nHidden Size: {}\n'.format(self.hidden_layers, self.hidden_size)
        return super().__str__() + '\n\n' + summary

In [6]:
def _best_iteration(model, loss_fn, X_train, y_train, X_val, y_val, n_epochs, lr):
    """Train ``model`` full-batch and return the 1-based step count with the lowest validation loss.

    One Adam (amsgrad) step is taken per epoch on the whole training split;
    the validation loss is recorded after every step.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr, amsgrad=True)

    val_losses = []
    for _ in range(n_epochs):
        loss = loss_fn(model(X_train), y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation is monitoring only, so skip autograd bookkeeping;
        # the loss value itself is unchanged.
        with torch.no_grad():
            val_losses.append(loss_fn(model(X_val), y_val).item())

    # val_losses[k] was measured after k + 1 optimizer steps.
    return int(np.argmin(val_losses)) + 1


def cv_learn(X, y1, y2, n_hiddens, layer_size, num_folds=3, n_epochs=2000, lr=0.001, random_state=4):
    """Estimate, by K-fold cross-validation, how many training steps each model should take.

    For every fold, two fresh ``MyModel`` networks are trained (one on ``y1``,
    one on ``y2``) and the step count that minimises the validation loss is
    recorded for each. The per-fold step counts are then averaged.

    Args:
        X: input feature tensor, one row per sample.
        y1: targets for the first model, indexed by the same rows as ``X``.
        y2: targets for the second model, indexed by the same rows as ``X``.
        n_hiddens: number of hidden layers passed to ``MyModel``.
        layer_size: hidden layer width passed to ``MyModel``.
        num_folds: number of cross-validation folds.
        n_epochs: number of full-batch optimizer steps per fold (must be >= 1).
        lr: Adam learning rate.
        random_state: seed for the shuffled K-fold split.

    Returns:
        ``(best_no_ite_1, best_no_ite_2)``: mean best step count over the folds
        for each model, truncated to ``int``.

    Raises:
        ValueError: if ``n_epochs`` is smaller than 1.
    """
    if n_epochs < 1:
        raise ValueError('n_epochs must be at least 1')

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    # loss function (shared by both models)
    squared_hinge_loss = SquaredHingeLoss()

    best_ites_1 = []
    best_ites_2 = []
    for train_index, val_index in kf.split(X):
        X_train_tensor, X_val_tensor = X[train_index], X[val_index]

        # Both models are built before any training starts, so parameter
        # initialisation draws from the RNG in the order model1, model2.
        model1 = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
        model2 = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)

        best_ites_1.append(_best_iteration(
            model1, squared_hinge_loss,
            X_train_tensor, y1[train_index],
            X_val_tensor, y1[val_index],
            n_epochs, lr,
        ))
        best_ites_2.append(_best_iteration(
            model2, squared_hinge_loss,
            X_train_tensor, y2[train_index],
            X_val_tensor, y2[val_index],
            n_epochs, lr,
        ))

    best_no_ite_1 = int(np.mean(best_ites_1))
    best_no_ite_2 = int(np.mean(best_ites_2))

    return best_no_ite_1, best_no_ite_2

In [7]:
def train_model(X, y, n_hiddens, layer_size, n_ites):
    """Fit a fresh ``MyModel`` on the full data set for a fixed number of steps.

    Args:
        X: input feature tensor, one row per sample.
        y: targets passed to the squared hinge loss together with the outputs.
        n_hiddens: number of hidden layers of the network.
        layer_size: hidden layer width of the network.
        n_ites: number of full-batch Adam (amsgrad) steps to run.

    Returns:
        The trained ``MyModel`` instance.
    """
    net = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
    criterion = SquaredHingeLoss()
    adam = optim.Adam(net.parameters(), lr=0.001, amsgrad=True)

    for _ in range(n_ites):
        # Clear old gradients, then do one forward/backward/update cycle.
        adam.zero_grad()
        criterion(net(X), y).backward()
        adam.step()

    return net

In [8]:
def get_acc_rate(ldas1, ldas2, seqs, labels):
    """Score predicted values against the labels of every sequence.

    For each sequence index ``i`` the helper ``opart`` is run twice on the
    sequence: once with ``e**ldas2[i]`` (scored against the fold-1 labels) and
    once with ``e**ldas1[i]`` (scored against the fold-2 labels). The first
    four entries of each ``error_count`` result are stored per sequence.

    Args:
        ldas1: per-sequence predictions of model 1, indexable by sequence index.
        ldas2: per-sequence predictions of model 2, indexable by sequence index.
        seqs: sequence collection forwarded to ``get_data``.
        labels: label collection forwarded to ``get_data``.

    Returns:
        Whatever ``show_error_rate`` returns for the assembled per-sequence table.
    """
    columns = ['sequenceID', 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_fp_errs', 'fold_1_fn_errs', 'fold_1_tp', 'fold_1_tn', 'fold_2_fp_errs', 'fold_2_fn_errs', 'fold_2_tp', 'fold_2_tn']
    records = []

    for seq_idx in range(len(seqs)):
        # Sequence plus the negative/positive label boundaries of both folds
        (sequence,
         neg_start_1, neg_end_1, pos_start_1, pos_end_1,
         neg_start_2, neg_end_2, pos_start_2, pos_end_2) = get_data(seq_idx, seqs=seqs, labels=labels)

        n_labels_fold1 = len(neg_start_1) + len(pos_start_1)
        n_labels_fold2 = len(neg_start_2) + len(pos_start_2)

        # Cross-fold evaluation: fold-1 labels are checked with model 2's
        # prediction and fold-2 labels with model 1's prediction.
        changepoints_fold1 = opart(np.e**ldas2[seq_idx], sequence)
        changepoints_fold2 = opart(np.e**ldas1[seq_idx], sequence)

        errs_fold1 = error_count(changepoints_fold1, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
        errs_fold2 = error_count(changepoints_fold2, neg_start_2, neg_end_2, pos_start_2, pos_end_2)

        records.append([seq_idx, n_labels_fold1, n_labels_fold2, *errs_fold1[:4], *errs_fold2[:4]])

    return show_error_rate(pd.DataFrame(records, columns=columns))

In [9]:
def try_model(config):
    """Train, save and score the model pair for one hyper-parameter setting.

    Uses the notebook-level ``X``, ``y1`` and ``y2``. The number of training
    steps is chosen with ``cv_learn``; both trained state dicts are written
    under ``1_genome/3_learned_models/2_deep/``.

    Args:
        config: dict with the keys ``'n_hiddens'`` and ``'layer_size'``.

    Returns:
        The average of the first two values returned by ``get_acc_rate``.
    """
    n_hiddens, layer_size = config['n_hiddens'], config['layer_size']

    n_ites_1, n_ites_2 = cv_learn(X, y1, y2, n_hiddens, layer_size)

    model1 = train_model(X, y1, n_hiddens, layer_size, n_ites_1)
    model2 = train_model(X, y2, n_hiddens, layer_size, n_ites_2)

    # Both files share the same "<n_hiddens>_<layer_size>.pth" suffix.
    suffix = str(n_hiddens) + '_' + str(layer_size) + '.pth'
    torch.save(model1.state_dict(), '1_genome/3_learned_models/2_deep/model1_' + suffix)
    torch.save(model2.state_dict(), '1_genome/3_learned_models/2_deep/model2_' + suffix)

    # Inference only: flatten the single-column outputs to 1-D arrays.
    with torch.no_grad():
        ldas1 = model1(X).numpy().reshape(-1)
        ldas2 = model2(X).numpy().reshape(-1)

    seqs   = gen_data_dict('1_genome/0_sequences_labels/signals.gz')
    labels = gen_data_dict('1_genome/0_sequences_labels/labels.gz')

    # Only the two rates are used; the four totals are discarded.
    rate1, rate2, _, _, _, _ = get_acc_rate(ldas1, ldas2, seqs, labels)

    return (rate1 + rate2)/2

In [10]:
# Hyper-parameter grid: every (n_hiddens, layer_size) combination is tried.
n_hiddens_values  = [1, 2, 3]
layer_size_values = [8, 16, 32]

# One dict per combination; layer_size varies fastest.
configs = [
    dict(n_hiddens=depth, layer_size=width)
    for depth in n_hiddens_values
    for width in layer_size_values
]

In [11]:
# choose best model
# Each configuration is evaluated in its own joblib task; results come back
# in the same order as `configs`, so argmax indexes straight into it.
jobs = (delayed(try_model)(config) for config in configs)
accs = Parallel(n_jobs=32)(jobs)
best_model = configs[np.argmax(accs)]

In [23]:
# Highest score among the searched configurations (each score is the average
# of the two rates returned by try_model).
print(np.max(accs))

68.68453355155484


In [21]:
# Hyper-parameter dict of the configuration with the highest score.
print(best_model)

{'n_hiddens': 3, 'layer_size': 8}


In [12]:
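# NOTE: legacy single-run training experiment, kept commented out for reference only.
# It calls MyModel(input_size=...) without the hidden_layers/hidden_size arguments that
# the current class requires, so it will not run as-is if uncommented.
# # Instantiate the model, define custom loss function, and optimizer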
# model1 = MyModel(input_size = data.shape[1])
# model2 = MyModel(input_size = data.shape[1])

# squared_hinge_loss = SquaredHingeLoss()
# optimizer1 = optim.Adam(model1.parameters(), lr=0.001, amsgrad=True)
# optimizer2 = optim.Adam(model2.parameters(), lr=0.001, amsgrad=True)

# # Training loop
# record_loss1_test = []
# record_loss2_test = []
# min_loss_1_test = float('inf')
# min_loss_2_test = float('inf')
# for epoch in range(1000):
#     # Forward pass
#     outputs1 = model1(data)
#     outputs2 = model2(data)
    
#     # Compute the custom loss
#     loss_1 = squared_hinge_loss(outputs1, targets_low_1, targets_high_1)
#     loss_2 = squared_hinge_loss(outputs2, targets_low_2, targets_high_2)
#     loss_1_test = squared_hinge_loss(outputs1, targets_low_2, targets_high_2)
#     loss_2_test = squared_hinge_loss(outputs2, targets_low_1, targets_high_1)
    
#     # Backward pass and optimization
#     optimizer1.zero_grad()
#     loss_1.backward()
#     optimizer1.step()

#     optimizer2.zero_grad()
#     loss_2.backward()
#     optimizer2.step()

#     # record
#     record_loss1_test.append(loss_1_test.item())
#     record_loss2_test.append(loss_2_test.item())

#     # save models
#     if loss_1_test < min_loss_1_test:
#         min_loss_1_test = loss_1_test
#         torch.save(model1.state_dict(), '1.genome_saved_models/model1_dl_best.pth')
    
#     if loss_2_test < min_loss_2_test:
#         min_loss_2_test = loss_2_test
#         torch.save(model2.state_dict(), '1.genome_saved_models/model2_dl_best.pth')
    
#     # Print the loss every 100 epochs
#     if (epoch) % 100 == 0:
#         print(f'Epoch {epoch:5d}, Loss_1: {loss_1.item():8.4f}, Loss_1_test: {loss_1_test.item():8.4f}, Loss_2: {loss_2.item():8.4f}, Loss_2_test: {loss_2_test.item():8.4f}')

In [13]:
# Unpack the winning hyper-parameters for the loading cells below.
n_hiddens, layer_size = best_model['n_hiddens'], best_model['layer_size']

In [14]:
# Load model1: rebuild the architecture, then restore the weights saved by try_model
model1 = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
model1_path = '1_genome/3_learned_models/2_deep/model1_' + str(n_hiddens) + '_' + str(layer_size) + '.pth'
model1.load_state_dict(torch.load(model1_path))
model1.eval()  # evaluation mode; as the last expression it also displays the model

MyModel(
  (input_layer): Linear(in_features=5, out_features=8, bias=True)
  (hidden): ModuleList(
    (0-2): 3 x Linear(in_features=8, out_features=8, bias=True)
  )
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
)

In [15]:
# Load model2: rebuild the architecture, then restore the weights saved by try_model
model2 = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
model2_path = '1_genome/3_learned_models/2_deep/model2_' + str(n_hiddens) + '_' + str(layer_size) + '.pth'
model2.load_state_dict(torch.load(model2_path))
model2.eval()  # evaluation mode; as the last expression it also displays the model

MyModel(
  (input_layer): Linear(in_features=5, out_features=8, bias=True)
  (hidden): ModuleList(
    (0-2): 3 x Linear(in_features=8, out_features=8, bias=True)
  )
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
)

In [16]:
# Predict with the loaded models on the normalized features `X`.
# The models were trained and scored on X = normalize_data(data) (see
# try_model); feeding the raw `data` tensor here would give them inputs on a
# different scale than they were fitted to.
with torch.no_grad():
    ldas1 = model1(X).numpy().reshape(-1)
    ldas2 = model2(X).numpy().reshape(-1)

In [17]:
# Evaluate the selected model pair on every sequence and export one row per
# sequence to '1_genome/2_learning_record/deep.csv'.
seqs   = gen_data_dict('1_genome/0_sequences_labels/signals.gz')
labels = gen_data_dict('1_genome/0_sequences_labels/labels.gz')

header = ['sequenceID', 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_fp_errs', 'fold_1_fn_errs', 'fold_1_tp', 'fold_1_tn', 'fold_2_fp_errs', 'fold_2_fn_errs', 'fold_2_tp', 'fold_2_tn']
rows = []
for i in range(len(seqs)):
    # Sequence plus the negative/positive label boundaries of both folds
    (sequence,
     neg_start_1, neg_end_1, pos_start_1, pos_end_1,
     neg_start_2, neg_end_2, pos_start_2, pos_end_2) = get_data(i, seqs=seqs, labels=labels)
    sequence_length = len(sequence)-1

    # Number of labels available in each fold
    fold1_total_labels = len(neg_start_1) + len(pos_start_1)
    fold2_total_labels = len(neg_start_2) + len(pos_start_2)

    # Row starts with the sequence identifier and the label counts
    row  = [seqs[i][0], fold1_total_labels, fold2_total_labels]

    # Cross-fold evaluation: fold-1 labels are checked with model 2's
    # prediction and fold-2 labels with model 1's prediction.
    chpnt_fold1 = opart(np.e**ldas2[i], sequence)
    chpnt_fold2 = opart(np.e**ldas1[i], sequence)

    err_1 = error_count(chpnt_fold1, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
    err_2 = error_count(chpnt_fold2, neg_start_2, neg_end_2, pos_start_2, pos_end_2)

    # Keep the first four counts of each fold (fp, fn, tp, tn per the header)
    row.extend(err_1[:4])
    row.extend(err_2[:4])

    rows.append(row)

df = pd.DataFrame(rows, columns=header)
df.to_csv('1_genome/2_learning_record/deep.csv', index=False)

