In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from ipynb.fs.full.utility_functions import gen_data_dict, get_data, get_cumsum, error_count, write_to_csv, opart, SquaredHingeLoss

np.random.seed(4)
torch.manual_seed(4)

<torch._C.Generator at 0x27264cffcf0>

In [2]:
def normalize_data(data):
    mean = data.mean(dim=0)
    std  = data.std(dim=0)
    norm_data = (data - mean) / std
    return norm_data

In [3]:
# data
chosen_feature = ['std_deviation', 'count', 'sum_diff', 'range_value', 'abs_skewness']
data = torch.FloatTensor(pd.read_csv('1.genome_learning_data/seq_features.csv').iloc[:, 1:][chosen_feature].to_numpy())
X = normalize_data(data)

target_df_1 = pd.read_csv('1.genome_learning_data/target_lambda_fold1.csv')
target_df_2 = pd.read_csv('1.genome_learning_data/target_lambda_fold2.csv')

targets_low_1  = target_df_1.iloc[:, 1:2].to_numpy()
targets_high_1 = target_df_1.iloc[:, 2:3].to_numpy()
targets_low_2  = target_df_2.iloc[:, 1:2].to_numpy()
targets_high_2 = target_df_2.iloc[:, 2:3].to_numpy()

targets_low_1  = torch.FloatTensor(targets_low_1)
targets_high_1 = torch.FloatTensor(targets_high_1)
targets_low_2  = torch.FloatTensor(targets_low_2)
targets_high_2 = torch.FloatTensor(targets_high_2)

y1 = torch.cat((targets_low_1, targets_high_1), dim=1)
y2 = torch.cat((targets_low_2, targets_high_2), dim=1)

In [4]:
# Define the deep learning model
class MyModel(nn.Module):
    def __init__(self, input_size):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 8)
        self.fc2 = nn.Linear(8, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
# Cross-Validation number of iterations

# Define the number of folds for cross-validation
num_folds = 3
kf = KFold(n_splits=num_folds, shuffle=True, random_state=4)

# loss function
squared_hinge_loss = SquaredHingeLoss()

best_ites_1 = []
best_ites_2 = []
for train_index, val_index in kf.split(X):

    # Split the data into training and validation sets
    X_train_tensor, X_val_tensor   = X[train_index],  X[val_index]
    y1_train_tensor, y1_val_tensor = y1[train_index], y1[val_index]
    y2_train_tensor, y2_val_tensor = y2[train_index], y2[val_index]

    # Define your model, loss, and optimizer
    model1 = MyModel(input_size=X.shape[1])
    model2 = MyModel(input_size=X.shape[1])

    optimizer1 = optim.Adam(model1.parameters(), lr=0.001, amsgrad=True)
    optimizer2 = optim.Adam(model2.parameters(), lr=0.001, amsgrad=True)

    # Training loop for the specified number of iterations
    val_losses_1 = []
    val_losses_2 = []
    for epoch in range(1000):
        outputs1 = model1(X_train_tensor)
        outputs2 = model2(X_train_tensor)

        loss1 = squared_hinge_loss(outputs1, y1_train_tensor)
        loss2 = squared_hinge_loss(outputs2, y2_train_tensor)
        
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        
        loss1.backward()
        loss2.backward()

        optimizer1.step()
        optimizer2.step()

        # add val loss
        val_loss_1 = squared_hinge_loss(model1(X_val_tensor), y1_val_tensor)
        val_loss_2 = squared_hinge_loss(model2(X_val_tensor), y2_val_tensor)

        val_losses_1.append(val_loss_1.item())
        val_losses_2.append(val_loss_2.item())
    
    best_ite_1 = np.argmin(val_losses_1) + 1
    best_ite_2 = np.argmin(val_losses_2) + 1

    best_ites_1.append(best_ite_1)
    best_ites_2.append(best_ite_2)

best_no_ite_1 = int(np.mean(best_ites_1))
best_no_ite_2 = int(np.mean(best_ites_2))

In [6]:
# Train fold 1
model1 = MyModel(input_size = data.shape[1])
squared_hinge_loss = SquaredHingeLoss()
optimizer1 = optim.Adam(model1.parameters(), lr=0.001, amsgrad=True)

# Training loop
for epoch in range(best_no_ite_1):
    # Forward pass
    outputs1 = model1(data)
    
    # Compute the custom loss
    loss_1 = squared_hinge_loss(outputs1, y1)
    
    # Backward pass and optimization
    optimizer1.zero_grad()
    loss_1.backward()
    optimizer1.step()

# save model
torch.save(model1.state_dict(), '1.genome_saved_models/model1_dl_best.pth')

In [7]:
# Train fold 2
model2 = MyModel(input_size = data.shape[1])
squared_hinge_loss = SquaredHingeLoss()
optimizer2 = optim.Adam(model2.parameters(), lr=0.001, amsgrad=True)

# Training loop
for epoch in range(best_no_ite_2):
    # Forward pass
    outputs2 = model2(data)
    
    # Compute the custom loss
    loss_2 = squared_hinge_loss(outputs2, y2)
    
    # Backward pass and optimization
    optimizer2.zero_grad()
    loss_2.backward()
    optimizer2.step()

# save model
torch.save(model1.state_dict(), '1.genome_saved_models/model2_dl_best.pth')

In [8]:
# # Instantiate the model, define custom loss function, and optimizer
# model1 = MyModel(input_size = data.shape[1])
# model2 = MyModel(input_size = data.shape[1])

# squared_hinge_loss = SquaredHingeLoss()
# optimizer1 = optim.Adam(model1.parameters(), lr=0.001, amsgrad=True)
# optimizer2 = optim.Adam(model2.parameters(), lr=0.001, amsgrad=True)

# # Training loop
# record_loss1_test = []
# record_loss2_test = []
# min_loss_1_test = float('inf')
# min_loss_2_test = float('inf')
# for epoch in range(1000):
#     # Forward pass
#     outputs1 = model1(data)
#     outputs2 = model2(data)
    
#     # Compute the custom loss
#     loss_1 = squared_hinge_loss(outputs1, targets_low_1, targets_high_1)
#     loss_2 = squared_hinge_loss(outputs2, targets_low_2, targets_high_2)
#     loss_1_test = squared_hinge_loss(outputs1, targets_low_2, targets_high_2)
#     loss_2_test = squared_hinge_loss(outputs2, targets_low_1, targets_high_1)
    
#     # Backward pass and optimization
#     optimizer1.zero_grad()
#     loss_1.backward()
#     optimizer1.step()

#     optimizer2.zero_grad()
#     loss_2.backward()
#     optimizer2.step()

#     # record
#     record_loss1_test.append(loss_1_test.item())
#     record_loss2_test.append(loss_2_test.item())

#     # save models
#     if loss_1_test < min_loss_1_test:
#         min_loss_1_test = loss_1_test
#         torch.save(model1.state_dict(), '1.genome_saved_models/model1_dl_best.pth')
    
#     if loss_2_test < min_loss_2_test:
#         min_loss_2_test = loss_2_test
#         torch.save(model2.state_dict(), '1.genome_saved_models/model2_dl_best.pth')
    
#     # Print the loss every 100 epochs
#     if (epoch) % 100 == 0:
#         print(f'Epoch {epoch:5d}, Loss_1: {loss_1.item():8.4f}, Loss_1_test: {loss_1_test.item():8.4f}, Loss_2: {loss_2.item():8.4f}, Loss_2_test: {loss_2_test.item():8.4f}')

In [10]:
# Load model1
model1 = MyModel(input_size=data.shape[1])
model1.load_state_dict(torch.load('1.genome_saved_models/model1_dl_best.pth'))
model1.eval()  # Set the model to evaluation mode

# Load model2
model2 = MyModel(input_size=data.shape[1])
model2.load_state_dict(torch.load('1.genome_saved_models/model2_dl_best.pth'))
model2.eval()  # Set the model to evaluation mode

MyModel(
  (fc1): Linear(in_features=5, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=1, bias=True)
)

In [12]:
print("loss for fold 1: ",squared_hinge_loss(model2(data), y1).item())
print("loss for fold 2: ",squared_hinge_loss(model1(data), y2).item())

loss for fold 1:  1.032637119293213
loss for fold 2:  1.0122628211975098


In [13]:
with torch.no_grad():
    ldas1 = model1(data).numpy().reshape(-1)
    ldas2 = model2(data).numpy().reshape(-1)

In [15]:
seqs   = gen_data_dict('sequence_label_data/genome/signals.gz')
labels = gen_data_dict('sequence_label_data/genome/labels.gz')

header = ['sequenceID', 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_errs', 'fold_2_errs']

for i in range(len(seqs)):
    # generate data
    sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)
    sequence_length = len(sequence)-1

    # vectors of cumulative sums
    y, z = get_cumsum(sequence)

    # get total labels
    fold1_total_labels = len(neg_start_1) + len(pos_start_1)
    fold2_total_labels = len(neg_start_2) + len(pos_start_2)

    # run each lambda and record it into csv file
    row  = [i, fold1_total_labels, fold2_total_labels]

    chpnt_fold1 = opart(10**ldas2[i], sequence)
    chpnt_fold2 = opart(10**ldas1[i], sequence)

    err_1 = error_count(chpnt_fold1, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
    err_2 = error_count(chpnt_fold2, neg_start_2, neg_end_2, pos_start_2, pos_end_2)
    
    row.append(sum(err_1))
    row.append(sum(err_2))

    write_to_csv('1.genome_learning_output/deep.csv', header, row)