In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from ipynb.fs.full.utility_functions import gen_data_dict, get_data, error_count, opart, SquaredHingeLoss, show_error_rate

np.random.seed(4)
torch.manual_seed(4)

<torch._C.Generator at 0x1a5fbcafcf0>

In [2]:
def normalize_data(data):
    mean = data.mean(dim=0)
    std  = data.std(dim=0)
    norm_data = (data - mean) / std
    return norm_data

In [3]:
# data (inputs)

# chosen_feature = ['std_deviation', 'count', 'sum_diff', 'range_value', 'abs_skewness']
chosen_feature = ['count', 'sum_diff', 'range_value']
data = torch.FloatTensor(pd.read_csv('1_training_data/seq_features.csv').iloc[:, 1:][chosen_feature].to_numpy())
X = normalize_data(data)

In [4]:
# data (targets)
target_df_1 = pd.read_csv('1_training_data/target_lambda_fold1_base_e.csv')
target_df_2 = pd.read_csv('1_training_data/target_lambda_fold2_base_e.csv')

targets_low_1  = torch.FloatTensor(target_df_1.iloc[:, 1:2].to_numpy())
targets_high_1 = torch.FloatTensor(target_df_1.iloc[:, 2:3].to_numpy())
targets_low_2  = torch.FloatTensor(target_df_2.iloc[:, 1:2].to_numpy())
targets_high_2 = torch.FloatTensor(target_df_2.iloc[:, 2:3].to_numpy())

y1_raw = torch.cat((targets_low_1, targets_high_1), dim=1)
y2_raw = torch.cat((targets_low_2, targets_high_2), dim=1)

In [5]:
# filter
idx_fold1 = [idx for idx, item in enumerate(y1_raw) if item[0] > -7 and item[1] < 7]
X1 = X[idx_fold1]
y1 = torch.mean(y1_raw[idx_fold1], axis=1).reshape(-1,1)

idx_fold2 = [idx for idx, item in enumerate(y2_raw) if item[0] > -7 and item[1] < 7]
X2 = X[idx_fold2]
y2 = torch.mean(y2_raw[idx_fold2], axis=1).reshape(-1,1)

In [6]:
print(len(X1), len(y1))
print(len(X2), len(y2))

190 190
313 313


In [7]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_layers, hidden_size):
        super(MyModel, self).__init__()
        self.hidden_layers = hidden_layers
        self.hidden_size   = hidden_size
        
        # Define input layer
        self.input_layer = nn.Linear(input_size, hidden_size)
        
        # Define hidden layers
        self.hidden = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers)])
        
        # Define output layer
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Forward pass through input layer
        x = torch.relu(self.input_layer(x))
        
        # Forward pass through hidden layers
        for layer in self.hidden:
            x = torch.relu(layer(x))
        
        # Forward pass through output layer
        x = self.output_layer(x)
        return x

In [8]:
def cv_learn(X1, X2, y1, y2, n_hiddens, layer_size):
    # Define the number of folds for cross-validation
    kf = KFold(n_splits=2, shuffle=True, random_state=123)

    # loss function
    loss_func = torch.nn.MSELoss()

    # learn best ite from fold 1
    best_ites_1 = []
    for train_index, val_index in kf.split(X1):
        # Split the data into training and validation sets
        X1_train_tensor, X1_val_tensor = X1[train_index], X1[val_index]
        y1_train_tensor, y1_val_tensor = y1[train_index], y1[val_index]

        # Define your model, loss, and optimizer
        model1 = MyModel(input_size=X1_train_tensor.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
        optimizer1 = optim.Adam(model1.parameters(), lr=0.001)

        # Training loop for the specified number of iterations
        val_losses_1 = []
        for _ in range(5000):
            loss1      = loss_func(model1(X1_train_tensor), y1_train_tensor)
            val_loss_1 = loss_func(model1(X1_val_tensor),   y1_val_tensor)
            optimizer1.zero_grad()
            loss1.backward()
            optimizer1.step()

            val_losses_1.append(val_loss_1.item())
        
        best_ite_1 = np.argmin(val_losses_1) + 1
        best_ites_1.append(best_ite_1)

    # learn best ite from fold 2
    best_ites_2 = []
    for train_index, val_index in kf.split(X2):
        # Split the data into training and validation sets
        X2_train_tensor, X2_val_tensor = X2[train_index], X2[val_index]
        y2_train_tensor, y2_val_tensor = y2[train_index], y2[val_index]

        # Define your model, loss, and optimizer
        model2 = MyModel(input_size=X2_train_tensor.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
        optimizer2 = optim.Adam(model2.parameters(), lr=0.001)

        # Training loop for the specified number of iterations
        val_losses_2 = []
        for _ in range(5000):
            loss2      = loss_func(model2(X2_train_tensor), y2_train_tensor)
            val_loss_2 = loss_func(model2(X2_val_tensor),   y2_val_tensor)
            optimizer2.zero_grad()
            loss2.backward()
            optimizer2.step()

            val_losses_2.append(val_loss_2.item())
        
        best_ite_2 = np.argmin(val_losses_2) + 1
        best_ites_2.append(best_ite_2)

    best_no_ite_1 = int(np.mean(best_ites_1))
    best_no_ite_2 = int(np.mean(best_ites_2))
    
    return best_no_ite_1, best_no_ite_2

In [9]:
def train_model(X, y, n_hiddens, layer_size, n_ites):
    model = MyModel(input_size=X.shape[1], hidden_layers=n_hiddens, hidden_size=layer_size)
    # loss_func = SquaredHingeLoss(margin=1, low_lim=-7, high_lim=7)
    loss_func = torch.nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for _ in range(n_ites):
        # Forward pass
        outputs = model(X)
        
        # Compute the custom loss
        loss = loss_func(outputs, y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return model

In [10]:
def get_df_stat(ldas1, ldas2, seqs, labels):
    header = ['sequenceID', 'log_lambda_test_fold1', 'log_lambda_test_fold2', 'target_1', 'target_2', 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_fp_errs', 'fold_1_fn_errs', 'fold_1_tp', 'fold_1_tn', 'fold_2_fp_errs', 'fold_2_fn_errs', 'fold_2_tp', 'fold_2_tn']
    rows = []
    for i in range(len(seqs)):
        # generate data
        sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)

        # get total labels
        fold1_total_labels = len(neg_start_1) + len(pos_start_1)
        fold2_total_labels = len(neg_start_2) + len(pos_start_2)

        # run each lambda and record it into csv file
        row  = [seqs[i][0], ldas2[i], ldas1[i], list(y1_raw[i].numpy()), list(y2_raw[i].numpy()), fold1_total_labels, fold2_total_labels]

        chpnt_fold1 = opart(np.exp(ldas2[i]), sequence)
        chpnt_fold2 = opart(np.exp(ldas1[i]), sequence)

        err_1 = error_count(chpnt_fold1, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
        err_2 = error_count(chpnt_fold2, neg_start_2, neg_end_2, pos_start_2, pos_end_2)
        
        for err in [err_1, err_2]:
            row.extend(err[:4])

        rows.append(row)
    
    df = pd.DataFrame(rows, columns=header)
    return df

In [11]:
def try_model(X1, X2, y1, y2, config):
    n_hiddens  = config['n_hiddens']
    layer_size = config['layer_size']

    best_no_ite_1, best_no_ite_2 = cv_learn(X1, X2, y1, y2, n_hiddens, layer_size)

    model1 = train_model(X1, y1, n_hiddens, layer_size, best_no_ite_1)
    model2 = train_model(X2, y2, n_hiddens, layer_size, best_no_ite_2)

    torch.save(model1.state_dict(), '3_learned_models/2_deep/model1_' + str(n_hiddens) + '_' + str(layer_size) + '.pth')
    torch.save(model2.state_dict(), '3_learned_models/2_deep/model2_' + str(n_hiddens) + '_' + str(layer_size) + '.pth')

    with torch.no_grad():
        ldas1 = model1(X).numpy().reshape(-1)
        ldas2 = model2(X).numpy().reshape(-1)

    seqs   = gen_data_dict('0_sequences_labels/signals.gz')
    labels = gen_data_dict('0_sequences_labels/labels.gz')
    df = get_df_stat(ldas1, ldas2, seqs, labels)

    return df

In [12]:
# Define the values for n_hiddens and layer_size
n_hiddens_values  = [0, 1, 2, 3]
layer_size_values = [4, 8, 16, 32]

# Create a list of dictionaries
configs = [{'n_hiddens': n, 'layer_size': s} for n in n_hiddens_values for s in layer_size_values]

In [13]:
# choose best model
dfs = Parallel(n_jobs=32)(delayed(try_model)(X1, X2, y1, y2, configs[i]) for i in range(0, len(configs)))

In [14]:
avg_acc = []
for i in range(len(dfs)):
    rate1, rate2,_,_,_,_ = show_error_rate(dfs[i])
    avg_acc.append((rate1 + rate2)/2)
    print("n_hiddens: %1d \t layer_size: %2d \t fold1.test: %5.2f \t fold2.test: %5.2f \t avg_acc: %5.2f" % 
          (configs[i]['n_hiddens'], configs[i]['layer_size'], rate1, rate2, (rate1 + rate2)/2))

n_hiddens: 0 	 layer_size:  4 	 fold1.test: 78.86 	 fold2.test: 77.50 	 avg_acc: 78.18
n_hiddens: 0 	 layer_size:  8 	 fold1.test: 78.72 	 fold2.test: 76.15 	 avg_acc: 77.44
n_hiddens: 0 	 layer_size: 16 	 fold1.test: 79.26 	 fold2.test: 79.23 	 avg_acc: 79.24
n_hiddens: 0 	 layer_size: 32 	 fold1.test: 80.19 	 fold2.test: 80.77 	 avg_acc: 80.48
n_hiddens: 1 	 layer_size:  4 	 fold1.test: 77.39 	 fold2.test: 79.23 	 avg_acc: 78.31
n_hiddens: 1 	 layer_size:  8 	 fold1.test: 80.05 	 fold2.test: 80.38 	 avg_acc: 80.22
n_hiddens: 1 	 layer_size: 16 	 fold1.test: 80.98 	 fold2.test: 78.65 	 avg_acc: 79.82
n_hiddens: 1 	 layer_size: 32 	 fold1.test: 79.79 	 fold2.test: 80.19 	 avg_acc: 79.99
n_hiddens: 2 	 layer_size:  4 	 fold1.test: 81.12 	 fold2.test: 77.12 	 avg_acc: 79.12
n_hiddens: 2 	 layer_size:  8 	 fold1.test: 79.65 	 fold2.test: 76.54 	 avg_acc: 78.10
n_hiddens: 2 	 layer_size: 16 	 fold1.test: 78.19 	 fold2.test: 80.96 	 avg_acc: 79.58
n_hiddens: 2 	 layer_size: 32 	 fold1.test:

In [15]:
df = dfs[np.argmax(avg_acc)]
df.to_csv('2_learning_record/deep.csv', index=False)

In [18]:
show_error_rate(df)

(80.18617021276596, 80.76923076923077, 752, 520, 149, 100)