In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from ipynb.fs.full.utility_functions import gen_data_dict, get_data, error_count, opart, show_error_rate

# Seed NumPy's and PyTorch's global generators with the same value so that
# weight initialisation and any NumPy sampling repeat across runs.
SEED = 4
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2497ef13d10>

In [2]:
# Signal sequences, loaded through the project helper.
seqs = gen_data_dict('0_sequences_labels/signals.gz')

# One target table per fold.
target_df_1 = pd.read_csv('1_training_data/target_lambda_fold1_base_10.csv')
target_df_2 = pd.read_csv('1_training_data/target_lambda_fold2_base_10.csv')

# Column position 1 feeds targets_low_*, column position 2 feeds targets_high_*.
# Slicing with col:col + 1 keeps every tensor two-dimensional (one column wide).
targets_low_1, targets_high_1 = (
    torch.FloatTensor(target_df_1.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)
targets_low_2, targets_high_2 = (
    torch.FloatTensor(target_df_2.iloc[:, col:col + 1].to_numpy()) for col in (1, 2)
)

# Per fold: the two single-column tensors side by side (low first, then high).
y1 = torch.cat((targets_low_1, targets_high_1), dim=1)
y2 = torch.cat((targets_low_2, targets_high_2), dim=1)

In [3]:
# # filter
# idx_fold1 = [idx for idx, item in enumerate(y1_raw) if item[0] > -np.inf and item[1] < np.inf]
# X1 = [seqs[i] for i in idx_fold1]
# y1 = torch.mean(y1_raw[idx_fold1], axis=1).reshape(-1,1)

# idx_fold2 = [idx for idx, item in enumerate(y2_raw) if item[0] > -np.inf and item[1] < np.inf]
# X2 = [seqs[i] for i in idx_fold2]
# y2 = torch.mean(y2_raw[idx_fold2], axis=1).reshape(-1,1)

In [4]:
# print(len(X1), len(X2))
# print(len(y1), len(y2))

In [5]:
# Define the RNN model
class RNNModel(nn.Module):
    """Single-layer tanh RNN followed by a fully connected ReLU head with one output.

    The RNN reads sequences with one feature per time step (``input_size=1``,
    ``batch_first=False``). Only its final hidden state is passed on to the head,
    which consists of an input layer, ``hidden_layers`` equally sized hidden layers
    and a linear output layer of width 1.

    Args:
        extracted_features: size of the RNN hidden state, i.e. the width of the
            vector handed to the dense head.
        hidden_layers: number of ``Linear(hidden_size, hidden_size)`` layers between
            the input layer and the output layer (may be 0).
        hidden_size: width of the dense head.
    """

    def __init__(self, extracted_features, hidden_layers, hidden_size):
        super().__init__()

        # Recurrent feature extractor.
        self.rnn = nn.RNN(
            input_size=1,
            hidden_size=extracted_features,
            num_layers=1,
            nonlinearity='tanh',
            bias=True,
            batch_first=False,
        )

        # Head hyper-parameters, stored on the instance.
        self.hidden_layers = hidden_layers
        self.hidden_size = hidden_size

        # Dense head: input layer -> hidden stack -> scalar output.
        self.input_layer = nn.Linear(extracted_features, hidden_size)
        self.hidden = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers)
        )
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the head's output computed from the RNN's final hidden state."""
        # nn.RNN returns (per-step outputs, final hidden state); the per-step
        # outputs are not used.
        activations = self.rnn(x)[1]

        # The input layer and every hidden layer are each followed by a ReLU.
        for dense in (self.input_layer, *self.hidden):
            activations = torch.relu(dense(activations))

        # No activation on the output layer.
        return self.output_layer(activations)

In [6]:
def plot_loss(train_loss, val_loss, train_set_name, test_set_name):
    """Plot training and validation loss per epoch and mark the best validation epoch.

    Epochs are numbered from 1 on the x-axis. The epoch with the lowest validation
    loss (the first one on ties) is marked with a green star, and the legend reports
    that same 1-based epoch number, so marker position and label agree.

    Args:
        train_loss: iterable of per-epoch training losses.
        val_loss: iterable of per-epoch validation losses, same length as
            ``train_loss``.
        train_set_name: name of the training set, used in the title.
        test_set_name: name of the validation set, used in the title.

    Raises:
        ValueError: if ``val_loss`` is empty (raised by ``min``).
    """
    # Materialise as lists so arrays and other iterables are accepted as well.
    train_loss = list(train_loss)
    val_loss = list(val_loss)

    epochs = range(1, len(train_loss) + 1)
    plt.plot(epochs, train_loss, 'b', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')

    # Position of the lowest validation loss, converted to a 1-based epoch number.
    best_index = val_loss.index(min(val_loss))
    best_epoch = best_index + 1
    best_loss = val_loss[best_index]

    # The label uses the same 1-based epoch as the marker's x coordinate.
    plt.plot(best_epoch, best_loss, 'g*', markersize=10, label=f'Min Val epoch: {best_epoch: 3d}')

    plt.title('Train ' + train_set_name + " Validate " + test_set_name)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [7]:
def cv_learn(X, y, extracted_features, n_hiddens, layer_size, n_epochs=50, n_splits=2, lr=0.001):
    """Estimate a suitable number of training epochs by K-fold cross-validation.

    For every fold a fresh ``RNNModel`` is trained for ``n_epochs`` epochs with one
    Adam step per training sequence. After each epoch the mean MSE over the fold's
    validation sequences is recorded, and the 1-based epoch with the lowest value is
    kept. The result is the mean of these per-fold best epochs, truncated to an int.

    Args:
        X: integer-indexable collection of sequences. ``X[i][1]['logratio']`` must
            provide ``to_numpy()`` (e.g. a pandas column).
        y: tensor of targets; row ``i`` belongs to ``X[i]``.
            NOTE(review): a non-finite target makes the MSE non-finite — confirm
            that such rows are filtered out before calling.
        extracted_features: RNN hidden-state size passed to ``RNNModel``.
        n_hiddens: number of hidden dense layers passed to ``RNNModel``.
        layer_size: dense layer width passed to ``RNNModel``.
        n_epochs: epochs to train per fold (default 50).
        n_splits: number of cross-validation folds (default 2).
        lr: Adam learning rate (default 0.001).

    Returns:
        The averaged best epoch count as an int, never less than 1.

    Raises:
        ValueError: if ``n_epochs`` is smaller than 1.
    """
    if n_epochs < 1:
        raise ValueError("n_epochs must be at least 1")

    def _to_input(item):
        # One feature per time step: column vector of shape (sequence length, 1).
        return torch.tensor(item[1]['logratio'].to_numpy(), dtype=torch.float32).reshape(-1, 1)

    # Convert every sequence once instead of once per epoch and fold.
    inputs = [_to_input(X[i]) for i in range(len(X))]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=123)
    loss_func = torch.nn.MSELoss()

    best_ites = []
    for train_index, val_index in kf.split(X):
        # Targets of this fold, in the same order as train_index / val_index.
        y_train, y_val = y[train_index], y[val_index]

        # Fresh model and optimizer for every fold.
        model = RNNModel(extracted_features=extracted_features, hidden_layers=n_hiddens, hidden_size=layer_size)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        train_losses = []
        val_losses = []
        for _ in range(n_epochs):
            # One optimisation step per training sequence of THIS fold.
            loss_sum = 0.0
            for pos, idx in enumerate(train_index):
                optimizer.zero_grad()
                output = model(inputs[idx])
                loss = loss_func(output[0], y_train[pos])
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()
            train_losses.append(loss_sum / len(train_index))

            # Mean validation loss on the held-out sequences of THIS fold.
            val_loss_sum = 0.0
            with torch.no_grad():
                for pos, idx in enumerate(val_index):
                    val_output = model(inputs[idx])
                    val_loss_sum += loss_func(val_output[0], y_val[pos]).item()
            val_losses.append(val_loss_sum / len(val_index))

        # Epochs are counted from 1.
        best_ites.append(int(np.argmin(val_losses)) + 1)

        # # plot
        # plot_loss(train_losses, val_losses, train_set_name='subtrain', test_set_name='val')

    # Average over folds; truncation could only yield 0 for an empty list, so
    # clamp to at least one epoch.
    if not best_ites:
        return 1
    return max(1, int(np.mean(best_ites)))

In [8]:
def train_model(X, y, extracted_features, n_hiddens, layer_size, n_ites):
    """Train a fresh ``RNNModel`` on the given sequences for ``n_ites`` epochs.

    Each epoch performs one Adam step (learning rate 0.001, MSE loss) per sequence,
    visiting the sequences in index order.

    Args:
        X: integer-indexable collection of sequences. ``X[i][1]['logratio']`` must
            provide ``to_numpy()`` (e.g. a pandas column).
        y: targets; ``y[i]`` is the target for ``X[i]``.
        extracted_features: RNN hidden-state size passed to ``RNNModel``.
        n_hiddens: number of hidden dense layers passed to ``RNNModel``.
        layer_size: dense layer width passed to ``RNNModel``.
        n_ites: number of passes over the data.

    Returns:
        The trained ``RNNModel``.
    """
    model = RNNModel(extracted_features=extracted_features, hidden_layers=n_hiddens, hidden_size=layer_size)
    loss_func = torch.nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Build the inputs from the X argument (not from a notebook-level variable)
    # and convert each sequence only once rather than once per epoch.
    inputs = [
        torch.tensor(X[i][1]['logratio'].to_numpy(), dtype=torch.float32).reshape(-1, 1)
        for i in range(len(X))
    ]

    # Training loop
    for _ in range(n_ites):
        for i, seq in enumerate(inputs):
            optimizer.zero_grad()
            output = model(seq)
            loss = loss_func(output[0], y[i])
            loss.backward()
            optimizer.step()

    return model

In [9]:
def get_df_stat(ldas1, ldas2, err_fold1_df, err_fold2_df, seqs, labels):
    """Build a per-sequence table of rounded lambdas, label counts and looked-up errors.

    Each lambda is rounded to the nearest multiple of 0.5. Its string form (for
    example ``'-2.0'`` or ``'3.5'``) is then used as the column name to read the
    error value for that sequence from the matching error table.

    Args:
        ldas1: one predicted lambda per sequence for fold 1.
        ldas2: one predicted lambda per sequence for fold 2.
        err_fold1_df: DataFrame whose row ``i`` belongs to ``seqs[i]`` and whose
            column names are the string forms of the rounded lambdas.
        err_fold2_df: same layout as ``err_fold1_df``, for fold 2.
        seqs: integer-indexable collection; ``seqs[i][0]`` is used as the sequence ID.
        labels: passed through unchanged to ``get_data``.

    Returns:
        DataFrame with columns ``sequenceID``, ``lda_fold1``, ``lda_fold2``,
        ``fold_1_total_labels``, ``fold_2_total_labels``, ``fold1_err``, ``fold2_err``.

    Raises:
        KeyError: if a rounded lambda has no matching column in the error table.
    """
    header = ['sequenceID', 'lda_fold1', 'lda_fold2', 'fold_1_total_labels', 'fold_2_total_labels', 'fold1_err', 'fold2_err']

    # Round once up front; the rounding does not depend on the loop variable, so
    # doing it per iteration only repeated the same work.
    ldas1 = [round(num*2)/2 for num in ldas1]
    ldas2 = [round(num*2)/2 for num in ldas2]

    rows = []
    for i in range(len(seqs)):
        # Positions 1/3 and 5/7 of get_data's result are unpacked as the negative /
        # positive label starts of fold 1 and fold 2; their lengths give the totals.
        _, neg_start_1, _, pos_start_1, _, neg_start_2, _, pos_start_2, _ = get_data(i, seqs, labels)
        fold1_total_labels = len(neg_start_1) + len(pos_start_1)
        fold2_total_labels = len(neg_start_2) + len(pos_start_2)

        # Row i of the error table, column named after the rounded lambda.
        fold1_err = err_fold1_df.iloc[i][str(ldas1[i])]
        fold2_err = err_fold2_df.iloc[i][str(ldas2[i])]

        rows.append([seqs[i][0], ldas1[i], ldas2[i], fold1_total_labels, fold2_total_labels, fold1_err, fold2_err])

    return pd.DataFrame(rows, columns=header)

In [10]:
def try_model(X1, X2, y1, y2, config, err_fold1_df, err_fold2_df, seqs, labels):
    """Train one model per fold for a configuration and tabulate the resulting errors.

    Args:
        X1, X2: training sequences for the fold-1 and fold-2 model.
        y1, y2: matching training targets.
        config: mapping with the keys ``'n_hiddens'``, ``'layer_size'`` and
            ``'n_features'``.
        err_fold1_df, err_fold2_df: error tables handed on to ``get_df_stat``.
        seqs: integer-indexable sequences to predict on; ``seqs[i][1]['logratio']``
            must provide ``to_numpy()``.
        labels: handed on to ``get_df_stat``.

    Returns:
        The DataFrame produced by ``get_df_stat`` for both models' predictions.
    """
    n_hiddens, layer_size, n_features = (
        config[key] for key in ('n_hiddens', 'layer_size', 'n_features')
    )

    # Cross-validated epoch selection is currently switched off:
    # best_no_ite_1 = cv_learn(X1, y1, n_features, n_hiddens, layer_size)
    # best_no_ite_2 = cv_learn(X2, y2, n_features, n_hiddens, layer_size)

    # Both models are trained for a fixed 2 epochs, fold 1 first.
    model1 = train_model(X1, y1, n_features, n_hiddens, layer_size, 2)
    model2 = train_model(X2, y2, n_features, n_hiddens, layer_size, 2)

    # Saving the trained weights is currently switched off:
    # torch.save(model1.state_dict(), '3_learned_models/3_rnn/model1_' + str(n_features) + '_' + str(n_hiddens) + '_' + str(layer_size) + '.pth')
    # torch.save(model2.state_dict(), '3_learned_models/3_rnn/model2_' + str(n_features) + '_' + str(n_hiddens) + '_' + str(layer_size) + '.pth')

    # One prediction per sequence and model, computed without gradient tracking.
    n_seqs = len(seqs)
    ldas1, ldas2 = np.zeros(n_seqs), np.zeros(n_seqs)
    with torch.no_grad():
        for idx in range(n_seqs):
            signal = torch.Tensor(seqs[idx][1]['logratio'].to_numpy()).reshape(-1,1)
            ldas1[idx] = model1(signal).numpy()[0][0]
            ldas2[idx] = model2(signal).numpy()[0][0]

    return get_df_stat(ldas1, ldas2, err_fold1_df, err_fold2_df, seqs, labels)

In [11]:
# Define the values for n_features, n_hiddens and layer_size
# Candidate values for each hyper-parameter of the search grid
n_features        = [8, 16]
n_hiddens_values  = [0, 1, 2, 3]
layer_size_values = [4, 8, 16, 32]

# Full Cartesian product, one dict per combination; n_features varies slowest
# and layer_size fastest.
configs = [
    {'n_features': feature_count, 'n_hiddens': hidden_count, 'layer_size': width}
    for feature_count in n_features
    for hidden_count in n_hiddens_values
    for width in layer_size_values
]

In [12]:
# getting dataframe of error count
# Error tables, one CSV per fold.
err_fold1_df, err_fold2_df = (
    pd.read_csv('1_training_data/errors_fold1_base_10.csv'),
    pd.read_csv('1_training_data/errors_fold2_base_10.csv'),
)

# Sequence and label data, both loaded through the project helper.
seqs, labels = (
    gen_data_dict('0_sequences_labels/signals.gz'),
    gen_data_dict('0_sequences_labels/labels.gz'),
)

In [13]:
# Evaluate every configuration with joblib (up to 32 parallel jobs). Both models
# are trained on the complete sequence set; results keep the order of `configs`.
dfs = Parallel(n_jobs=32)(
    delayed(try_model)(seqs, seqs, y1, y2, config, err_fold1_df, err_fold2_df, seqs, labels)
    for config in configs
)

In [None]:
# Report the two fold rates and their mean for every configuration.
avg_acc = []
for i, result_df in enumerate(dfs):
    # Only the first two of the six values returned by show_error_rate are used.
    rate1, rate2, _, _, _, _ = show_error_rate(result_df)
    mean_rate = (rate1 + rate2)/2
    avg_acc.append(mean_rate)

    config = configs[i]
    print("n_features: %2d \t n_hiddens: %1d \t layer_size: %2d \t fold1.test: %5.2f \t fold2.test: %5.2f \t avg_acc: %5.2f" %
          (config['n_features'], config['n_hiddens'], config['layer_size'], rate1, rate2, mean_rate))

In [None]:
# # Instantiate the model, define custom loss function, and optimizer
# model1 = RNNModel(extracted_features=8, hidden_layers=1, hidden_size=8)
# model2 = RNNModel(extracted_features=8, hidden_layers=1, hidden_size=8)

# squared_hinge_loss = SquaredHingeLoss()
# optimizer1 = optim.Adam(model1.parameters(), lr=0.001)
# optimizer2 = optim.Adam(model2.parameters(), lr=0.001)

# # Training loop
# record_loss1_test = []
# record_loss2_test = []
# min_loss_1_test = float('inf')
# min_loss_2_test = float('inf')
# for epoch in range(1000):
#     # Forward pass
#     outputs1 = torch.tensor([[0.0]])
#     outputs2 = torch.tensor([[0.0]])
#     for i in range(len(seqs)):
#         seq = torch.tensor(seqs[i][1]['logratio'].to_numpy(), dtype=torch.float32).reshape(-1,1)
#         outputs1 += model1(seq)
#         outputs2 += model2(seq)
    
#     # Compute the custom loss
#     loss_1 = squared_hinge_loss(outputs1, y1)
#     loss_2 = squared_hinge_loss(outputs2, y2)
#     loss_1_test = squared_hinge_loss(outputs1, y2)
#     loss_2_test = squared_hinge_loss(outputs2, y1)
    
#     # Backward pass and optimization
#     optimizer1.zero_grad()
#     loss_1.backward()
#     optimizer1.step()

#     optimizer2.zero_grad()
#     loss_2.backward()
#     optimizer2.step()

#     # record
#     record_loss1_test.append(loss_1_test.item())
#     record_loss2_test.append(loss_2_test.item())

#     # save models
#     if loss_1_test < min_loss_1_test:
#         min_loss_1_test = loss_1_test
#         torch.save(model1.state_dict(), '1.genome_saved_models/model1_rnn_best.pth')
    
#     if loss_2_test < min_loss_2_test:
#         min_loss_2_test = loss_2_test
#         torch.save(model2.state_dict(), '1.genome_saved_models/model2_rnn_best.pth')
    
#     # Print the loss every epoch
#     if (epoch) % 1 == 0:
#         print(f'Epoch {epoch:5d}, Loss_1: {loss_1.item():8.4f}, Loss_1_test: {loss_1_test.item():8.4f}, Loss_2: {loss_2.item():8.4f}, Loss_2_test: {loss_2_test.item():8.4f}')

In [None]:
# # Load model1
# model1 = RNNModel()
# model1.load_state_dict(torch.load('1.genome_saved_models/model1_rnn_best.pth'))
# model1.eval()  # Set the model to evaluation mode

# # Load model2
# model2 = RNNModel()
# model2.load_state_dict(torch.load('1.genome_saved_models/model2_rnn_best.pth'))
# model2.eval()  # Set the model to evaluation mode

In [None]:
# ldas1 = np.zeros(len(seqs))
# ldas2 = np.zeros(len(seqs))
# with torch.no_grad():
#     for i in range(len(seqs)):
#         seq = torch.tensor(seqs[i][1]['logratio'].to_numpy(), dtype=torch.float32).reshape(-1,1)
#         ldas1[i] = model1(seq).numpy()[0][0]
#         ldas2[i] = model2(seq).numpy()[0][0]

In [None]:
# seqs   = gen_data_dict('sequence_label_data/genome/signals.gz')
# labels = gen_data_dict('sequence_label_data/genome/labels.gz')

# header = ['sequenceID', 'fold_1_total_labels', 'fold_2_total_labels', 'fold_1_errs', 'fold_2_errs']

# for i in range(len(seqs)):
#     # generate data
#     sequence, neg_start_1, neg_end_1, pos_start_1, pos_end_1, neg_start_2, neg_end_2, pos_start_2, pos_end_2 = get_data(i, seqs=seqs, labels=labels)
#     sequence_length = len(sequence)-1

#     # vectors of cumulative sums
#     y, z = get_cumsum(sequence)

#     # get total labels
#     fold1_total_labels = len(neg_start_1) + len(pos_start_1)
#     fold2_total_labels = len(neg_start_2) + len(pos_start_2)

#     # run each lambda and record it into csv file
#     row  = [i, fold1_total_labels, fold2_total_labels]

#     chpnt_fold1 = opart(10**ldas2[i], sequence)
#     chpnt_fold2 = opart(10**ldas1[i], sequence)

#     err_1 = error_count(chpnt_fold1, neg_start_1, neg_end_1, pos_start_1, pos_end_1)
#     err_2 = error_count(chpnt_fold2, neg_start_2, neg_end_2, pos_start_2, pos_end_2)
    
#     row.append(sum(err_1))
#     row.append(sum(err_2))

#     write_to_csv('1.genome_learning_output/rnn.csv', header, row)