In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
# Seed PyTorch and NumPy from one shared value so reruns draw the same random numbers
SEED = 12345
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Hinged Square Loss
class SquaredHingeLoss(nn.Module):
    """Squared hinge penalty for predictions that leave a target interval.

    Each row of ``y`` supplies a lower bound (column 0) and an upper bound
    (column 1). A prediction is penalised once it comes within ``margin`` of
    either bound (or crosses it); the two hinge terms are added together
    first, the sum is squared, and the result is averaged over all elements.

    Args:
        margin: distance from each bound at which the penalty starts.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean squared hinge loss as a scalar tensor.

        Args:
            predicted: model outputs; assumed to be a column of shape (n, 1)
                so it lines up row-wise with the bounds -- TODO confirm.
            y: 2-D tensor; column 0 holds lower bounds, column 1 upper bounds.
        """
        # Slice with ranges (not plain indices) so both bounds stay 2-D columns.
        lower_bound = y[:, 0:1]
        upper_bound = y[:, 1:2]

        # Positive only when the prediction is less than `margin` above the lower bound.
        below_penalty = torch.relu(lower_bound - predicted + self.margin)
        # Positive only when the prediction is less than `margin` below the upper bound.
        above_penalty = torch.relu(predicted - upper_bound + self.margin)

        # The hinge terms are summed before squaring (not squared individually).
        combined = below_penalty + above_penalty
        return combined.square().mean()

In [4]:
class SimpleNN(nn.Module):
    """Small feed-forward regressor: input -> 2 hidden units -> 1 output.

    The hidden layer is followed by a LeakyReLU (negative slope 0.01); the
    output layer is purely linear.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layers are created in this order so seeded weight initialisation is unchanged.
        self.fc1 = nn.Linear(input_size, 2)
        self.fc2 = nn.Linear(2, 1)
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)

    def forward(self, x):
        """Map a batch of feature rows to one output value per row."""
        hidden = self.fc1(x)
        activated = self.leaky_relu(hidden)
        return self.fc2(activated)

In [5]:
# Run label; it becomes the first component of every prediction CSV's filename.
category = 'proposed'

In [6]:
# dataset = 'ATAC_JV_adipose'
# chosen_feature = ["max_diff", "loglog_sum_diff", "log_range_value"]

In [7]:
# dataset = 'CTCF_TDH_ENCODE'
# chosen_feature = ["max_diff", "loglog_sum_diff", "mean"]

In [8]:
# dataset = 'detailed'
# chosen_feature = ["log_range_value", "log_variance", "autocorr", "loglog_sum_diff", "log_iqr", "log_unique_count", "log_mean_diff", "log_max_diff", "percentile_75"]

In [9]:
# dataset = 'H3K27ac-H3K4me3_TDHAM_BP'
# chosen_feature = ["log_mean", "log_variance", "log_range_value", "log_unique_count"]

In [10]:
# dataset = 'H3K36me3_AM_immune'
# chosen_feature = ["log_mean", "log_variance", "log_range_value", "log_unique_count", "autocorr"]

In [11]:
# dataset = 'H3K4me3_PGP_immune'
# chosen_feature = ["log_mean", "log_variance", "log_range_value", "log_unique_count", "autocorr", "log_max_diff"]

In [12]:
# dataset = 'H3K4me3_XJ_immune'
# chosen_feature = ["log_mean", "log_variance", "log_range_value", "log_max_diff"]

In [13]:
# Active configuration: the dataset folder to read from and the feature
# columns to keep from its features.csv.
dataset = 'systematic'
chosen_feature = [
    "log_range_value",
    "log_variance",
    "autocorr",
    "loglog_sum_diff",
    "log_iqr",
    "log_unique_count",
    "log_mean_diff",
    "log_max_diff",
]

In [14]:
# Load data
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
features_df = pd.read_csv(f'../../training_data/{dataset}/features.csv')[['sequenceID'] + chosen_feature]
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Look targets up by sequenceID so the y rows can be taken in exactly the order
# of the X rows, even if target.csv is sorted differently from features.csv.
# drop=False keeps the sequenceID column in place, so the positional slice
# iloc[:, 1:] used below still selects the same columns.
target_by_id = target_df.set_index('sequenceID', drop=False)

# Early-stopping settings
max_epochs = 500000  # hard cap on full-batch updates per fold
patience = 5000      # stop after this many consecutive epochs without a new best loss

# DataFrame.to_csv does not create missing parent directories
os.makedirs('predictions', exist_ok=True)

# Iterate over the fold labels actually present rather than assuming they are 1..K
for test_fold in np.unique(folds_df['fold']):
    # Split data into training and test sets
    train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
    test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

    features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
    features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
    # Pull the target rows by ID, in feature-row order, so X_train[i] and
    # y_train[i] always describe the same sequence (KeyError if a target is missing).
    target_df_train = target_by_id.loc[features_df_train['sequenceID'].to_numpy()]

    # Create X_train, y_train, X_test
    X_train = features_df_train[chosen_feature].to_numpy()
    y_train = target_df_train.iloc[:, 1:].to_numpy()
    X_test = features_df_test[chosen_feature].to_numpy()

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    # Initialize the model, loss function, and optimizer
    input_dim = X_train.shape[1]
    model = SimpleNN(input_dim)
    criterion = SquaredHingeLoss()
    optimizer = optim.Adam(model.parameters())

    # Training with early stopping (full-batch; the monitored quantity is the training loss).
    # Start from a copy of the freshly initialised weights so best_model is
    # always defined for this fold (a NaN loss never compares as "< best") and
    # can never be a leftover from the previous fold.
    best_model = {name: tensor.clone() for name, tensor in model.state_dict().items()}
    best_train_loss = float('inf')
    patience_counter = 0

    model.train()
    for epoch in range(max_epochs):
        optimizer.zero_grad()

        # Forward pass
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)

        # Early stopping check. It runs BEFORE the parameter update so that the
        # snapshot holds the very weights that produced train_loss.
        train_loss = loss.item()
        if train_loss < best_train_loss:
            best_train_loss = train_loss
            # state_dict() hands back references to the live parameter tensors,
            # which optimizer.step() modifies in place; clone them to freeze a copy.
            best_model = {name: tensor.clone() for name, tensor in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Load the best model
    model.load_state_dict(best_model)

    # Predict on the test set (no autograd graph is needed for inference)
    model.eval()
    with torch.no_grad():
        pred_lldas = model(X_test_tensor).numpy().ravel()

    # Save to CSV: one row per test sequence, named <category>.<dataset>.<fold>.<n_features>.csv
    lldas_df = pd.DataFrame(list(zip(features_df_test['sequenceID'], pred_lldas)), columns=['sequenceID', 'llda'])
    lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.{len(chosen_feature)}.csv', index=False)