In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
# Seed the global NumPy and PyTorch random number generators with one shared
# value so that repeated runs start from the same random state.
SEED = 12345
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# Squared hinge loss for interval-valued targets
class SquaredHingeLoss(nn.Module):
    """Squared hinge loss of a prediction against a target interval.

    For each row, a prediction is penalised when it is closer than ``margin``
    to (or beyond) the lower or upper bound of its target interval.

    NOTE(review): the lower and upper hinge terms are added *before* squaring.
    This equals the sum of the two squared terms except when both hinges are
    active at once (possible only if ``high - low < 2 * margin``) -- confirm
    this is the intended form.

    Args:
        margin: distance a prediction must keep from each bound to incur no
            loss. Defaults to 1.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean squared hinge loss.

        Args:
            predicted: predictions, one per row of ``y``. A 1-D tensor of
                shape ``(N,)`` is treated as a column of shape ``(N, 1)``.
            y: 2-D tensor whose column 0 is the lower bound and column 1 the
                upper bound of each target interval.

        Returns:
            A scalar tensor: the mean over all rows of the squared hinge term.
        """
        # The bounds below are sliced as (N, 1) columns. A 1-D (N,) prediction
        # would broadcast against them into an (N, N) matrix and silently
        # compare every prediction with every interval, so make it a column.
        if predicted.dim() == 1:
            predicted = predicted.unsqueeze(1)

        low, high = y[:, 0:1], y[:, 1:2]
        loss_low = torch.relu(low - predicted + self.margin)
        loss_high = torch.relu(predicted - high + self.margin)
        loss = loss_low + loss_high
        return torch.mean(torch.square(loss))

In [4]:
class SimpleNN(nn.Module):
    """Linear model plus a linear map of hinge-transformed inputs.

    The output is ``linear_part(x) + non_linear_part(relu(x - knot))``.
    """

    def __init__(self, input_size, knot):
        """Build the two single-output linear layers.

        Args:
            input_size: number of input features.
            knot: value subtracted from the input before the ReLU; it must
                broadcast against the input batch. Stored as a plain
                attribute (not a parameter or registered buffer).
        """
        super().__init__()
        self.linear_part = nn.Linear(in_features=input_size, out_features=1, bias=True)
        self.non_linear_part = nn.Linear(in_features=input_size, out_features=1, bias=False)
        # Registered as a parameter (so it appears in parameters() and
        # state_dict()) but never read by forward().
        self.b2 = nn.Parameter(torch.randn(1))
        self.relu = nn.ReLU()
        self.knot = knot

    def forward(self, x):
        """Return the sum of the plain linear term and the hinge term for ``x``."""
        hinged = self.relu(x - self.knot)
        return self.linear_part(x) + self.non_linear_part(hinged)

In [5]:
# Label for this run; it is the first component of every prediction CSV file
# name written by the training loop.
category = 'proposed'

In [6]:
# Root folder of the training data; every sub-directory is one dataset.
folder_path = '../../training_data'

# os.listdir returns entries in arbitrary order. The global RNG is seeded only
# once, so the order in which datasets are processed changes each model's
# random initialisation -- sort the names to keep runs reproducible.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)
# Optional: drop the 'detailed' dataset from the run.
# datasets.remove('detailed')

In [7]:
# Columns selected from each dataset's features.csv and used as model inputs.
chosen_feature = [
    'log_range_value',
    'log_variance',
    'loglog_sum_diff',
    'loglog_count',
]

In [8]:
# Train one model per (dataset, test fold) and write its test-fold predictions
# to predictions/{category}.{dataset}.{test_fold}.{n_features}.csv.

# Create the output folder up front so a missing directory cannot make the
# CSV write fail after a long training run.
os.makedirs('predictions', exist_ok=True)

for dataset in datasets:
    # Load data from the same root folder that was scanned to build `datasets`
    dataset_dir = os.path.join(folder_path, dataset)
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv'))[['sequenceID'] + chosen_feature]
    target_df = pd.read_csv(os.path.join(dataset_dir, 'target.csv'))

    # NOTE(review): assumes the folds are labelled 1..K -- confirm against folds.csv
    n_folds = len(np.unique(folds_df['fold']))
    for test_fold in range(1, n_folds + 1):
        # Split data into training and test sets
        train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
        test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]

        # Create X_train, y_train, X_test
        # NOTE(review): features and targets are filtered independently, so this
        # relies on features.csv and target.csv listing sequenceIDs in the same
        # row order, and on target.csv's first column being the ID -- confirm.
        X_train = features_df_train[chosen_feature].to_numpy()
        y_train = target_df_train.iloc[:, 1:].to_numpy()
        X_test = features_df_test[chosen_feature].to_numpy()

        # Convert data to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

        # Initialize the model, loss function, and optimizer.
        # The per-feature mean of the training inputs is used as the hinge knot.
        input_dim = X_train.shape[1]
        knot = torch.mean(X_train_tensor, dim=0)
        model = SimpleNN(input_dim, knot)
        criterion = SquaredHingeLoss()
        optimizer = optim.Adam(model.parameters())

        # Training with early stopping on the training loss
        best_train_loss = float('inf')
        patience = 5000
        patience_counter = 0
        # state_dict() returns tensors that share storage with the live
        # parameters, so they must be cloned to obtain a real snapshot.
        # Starting from the initial weights also guarantees that this fold
        # never reuses a snapshot left over from a previous fold (e.g. when
        # the very first loss is NaN and no improvement is ever recorded).
        best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}

        model.train()
        for epoch in range(500000):
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)

            # Early stopping check. It runs before the optimizer step so the
            # snapshot holds exactly the weights that produced `train_loss`.
            train_loss = loss.item()
            if train_loss < best_train_loss:
                best_train_loss = train_loss
                best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Load the best model
        model.load_state_dict(best_state)

        # Predict on the test set
        model.eval()
        with torch.no_grad():
            pred_lldas = model(X_test_tensor).numpy().ravel()

        # Save to CSV
        lldas_df = pd.DataFrame(list(zip(features_df_test['sequenceID'], pred_lldas)), columns=['sequenceID', 'llda'])
        lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.{len(chosen_feature)}.csv', index=False)