In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [2]:
# Seed both RNGs used in this notebook (PyTorch and NumPy) with one shared
# value so repeated runs start from the same random state.
SEED = 12345
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Hinged Square Loss
class SquaredHingeLoss(nn.Module):
    """Squared hinge loss for interval targets, weighted by interval length.

    Each row of ``y`` supplies a lower bound (column 0) and an upper bound
    (column 1). A prediction is penalised when it is closer than ``margin``
    to either bound, or outside it. The per-row penalty is multiplied by
    ``1 + 1 / (upper - lower)``, squared, and averaged over all rows.

    Args:
        margin: distance the prediction must keep from each bound to incur
            no penalty.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean squared weighted hinge penalty as a scalar tensor.

        Args:
            predicted: predictions, combined elementwise with the
                ``y[:, 0:1]`` / ``y[:, 1:2]`` column slices.
            y: 2-D tensor whose columns 0 and 1 are the lower and upper bounds.
        """
        lower = y[:, 0:1]
        upper = y[:, 1:2]

        # One-sided hinge terms: positive only when the margin is violated.
        below_penalty = torch.relu(lower - predicted + self.margin)
        above_penalty = torch.relu(predicted - upper + self.margin)

        # Narrower intervals get a larger weight; an infinite width gives weight 1.
        width_weight = 1 + 1 / (upper - lower)

        weighted = (below_penalty + above_penalty) * width_weight
        return torch.square(weighted).mean()

In [4]:
class MLPRegressor(nn.Module):
    """Fully connected network with ReLU after every layer except the last.

    Args:
        layer_sizes: sequence of layer widths ``[input, hidden..., output]``.
            One ``nn.Linear`` is created per consecutive pair, so
            ``[d, 50, 1]`` yields ``Linear(d, 50)`` and ``Linear(50, 1)``,
            and ``[d, k]`` yields a single linear map.

    Raises:
        ValueError: if ``layer_sizes`` has fewer than two entries.
    """

    def __init__(self, layer_sizes):
        super(MLPRegressor, self).__init__()
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )

        # Pair each width with the next one. Special-casing the input and
        # output layers would create two Linear(in, out) layers for a
        # two-element list and make forward() fail with a shape mismatch.
        self.layers = nn.ModuleList(
            [
                nn.Linear(n_in, n_out)
                for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
            ]
        )

    def forward(self, x):
        """Apply every layer in order, with ReLU on all but the final one."""
        for layer in self.layers[:-1]:
            x = torch.relu(layer(x))
        return self.layers[-1](x)

In [5]:
# Label for this run; it is the first component of every prediction CSV
# filename written by the training loop (predictions/{category}.{dataset}...).
category = 'proposed'

In [6]:
# Root directory; every sub-directory in it is treated as one dataset.
folder_path = '../../training_data'

# os.listdir returns entries in arbitrary, filesystem-dependent order. The
# RNG is seeded once for the whole notebook, so the weights each dataset's
# model starts from depend on processing order. Sort to keep runs reproducible.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)

In [7]:
# Columns of features.csv used as model inputs. The number of entries also
# appears in the prediction filenames (len(chosen_feature)).
chosen_feature = ['loglog_count']
# Alternative feature sets -- uncomment exactly one assignment to switch:
# chosen_feature = ['loglog_count', 'log_variance']
# chosen_feature = ['loglog_count', 'log_variance', 'log_range_value', 'loglog_sum_diff']

In [8]:
def _clone_state_dict(model):
    """Return an independent copy of ``model``'s current parameters and buffers."""
    return {key: value.detach().clone() for key, value in model.state_dict().items()}


def _fit_with_early_stopping(model, criterion, optimizer, X_fit, y_fit, X_val, y_val,
                             max_epochs=20000, patience=500):
    """Train ``model`` full-batch, then restore the weights with the lowest validation loss.

    Args:
        model: network optimised in place.
        criterion: callable ``criterion(predictions, targets)`` returning a scalar loss tensor.
        optimizer: optimizer already bound to ``model``'s parameters.
        X_fit, y_fit: tensors used for the gradient steps.
        X_val, y_val: tensors used only to monitor the validation loss.
        max_epochs: upper bound on the number of gradient steps.
        patience: stop after this many consecutive epochs without a new best
            validation loss.

    Returns:
        The lowest validation loss observed (``inf`` if no epoch ever improved
        on the initial value, e.g. when the loss is NaN throughout).
    """
    best_val_loss = float('inf')
    # Start from a real snapshot so there is always something to restore.
    best_state = _clone_state_dict(model)
    epochs_without_improvement = 0

    for _ in range(max_epochs):
        # One full-batch gradient step
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_fit), y_fit)
        loss.backward()
        optimizer.step()

        # Validation needs no autograd graph
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val), y_val).item()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which optimizer.step() keeps updating in place. Clone them, or the
            # "best" snapshot silently becomes the final weights.
            best_state = _clone_state_dict(model)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            break

    model.load_state_dict(best_state)
    return best_val_loss


# DataFrame.to_csv does not create missing directories.
os.makedirs('predictions', exist_ok=True)

for dataset in datasets:
    # Load data
    dataset_dir = os.path.join(folder_path, dataset)
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv'))[['sequenceID'] + chosen_feature]
    target_df = pd.read_csv(os.path.join(dataset_dir, 'target.csv'))
    # Index targets by sequenceID so rows can be matched to features by key.
    # Assumes the remaining columns are the (lower, upper) bounds -- the same
    # layout the positional slice iloc[:, 1:] relied on; confirm against target.csv.
    targets_by_id = target_df.set_index('sequenceID')

    # Iterate over the fold labels actually present (identical to 1..K when the
    # folds are numbered that way).
    for test_fold in np.unique(folds_df['fold']):
        # Split data into training and test sets
        train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
        test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        # Look targets up in the feature rows' order: features.csv and
        # target.csv are read independently, so row position alone does not
        # guarantee that X and y rows describe the same sequence.
        target_df_train = targets_by_id.loc[features_df_train['sequenceID']]

        # create X_train, y_train, X_test
        X_train = features_df_train[chosen_feature].to_numpy()
        y_train = target_df_train.to_numpy()
        X_test = features_df_test[chosen_feature].to_numpy()

        # Preprocess by standardizing the data (statistics from the training rows only)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Split train set into subtrain and validation (8:2 ratio)
        X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=12345)

        # Convert data to PyTorch tensors
        X_subtrain_tensor = torch.tensor(X_subtrain, dtype=torch.float32)
        y_subtrain_tensor = torch.tensor(y_subtrain, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

        # Initialize the model, loss function, and optimizer
        input_dim = X_subtrain.shape[1]
        model = MLPRegressor([input_dim, 50, 1])
        criterion = SquaredHingeLoss()
        optimizer = optim.Adam(model.parameters())

        # Training with early stopping; the model ends up holding its best weights
        _fit_with_early_stopping(
            model, criterion, optimizer,
            X_subtrain_tensor, y_subtrain_tensor,
            X_val_tensor, y_val_tensor,
            max_epochs=20000, patience=500,
        )

        # Predict on the test set
        model.eval()
        with torch.no_grad():
            pred_lldas = model(X_test_tensor).numpy().ravel()

        # save to csv
        lldas_df = pd.DataFrame(list(zip(features_df_test['sequenceID'], pred_lldas)), columns=['sequenceID', 'llda'])
        lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.{len(chosen_feature)}.csv', index=False)