In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import os

In [2]:
# Pin the global RNGs of PyTorch and NumPy so that repeated runs of the
# notebook draw the same random numbers (e.g. the model's random intercept).
SEED = 12345
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Squared hinge loss for interval-valued targets
class SquaredHingeLoss(nn.Module):
    """Penalise predictions that are not at least `margin` inside [low, high].

    `y` carries the lower bound in column 0 and the upper bound in column 1.
    The two one-sided hinge terms are added, the sum is squared, and the
    result is averaged over all elements.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        # Slicing (rather than indexing) keeps a trailing dimension of size 1
        # so the bounds broadcast against `predicted`.
        lower_bound = y[:, :1]
        upper_bound = y[:, 1:2]

        # Each term is zero once the prediction clears its bound by `margin`.
        shortfall = torch.relu(lower_bound - predicted + self.margin)
        overshoot = torch.relu(predicted - upper_bound + self.margin)

        violation = shortfall + overshoot
        return torch.square(violation).mean()

In [4]:
class MLPRegressor(nn.Module):
    """Additive power-law regressor: ``y = sum_j w1[j] * X[:, j] ** w2[j] + w3``.

    Note: despite the name there are no hidden layers; the model has one
    coefficient and one exponent per feature plus a scalar intercept.

    Args:
        n: number of input features.
    """

    def __init__(self, n):
        super().__init__()
        # Learnable parameters
        self.w1 = nn.Parameter(torch.ones(1, n))  # per-feature coefficient, (1, n) broadcasts over the batch
        self.w2 = nn.Parameter(torch.ones(1, n))  # per-feature exponent, (1, n) broadcasts over the batch
        self.w3 = nn.Parameter(torch.randn(1))    # scalar intercept

    def forward(self, X):
        """Return one prediction per row of `X`.

        Args:
            X: tensor of shape (batch_size, n).

        Returns:
            Tensor of shape (batch_size, 1).

        Raises:
            ValueError: if `X` is not 2-D or its second dimension is not `n`.
        """
        n = self.w1.size(1)
        if X.dim() != 2 or X.size(1) != n:
            raise ValueError(f"Expected input size (_, {self.w1.size(1)}), but got {X.size()}")

        # Per-feature contributions, shape (batch_size, n).
        # A negative base with a non-integer exponent yields NaN.
        contributions = self.w1 * (X ** self.w2)

        # Collapse the feature axis so every sample gets exactly one prediction.
        # For n == 1 this is numerically identical to the un-summed expression.
        return contributions.sum(dim=1, keepdim=True) + self.w3

In [5]:
# Tag used as the leading component of each prediction CSV file name
category = 'proposed'

In [6]:
# Root directory that holds one sub-directory per dataset
folder_path = '../../training_data'

# os.listdir returns entries in arbitrary order. Sorting makes the processing
# order, and therefore the order in which the seeded RNG is consumed, the same
# on every machine and every run.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)

In [7]:
# Number of feature columns to use: the first n_features columns after the
# leading column of features_sorted.csv are selected as model inputs
n_features = 1

In [8]:
# Cross-validated training. For every dataset and every held-out fold: fit the
# model on the remaining folds, then write the held-out predictions to
# predictions/{category}.{dataset}.{test_fold}.{number of features}.csv
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

max_epochs = 20000  # hard cap on optimisation steps per fold
patience = 500      # stop after this many consecutive epochs without a new best training loss

for dataset in datasets:

    # Load data from the same root directory that `datasets` was listed from
    dataset_dir = os.path.join(folder_path, dataset)
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    target_df = pd.read_csv(os.path.join(dataset_dir, 'target.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features_sorted.csv'))
    chosen_feature = features_df.columns[1:n_features+1]
    features_df = features_df[['sequenceID'] + list(chosen_feature)]

    # Targets keyed by sequenceID so they can be put in the row order of the
    # features; the loss reads column 0 as the lower and column 1 as the upper bound.
    targets_by_id = target_df.set_index('sequenceID')

    # Iterate over the fold labels actually present instead of assuming 1..K
    for test_fold in sorted(folds_df['fold'].unique()):
        # Split data into training and test sets
        train_ids = folds_df.loc[folds_df['fold'] != test_fold, 'sequenceID']
        test_ids = folds_df.loc[folds_df['fold'] == test_fold, 'sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]

        # Look the targets up by ID so that row i of y_train belongs to row i of
        # X_train even when the two CSV files list the sequences in different
        # orders. A training ID without a target raises KeyError here.
        target_df_train = targets_by_id.loc[features_df_train['sequenceID'].to_numpy()]

        # Create X_train, y_train, X_test
        X_train = features_df_train[chosen_feature].to_numpy()
        y_train = target_df_train.to_numpy()
        X_test = features_df_test[chosen_feature].to_numpy()

        # Preprocess by scaling the data with MinMaxScaler (fitted on the training rows only).
        # NOTE(review): test values outside the training range land outside [1, 10];
        # a negative scaled value raised to a non-integer power gives NaN.
        scaler = MinMaxScaler(feature_range=(1, 10))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Convert data to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

        # Initialize the model, loss function, and optimizer
        input_dim = X_train.shape[1]
        model = MLPRegressor(input_dim)
        criterion = SquaredHingeLoss()
        optimizer = optim.Adam(model.parameters())

        # Training with early stopping based on training loss.
        # state_dict() hands back references to the live parameter tensors, so
        # every snapshot clones them; otherwise the "best" state would silently
        # follow each later optimizer step. Starting from the initial state
        # also ensures a previous fold's snapshot is never loaded when the
        # loss never improves (e.g. it is NaN from the first epoch).
        best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        best_train_loss = float('inf')
        patience_counter = 0

        for epoch in range(max_epochs):
            model.train()
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            train_loss = loss.item()

            # The loss belongs to the current parameters, so the snapshot is
            # taken before the optimizer changes them.
            if train_loss < best_train_loss:
                best_train_loss = train_loss
                best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Load the best model
        model.load_state_dict(best_model)

        # Predict on the test set (no autograd graph needed)
        model.eval()
        with torch.no_grad():
            pred_lldas = model(X_test_tensor).numpy().ravel()

        # zip() below would silently truncate, pairing IDs with the wrong values
        if pred_lldas.shape[0] != len(features_df_test):
            raise ValueError(
                f"Model produced {pred_lldas.shape[0]} predictions for "
                f"{len(features_df_test)} test sequences"
            )

        # Save to CSV
        lldas_df = pd.DataFrame(list(zip(features_df_test['sequenceID'], pred_lldas)), columns=['sequenceID', 'llda'])
        lldas_df.to_csv(
            os.path.join(output_dir, f'{category}.{dataset}.{test_fold}.{len(chosen_feature)}.csv'),
            index=False,
        )