In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from joblib import Parallel, delayed

In [2]:
# Hinged Square Loss: Loss function
class SquaredHingeLoss(nn.Module):
    """Squared hinge loss for interval targets.

    Each row of ``y`` holds a lower bound in column 0 and an upper bound in
    column 1. A prediction is penalised by how far it falls below
    ``low + margin`` or above ``high - margin``; the per-row penalty is squared
    and the mean over all entries is returned.
    """

    def __init__(self, margin=0):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        lower_bound = y[:, 0:1]
        upper_bound = y[:, 1:2]
        # One-sided violations; relu zeroes out the side that is satisfied.
        below = torch.relu(lower_bound - predicted + self.margin)
        above = torch.relu(predicted - upper_bound + self.margin)
        violation = below + above
        return torch.square(violation).mean()

In [3]:
class MLP(nn.Module):
    """Fully connected network: ``Linear -> ReLU`` per hidden layer, then a single-output ``Linear``.

    Parameters
    ----------
    input_size : int
        Number of input features.
    layer_sizes : iterable of int
        Width of each hidden layer, in order. An empty iterable yields a
        plain linear model.
    """

    def __init__(self, input_size, layer_sizes):
        super().__init__()
        # Consecutive entries of `widths` give the (in, out) sizes of each hidden layer.
        widths = [input_size, *layer_sizes]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Final projection to one scalar prediction per sample.
        stack.append(nn.Linear(widths[-1], 1))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        return self.model(x)

In [4]:
folder_path = '../../data'
# A dataset is any immediate sub-directory of the data folder; plain files are skipped.
datasets = list(
    filter(
        lambda name: os.path.isdir(os.path.join(folder_path, name)),
        os.listdir(folder_path),
    )
)

In [5]:
def train_and_evaluate(train_idx, val_idx, model, X_train, y_train, max_epochs=20000, patience=20,
                       criterion=None):
    """Train ``model`` full-batch with Adam, early-stopping on a validation split.

    Parameters
    ----------
    train_idx, val_idx
        Index selectors applied to ``X_train`` / ``y_train`` to obtain the
        sub-train and validation rows.
    model : torch.nn.Module
        Network to optimise. It is trained in place and also returned.
    X_train, y_train
        Features and targets; the selected rows are converted with
        ``torch.tensor(..., dtype=torch.float32)``.
    max_epochs : int
        Upper bound on the number of full-batch optimisation steps.
    patience : int
        Training stops once the validation loss has failed to improve for
        this many consecutive epochs.
    criterion : callable, optional
        Loss called as ``criterion(outputs, targets)``. Defaults to
        ``SquaredHingeLoss()``.

    Returns
    -------
    (model, best_val_loss)
        ``model`` carries the parameters from the epoch with the lowest
        validation loss. ``best_val_loss`` is a 0-dim tensor; it is ``inf``
        when the validation loss never improved (for example when it was NaN
        throughout), in which case the model keeps its last parameters.
    """
    # Initialize the optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # Loss function
    if criterion is None:
        criterion = SquaredHingeLoss()

    # The splits never change between epochs, so build the tensors once
    # instead of re-copying the arrays on every iteration.
    X_sub = torch.tensor(X_train[train_idx], dtype=torch.float32)
    y_sub = torch.tensor(y_train[train_idx], dtype=torch.float32)
    X_val = torch.tensor(X_train[val_idx], dtype=torch.float32)
    y_val = torch.tensor(y_train[val_idx], dtype=torch.float32)

    # Early stopping parameters
    best_val_loss = torch.tensor(float('inf'))
    patience_counter = 0
    best_model_state = None

    # Training loop
    for epoch in range(max_epochs):
        model.train()
        optimizer.zero_grad()

        # Forward pass, loss, backward pass and parameter update
        loss = criterion(model(X_sub), y_sub)
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val), y_val)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors,
            # which the optimizer keeps updating in place. Clone them so the
            # snapshot really is the best epoch and not the latest one.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1

        # Stop training if validation loss does not improve
        if patience_counter >= patience:
            break

    # Restore best model before returning (nothing to restore if no epoch
    # ever produced an improving validation loss).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model, best_val_loss


def process_dataset(dataset):
    """Run the nested cross-validation experiment for one dataset and write its results.

    For every test fold listed in ``folds.csv`` the remaining rows are
    standardised and split three ways with ``KFold``. On each split every
    combination of hidden-layer count and width is trained with
    ``train_and_evaluate`` and the model with the lowest validation loss is
    kept. The kept models' test-set outputs are averaged and written to
    ``predictions/{dataset}.{test_fold}.csv``. The validation loss of every
    trained configuration is written to ``stats/{dataset}.csv``.

    Parameters
    ----------
    dataset : str
        Name of the sub-directory of ``../../data`` holding ``folds.csv``,
        ``features.csv`` and ``targets.csv``.

    Raises
    ------
    RuntimeError
        If, for some test fold, no configuration on any split produced a
        validation loss below infinity (so there is no model to predict with).
    """
    # Load data
    folds_df = pd.read_csv(f'../../data/{dataset}/folds.csv')
    features_df = pd.read_csv(f'../../data/{dataset}/features.csv').astype(np.float32)
    target_df = pd.read_csv(f'../../data/{dataset}/targets.csv').astype(np.float32)

    n_layers = [1, 2, 3, 4]
    n_neurons = [1, 2, 4, 8, 16, 32, 64, 128]

    kf = KFold(n_splits=3, shuffle=True, random_state=42)

    # to_csv does not create missing folders, so make sure both output
    # locations exist before any training time is spent.
    os.makedirs('predictions', exist_ok=True)
    os.makedirs('stats', exist_ok=True)

    # List to track stats
    stats = []

    # Iterate over test folds
    for test_fold in sorted(np.unique(folds_df['fold'])):
        train_indices = folds_df[folds_df['fold'] != test_fold].index
        test_indices = folds_df[folds_df['fold'] == test_fold].index

        X_train = features_df.loc[train_indices].values
        X_test = features_df.loc[test_indices].values
        y_train = target_df.loc[train_indices].values

        # Standardize using statistics from the training rows only, then
        # apply the same transformation to the test rows.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        best_models = []

        # Train models for each pair of subtrain/val
        for train_idx, val_idx in kf.split(X_train_scaled):
            best_val_loss = float('inf')
            best_model = None

            # Try all combinations of layers and neurons
            for n_layer in n_layers:
                for n_neuron in n_neurons:
                    model = MLP(input_size=X_train_scaled.shape[1], layer_sizes=[n_neuron] * n_layer)
                    trained_model, val_loss = train_and_evaluate(train_idx, val_idx, model, X_train_scaled, y_train)
                    # Plain float: works whether a tensor or a Python float came back.
                    val_loss = float(val_loss)

                    # Track the best model based on validation loss
                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        best_model = trained_model

                    # Record this configuration's own validation loss (not the
                    # running minimum), so the layers/neurons columns are meaningful.
                    stats.append([test_fold, n_layer, n_neuron, val_loss])

            # A split where every configuration diverged contributes no model.
            if best_model is not None:
                best_models.append(best_model)

        if not best_models:
            raise RuntimeError(
                f"No model reached a finite validation loss for dataset {dataset!r}, test fold {test_fold!r}"
            )

        # Get the outputs for the test set using the best model of each split
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
        model_outputs = []
        for model in best_models:
            model.eval()
            with torch.no_grad():
                model_outputs.append(model(X_test_tensor).numpy())

        # Compute the mean output of the best models
        target_mat_pred = np.mean(np.array(model_outputs), axis=0).flatten()
        prediction = pd.DataFrame({'pred': target_mat_pred})
        prediction.to_csv(f"predictions/{dataset}.{test_fold}.csv", index=False)

    # Save the stats to a CSV file
    stats_df = pd.DataFrame(stats, columns=["fold", "layers", "neurons", "val_loss"])
    stats_df.to_csv(f"stats/{dataset}.csv", index=False)

In [None]:
# One task per dataset, spread over all available CPU cores (n_jobs=-1).
dataset_jobs = (delayed(process_dataset)(dataset) for dataset in datasets)
Parallel(n_jobs=-1)(dataset_jobs)