In [1]:
import lzma
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [2]:
# Name of the dataset sub-directory; interpolated into the data and output paths used by later cells.
dataset = 'detailed'

In [3]:
# Squared hinge loss for interval-valued targets
class SquaredHingeLoss(nn.Module):
    """Squared hinge penalty for predictions that leave a target interval.

    Column 0 of ``y`` is treated as the lower bound and column 1 as the upper
    bound. A prediction is penalised when it is not at least ``margin`` above
    the lower bound and/or not at least ``margin`` below the upper bound.
    """

    def __init__(self, margin=1):
        """Store the margin added to each one-sided hinge term."""
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean over all elements of the squared summed hinge terms.

        Args:
            predicted: tensor of predictions, broadcast against the bound columns.
            y: tensor whose second dimension holds ``[lower, upper]``.

        Returns:
            A scalar tensor.
        """
        lower_bound = y[:, 0:1]
        upper_bound = y[:, 1:2]

        # One-sided violations; each is zero when the prediction clears the bound by `margin`.
        below_violation = torch.relu(lower_bound - predicted + self.margin)
        above_violation = torch.relu(predicted - upper_bound + self.margin)

        # The two terms are summed first and squared afterwards.
        combined = below_violation + above_violation
        return combined.square().mean()

In [4]:
# Recurrent regressor: ReLU RNN followed by a small two-layer head
class RNNModel(nn.Module):
    """RNN that maps a sequence to one bounded scalar per batch element.

    The RNN output at the last time step is passed through
    Linear(hidden_size, 8) -> ReLU -> Linear(8, 1), and the result is squashed
    with ``10 * tanh`` so that predictions lie in [-10, 10].
    """

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the recurrent layer and the two fully connected layers."""
        super().__init__()
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            nonlinearity='relu',
            batch_first=True,
        )
        self.fc1 = nn.Linear(hidden_size, 8)
        self.fc2 = nn.Linear(8, 1)

    def forward(self, x):
        """Run the model.

        Args:
            x: tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, 1).
        """
        hidden_states, _ = self.rnn(x)         # (batch_size, seq_length, hidden_size)
        final_state = hidden_states[:, -1]     # (batch_size, hidden_size): last time step only
        features = torch.relu(self.fc1(final_state))   # (batch_size, 8)
        return 10 * torch.tanh(self.fc2(features))     # (batch_size, 1), bounded to [-10, 10]

In [5]:
# Function to test the model
def test_model(model, inputs):
    model.eval()
    predictions = []

    with torch.no_grad():
        for seq_input in inputs:
            seq_input = seq_input.unsqueeze(0).unsqueeze(-1)
            output_seq = model(seq_input)
            predictions.append(output_seq.item())

    return predictions

In [6]:
def get_loss_value(model, seqs, y, criterion):
    """Return the mean per-sequence loss of ``model`` over ``seqs``.

    The model is put in evaluation mode and gradients are disabled. Each
    sequence is evaluated on its own (batch size 1) against ``y[i]``.

    Args:
        model: module taking a (1, seq_length, 1) tensor.
        seqs: sized collection of sequence tensors.
        y: indexable targets, where ``y[i]`` belongs to ``seqs[i]``.
        criterion: callable ``(prediction, target) -> scalar tensor``.

    Returns:
        The summed loss divided by ``len(seqs)``.
    """
    model.eval()
    running_total = 0  # accumulated loss over all sequences
    with torch.no_grad():
        for idx, sequence in enumerate(seqs):
            # Targets get a leading batch axis and a trailing axis.
            expected = y[idx].unsqueeze(0).unsqueeze(-1)
            batched = sequence.unsqueeze(0).unsqueeze(-1)  # Shape: (1, seq_length, input_size)
            running_total += criterion(model(batched), expected).item()

    return running_total / len(seqs)

In [7]:
# Read the xz-compressed profiles CSV of the selected dataset; the archive is
# opened in text mode and parsed by pandas from the open handle.
file_path = f'../../sequence_data/{dataset}/profiles.csv.xz'
with lzma.open(file_path, 'rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the groupby as a tuple of (sequenceID, sub-DataFrame) pairs so it
# can be iterated repeatedly; the training cell reads each group's 'signal' column.
# NOTE: with pandas' default groupby settings the groups come out ordered by
# sorted sequenceID, which need not match the row order of the other CSV files.
seqs = tuple(signal_df.groupby('sequenceID'))

In [None]:
# Cross-validated training: for every fold, fit an RNN on the remaining folds
# (with early stopping on a held-out validation split), then write the
# predictions for the held-out fold to a CSV file.

# --- Configuration -----------------------------------------------------------
SEED = 42                    # seeds the Python/torch RNGs per fold and the subtrain/validation split
HIDDEN_SIZE = 4              # number of hidden units in the RNN
NUM_LAYERS = 1               # number of stacked RNN layers
MAX_EPOCHS = 1000            # hard cap on training epochs per fold
PATIENCE = 50                # epochs without validation improvement before stopping
VAL_FRACTION = 0.2           # share of the training folds held out for validation
LOG_EVERY = 1                # print the losses every LOG_EVERY epochs
OUTPUT_DIR = 'predictions'   # directory receiving one CSV per test fold

# Load data
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Per-sequence lookups keyed by sequenceID, built once instead of once per fold.
# Keying by ID keeps signals, targets and saved IDs aligned no matter how the
# three input files happen to be ordered.
signal_by_id = {
    seq_id: torch.tensor(group['signal'].to_numpy(), dtype=torch.float32)
    for seq_id, group in seqs
}
# Target columns are everything after the first column of target.csv.
target_by_id = dict(zip(target_df['sequenceID'], target_df.iloc[:, 1:].to_numpy()))

# Canonical sequence order (the order of `seqs`), restricted to IDs that also have a target.
usable_ids = [seq_id for seq_id, _ in seqs if seq_id in target_by_id]

# Create the output directory up front so a missing folder cannot waste a full training run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Iterate over the fold labels actually present rather than assuming they are 1..K.
for test_fold in np.unique(folds_df['fold']).tolist():
    # Split data into training and test sets (set membership: O(1) per lookup)
    train_id_set = set(folds_df.loc[folds_df['fold'] != test_fold, 'sequenceID'])
    test_id_set = set(folds_df.loc[folds_df['fold'] == test_fold, 'sequenceID'])
    train_ids = [seq_id for seq_id in usable_ids if seq_id in train_id_set]
    test_ids = [seq_id for seq_id in usable_ids if seq_id in test_id_set]

    if not train_ids or not test_ids:
        print(f'Skipping fold {test_fold}: no usable train or test sequences.')
        continue

    # Sequences and targets are built from the same ID lists, so index i of one
    # always corresponds to index i of the other.
    train_seqs = [signal_by_id[seq_id] for seq_id in train_ids]
    test_seqs = [signal_by_id[seq_id] for seq_id in test_ids]
    y_train = torch.tensor(np.stack([target_by_id[seq_id] for seq_id in train_ids]))
    y_test = torch.tensor(np.stack([target_by_id[seq_id] for seq_id in test_ids]))

    # Split train into subtrain and validation (80% subtrain, 20% validation)
    train_seqs, val_seqs, y_train, y_val = train_test_split(
        train_seqs, y_train, test_size=VAL_FRACTION, random_state=SEED
    )

    # Seed per fold so weight initialisation and epoch shuffling are reproducible.
    random.seed(SEED)
    torch.manual_seed(SEED)

    # Initialize the model, loss function, and optimizer
    model = RNNModel(1, HIDDEN_SIZE, NUM_LAYERS)
    criterion = SquaredHingeLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Early-stopping bookkeeping
    best_val_loss = float('inf')    # lowest validation loss seen so far
    patience_counter = 0            # epochs since the last improvement
    best_model_state = None         # snapshot of the parameters at the best epoch

    # Training loop
    for epoch in range(MAX_EPOCHS):
        # Visit the training sequences in a fresh random order each epoch.
        visit_order = list(range(len(train_seqs)))
        random.shuffle(visit_order)

        total_loss = 0.0       # summed training loss of this epoch
        nan_detected = False

        model.train()
        for i in visit_order:
            optimizer.zero_grad()

            # Forward pass on a single sequence (batch size 1)
            seq_input = train_seqs[i].unsqueeze(0).unsqueeze(-1)  # Shape: (1, seq_length, input_size)
            target = y_train[i].unsqueeze(0).unsqueeze(-1)
            loss = criterion(model(seq_input), target)

            # A NaN loss cannot be recovered from; abandon this fold's training.
            if torch.isnan(loss):
                nan_detected = True
                break

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        if nan_detected:
            print(f'Training stopped due to NaN loss at epoch {epoch}.')
            break

        # Average losses for this epoch
        average_loss = total_loss / len(train_seqs)
        average_val_loss = get_loss_value(model, val_seqs, y_val, criterion)
        average_test_loss = get_loss_value(model, test_seqs, y_test, criterion)  # logged only, never used for selection

        if epoch % LOG_EVERY == 0:
            print(f'Test fold {test_fold} \t Epoch [{epoch:3d}] \t Train Loss: {average_loss:.8f} \t Val Loss: {average_val_loss:.8f} \t Test Loss: {average_test_loss:.8f}')

        # Early stopping logic based on validation loss
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            patience_counter = 0
            # state_dict() returns references to the live parameters, which the
            # optimizer keeps updating in place; clone them to get a real snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f'Early stopping triggered after {epoch + 1} epochs.')
                break

    # Restore the parameters of the best validation epoch (if any epoch completed)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    model.eval()

    # Test the model and collect outputs
    pred_lldas = test_model(model, test_seqs)

    # Save to CSV; test_ids and pred_lldas share the same order by construction
    lldas_df = pd.DataFrame({'sequenceID': test_ids, 'llda': pred_lldas})
    lldas_df.to_csv(f'{OUTPUT_DIR}/proposed.{dataset}.{test_fold}.csv', index=False)

Test fold 1 	 Epoch [  0] 	 Train Loss: 0.49327874 	 Val Loss: 0.46162487 	 Test Loss: 0.46990087
Test fold 1 	 Epoch [  1] 	 Train Loss: 0.45597373 	 Val Loss: 0.45957914 	 Test Loss: 0.46985079
Test fold 1 	 Epoch [  2] 	 Train Loss: 0.44971927 	 Val Loss: 0.48305370 	 Test Loss: 0.52123624
Test fold 1 	 Epoch [  3] 	 Train Loss: 0.44974355 	 Val Loss: 0.46977347 	 Test Loss: 0.45234843
Test fold 1 	 Epoch [  4] 	 Train Loss: 0.44996938 	 Val Loss: 0.45818097 	 Test Loss: 0.45487897
Test fold 1 	 Epoch [  5] 	 Train Loss: 0.45067232 	 Val Loss: 0.46261112 	 Test Loss: 0.45205993
