In [1]:
import lzma
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [2]:
# --- Experiment configuration -------------------------------------------------
dataset = 'detailed'   # name of the data sub-directory interpolated into the file paths

# Model architecture (forwarded to GRUModel)
input_size = 1
hidden_size = 4
num_layers = 1

# Training schedule
patience = 50          # epochs without validation improvement before early stopping
max_epochs = 1000      # hard upper bound on the number of epochs per fold

# Reproducibility: the training cell shuffles with `random` and the model
# weights are drawn from torch's global generator, so seed every RNG once here
# to make "Restart Kernel -> Run All" produce the same numbers each time.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [3]:
# Squared hinge loss for interval-valued targets
class SquaredHingeLoss(nn.Module):
    """Penalise predictions that are not at least `margin` inside [low, high].

    `y` is sliced along dim 1: column 0 is the lower bound and column 1 the
    upper bound. The two one-sided hinge terms are added first, the sum is
    squared, and the mean over all elements is returned.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        lower, upper = y[:, :1], y[:, 1:2]
        # Positive only when the prediction is closer than `margin` to (or beyond) a bound.
        below = torch.relu(lower - predicted + self.margin)
        above = torch.relu(predicted - upper + self.margin)
        return (below + above).square().mean()

In [4]:
# GRU-based sequence regressor
class GRUModel(nn.Module):
    """Map a batch of sequences to one bounded scalar per sequence.

    A batch-first GRU reads the sequence, the hidden output at the final time
    step is fed to a linear layer with a single output unit, and the result is
    squashed with 10 * tanh so every prediction lies in [-10, 10].
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        # x: (batch_size, seq_length, input_size)
        hidden_states, _ = self.gru(x)          # (batch_size, seq_length, hidden_size)
        final_step = hidden_states[:, -1]       # (batch_size, hidden_size)
        raw_score = self.fc(final_step)         # (batch_size, 1)
        return torch.tanh(raw_score).mul(10)    # bounded to [-10, 10]

In [5]:
# Function to test the model
def test_model(model, inputs):
    model.eval()
    predictions = []

    with torch.no_grad():
        for seq_input in inputs:
            seq_input = seq_input.unsqueeze(0).unsqueeze(-1)
            output_seq = model(seq_input)
            predictions.append(output_seq.item())

    return predictions

In [6]:
# Average per-sequence loss of the model over a set of sequences
def get_loss_value(model, test_seqs, y_test, criterion):
    """Return the mean of `criterion` evaluated one sequence at a time.

    Gradients are disabled. For sequence i the input gets a leading batch axis
    and a trailing feature axis, and `y_test[i]` gets a leading and a trailing
    axis, before both are handed to the model / criterion. The per-sequence
    losses are summed and divided by `len(test_seqs)`.
    """
    per_sequence_losses = []
    with torch.no_grad():
        for idx, sequence in enumerate(test_seqs):
            prediction = model(sequence.unsqueeze(0).unsqueeze(-1))
            interval = y_test[idx].unsqueeze(0).unsqueeze(-1)
            per_sequence_losses.append(criterion(prediction, interval).item())

    return sum(per_sequence_losses) / len(test_seqs)

In [7]:
# Read the xz-compressed signal profiles of the selected dataset
file_path = f'../../sequence_data/{dataset}/profiles.csv.xz'
with lzma.open(file_path, mode='rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the groupby once as (sequenceID, sub-frame) pairs so that later
# cells can iterate over the sequences as often as they need.
seqs = tuple((seq_id, frame) for seq_id, frame in signal_df.groupby(by='sequenceID'))

In [None]:
# Load the fold assignment and the target intervals of the selected dataset
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Index the targets by sequenceID so that every target row is fetched by ID and
# therefore stays paired with its sequence whatever the row order of target.csv.
# NOTE(review): this keeps every column except 'sequenceID'; the previous code
# dropped column 0 positionally, which is the same thing only if 'sequenceID'
# is the first column of target.csv -- confirm.
targets_by_id = target_df.set_index('sequenceID')

# Convert every signal to a tensor once instead of once per fold. The dict keeps
# the iteration order of `seqs`, which defines the canonical sequence order used
# for inputs, targets and prediction IDs below.
signal_by_id = {
    seq_id: torch.tensor(frame['signal'].to_numpy(), dtype=torch.float32)
    for seq_id, frame in seqs
}

# Create the output directory up front so a missing folder cannot make
# `to_csv` fail only after a whole fold has been trained.
os.makedirs('predictions', exist_ok=True)

# NOTE(review): fold 1 is not in this list -- confirm that is intentional.
for test_fold in [2,3,4,5,6]:
    # Split the sequence IDs into training and test sets (sets give O(1) lookups)
    is_test_row = folds_df['fold'] == test_fold
    train_id_set = set(folds_df.loc[~is_test_row, 'sequenceID'])
    test_id_set = set(folds_df.loc[is_test_row, 'sequenceID'])

    # IDs in canonical (`seqs`) order; everything below is derived from these lists
    train_ids = [seq_id for seq_id in signal_by_id if seq_id in train_id_set]
    test_ids = [seq_id for seq_id in signal_by_id if seq_id in test_id_set]

    # train / test sequences
    train_seqs = [signal_by_id[seq_id] for seq_id in train_ids]
    test_seqs = [signal_by_id[seq_id] for seq_id in test_ids]

    # targets, looked up by ID in exactly the same order as the sequences
    # (`.loc` raises KeyError if a sequence has no target row)
    y_train = torch.tensor(targets_by_id.loc[train_ids].to_numpy())
    y_test = torch.tensor(targets_by_id.loc[test_ids].to_numpy())

    # Split into subtrain and validation sets (80% subtrain, 20% validation)
    train_seqs, val_seqs, y_train, y_val = train_test_split(train_seqs, y_train, test_size=0.2, random_state=42)

    # Initialize the model, loss function, and optimizer
    model = GRUModel(input_size, hidden_size, num_layers)
    criterion = SquaredHingeLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Bookkeeping for early stopping / best-model selection
    best_train_loss = float('inf')    # training loss at the best validation epoch
    best_val_loss = float('inf')      # lowest validation loss seen so far
    best_test_loss = float('inf')     # test loss at the best validation epoch
    patience_counter = 0              # epochs since the last validation improvement
    best_model_state = None           # snapshot of the parameters at the best epoch

    # Training loop
    for epoch in range(max_epochs):
        # Visit the subtrain examples in a fresh random order every epoch;
        # indexing by a shuffled permutation keeps each sequence with its target.
        order = list(range(len(train_seqs)))
        random.shuffle(order)

        total_train_loss = 0
        nan_flag = False  # set when a NaN loss aborts training

        # Train on subtrain data, one sequence per optimisation step
        model.train()
        for step, idx in enumerate(order):
            seq_input = train_seqs[idx].unsqueeze(0).unsqueeze(-1)  # Shape: (1, seq_length, input_size)
            target = y_train[idx].unsqueeze(0).unsqueeze(-1)

            optimizer.zero_grad()
            output_seq = model(seq_input)
            loss = criterion(output_seq, target)

            # Abort before the backward pass so the parameters are not updated from a NaN loss
            if torch.isnan(loss).any():
                print(f"NaN loss detected at Epoch [{epoch}], Step [{step}]")
                nan_flag = True
                break

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        if nan_flag:
            break  # Stop training if NaN was encountered

        # Calculate average training loss
        avg_train_loss = total_train_loss / len(train_seqs)

        # Calculate validation and test losses
        model.eval()  # Set model to evaluation mode
        avg_val_loss = get_loss_value(model, val_seqs, y_val, criterion)
        avg_test_loss = get_loss_value(model, test_seqs, y_test, criterion)

        if epoch % 10 == 0:
            print(f'Test fold {test_fold} \t Epoch [{epoch:3d}] \t Avg Train Loss: {avg_train_loss:.8f} \t Avg Val Loss: {avg_val_loss:.8f} \t Avg Test Loss: {avg_test_loss:.8f}')

        # Early stopping logic based on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_train_loss = avg_train_loss
            best_test_loss = avg_test_loss
            patience_counter = 0

            # `state_dict()` hands back references to the live parameter tensors,
            # which the optimizer keeps updating in place. Clone them so the
            # snapshot really holds the weights of *this* epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1

        # Stop training if patience is exceeded
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch + 1} epochs.')
            break

    # Restore the parameters of the best validation epoch before predicting
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model.eval()  # Set the model to evaluation mode

    # Print the best train, validation, and corresponding test losses
    print(f'Best Train Loss: {best_train_loss:.8f}')
    print(f'Best Validation Loss: {best_val_loss:.8f}')
    print(f'Test Loss associated with Best Validation Loss: {best_test_loss:.8f}')

    # Test the model and collect outputs
    pred_lldas = test_model(model, test_seqs)

    # Save to CSV; `test_ids` is in the same order as `test_seqs`, so every
    # prediction is written next to the ID of the sequence that produced it.
    lldas_df = pd.DataFrame(list(zip(test_ids, pred_lldas)), columns=['sequenceID', 'llda'])
    lldas_df.to_csv(f'predictions/gru.{dataset}.{input_size}.{hidden_size}.{num_layers}.{patience}.{test_fold}.csv', index=False)

Test fold 2 	 Epoch [  0] 	 Avg Train Loss: 0.47208090 	 Avg Val Loss: 0.36157262 	 Avg Test Loss: 0.53990409
Test fold 2 	 Epoch [ 10] 	 Avg Train Loss: 0.34616268 	 Avg Val Loss: 0.29676396 	 Avg Test Loss: 0.35606582
Test fold 2 	 Epoch [ 20] 	 Avg Train Loss: 0.28579654 	 Avg Val Loss: 0.27218027 	 Avg Test Loss: 0.37103895
Test fold 2 	 Epoch [ 30] 	 Avg Train Loss: 0.27276126 	 Avg Val Loss: 0.25489637 	 Avg Test Loss: 0.26708794
Test fold 2 	 Epoch [ 40] 	 Avg Train Loss: 0.26106584 	 Avg Val Loss: 0.23030158 	 Avg Test Loss: 0.26486010
Test fold 2 	 Epoch [ 50] 	 Avg Train Loss: 0.25655184 	 Avg Val Loss: 0.23941812 	 Avg Test Loss: 0.26095747
Test fold 2 	 Epoch [ 60] 	 Avg Train Loss: 0.25374391 	 Avg Val Loss: 0.24799877 	 Avg Test Loss: 0.25933223
Test fold 2 	 Epoch [ 70] 	 Avg Train Loss: 0.21546298 	 Avg Val Loss: 0.17580137 	 Avg Test Loss: 0.20554199
Test fold 2 	 Epoch [ 80] 	 Avg Train Loss: 0.20073817 	 Avg Val Loss: 0.18560189 	 Avg Test Loss: 0.22614244
Test fold 