In [1]:
import pandas as pd
import lzma
import torch
import torch.nn as nn
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
# Experiment selection: dataset folder name and the fold held out for testing.
dataset = 'detailed'
test_fold = 1

# LSTM hyperparameters.
# input_size must equal the number of features per time step. This notebook
# feeds each 1-D 'signal' sequence to the model as (1, seq_length, 1) via
# unsqueeze(0).unsqueeze(-1), i.e. exactly one feature per step; nn.LSTM raises
# a RuntimeError when the last input dimension differs from input_size.
# NOTE(review): input_size is also embedded in the predictions file name.
input_size = 1
hidden_size = 8
num_layers = 2

In [3]:
# Squared hinge loss
class SquaredHingeLoss(nn.Module):
    """Squared hinge penalty for predictions that leave a target interval.

    Each row of ``y`` holds a lower bound in column 0 and an upper bound in
    column 1. A prediction is penalised by how far it falls short of
    ``lower + margin`` plus how far it exceeds ``upper - margin``; the summed
    violation is squared and averaged over all elements.

    Args:
        margin: Distance the prediction must keep from either bound before
            the penalty drops to zero (default 1).
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean squared hinge violation of ``predicted`` w.r.t. ``y``."""
        lower_bound = y[:, 0:1]
        upper_bound = y[:, 1:2]

        # relu keeps only the amount by which each side of the interval is violated.
        below = torch.relu(lower_bound - predicted + self.margin)
        above = torch.relu(predicted - upper_bound + self.margin)

        return (below + above).square().mean()

In [4]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head producing one bounded scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Linear head mapping the final hidden state to a single value.
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_length, input_size) to (batch, 1).

        Only the LSTM output at the last time step is used. The result is
        squashed with tanh and scaled by 7, so it lies in [-7, 7].
        """
        hidden_states = self.lstm(x)[0]        # (batch, seq_length, hidden_size)
        final_step = hidden_states[:, -1, :]   # (batch, hidden_size)
        return torch.tanh(self.fc(final_step)) * 7

In [5]:
# Function to test the model
def test_model(model, inputs):
    model.eval()
    predictions = []

    with torch.no_grad():
        for seq_input in inputs:
            seq_input = seq_input.unsqueeze(0).unsqueeze(-1)
            output_seq = model(seq_input)
            predictions.append(output_seq.item())

    return predictions

In [6]:
# Function to compute loss value
def get_loss_value(model, test_seqs, y_test, criterion):
    """Return the mean per-sequence loss of ``model`` over ``test_seqs``.

    Each sequence is reshaped to (1, seq_length, 1) and scored against the
    matching row of ``y_test`` (reshaped with a leading and a trailing unit
    dimension). Gradients are disabled. The model's train/eval mode is not
    changed here; the caller sets it.

    Args:
        model: Module called on one reshaped sequence at a time.
        test_seqs: Sized iterable of 1-D tensors.
        y_test: Indexable targets; ``y_test[i]`` belongs to ``test_seqs[i]``.
        criterion: Callable ``(output, target) -> scalar tensor``.

    Returns:
        Sum of the per-sequence losses divided by ``len(test_seqs)``.
    """
    with torch.no_grad():
        per_sequence = [
            criterion(
                model(sequence.unsqueeze(0).unsqueeze(-1)),
                y_test[idx].unsqueeze(0).unsqueeze(-1),
            ).item()
            for idx, sequence in enumerate(test_seqs)
        ]

    return sum(per_sequence) / len(test_seqs)

In [7]:
# Read the xz-compressed profiles table for the selected dataset.
file_path = f'../../sequence_data/{dataset}/profiles.csv.xz'
with lzma.open(file_path, 'rt') as file:
    signal_df = pd.read_csv(file)

# Materialise the groups as (sequenceID, sub-frame) pairs so later cells can
# iterate over them repeatedly.
grouped_by_sequence = signal_df.groupby('sequenceID')
seqs = tuple(grouped_by_sequence)

In [8]:
# Train the LSTM on every fold except `test_fold`, early-stop on a held-out
# validation split, restore the best weights, and write test-fold predictions.

# Load fold assignments and interval targets
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Seed the RNGs used below (epoch shuffling, weight initialisation) so that a
# fresh Restart & Run All reproduces the same run.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# Split sequence IDs into training and test sets
train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

# Convert every profile to a tensor once and index it by sequenceID. Looking
# sequences up by ID (instead of filtering `seqs` in group order) keeps each
# signal paired with its own target row regardless of how the CSVs are sorted,
# and avoids rebuilding an ID list for every membership test.
signal_by_id = {
    seq_id: torch.tensor(group['signal'].to_numpy(), dtype=torch.float32)
    for seq_id, group in seqs
}

# Targets: every column after the first (sequenceID) of the matching rows
target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]
target_df_test = target_df[target_df['sequenceID'].isin(test_ids)]
y_train = torch.tensor(target_df_train.iloc[:, 1:].to_numpy())
y_test = torch.tensor(target_df_test.iloc[:, 1:].to_numpy())

# Sequences, in exactly the same row order as their targets. A sequenceID that
# has a target but no profile raises KeyError here instead of silently
# shifting the pairing.
train_seqs = [signal_by_id[seq_id] for seq_id in target_df_train['sequenceID']]
test_seq_ids = target_df_test['sequenceID'].tolist()
test_seqs = [signal_by_id[seq_id] for seq_id in test_seq_ids]

# Split into subtrain and validation sets (80% subtrain, 20% validation)
train_seqs, val_seqs, y_train, y_val = train_test_split(train_seqs, y_train, test_size=0.2, random_state=seed)

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, num_layers)
criterion = SquaredHingeLoss()
optimizer = torch.optim.Adam(model.parameters())

# Early-stopping bookkeeping
best_val_loss = float('inf')    # Best validation loss seen so far
patience_counter = 0            # Epochs since the last improvement
patience = 50                   # Epochs to wait for an improvement before stopping
best_model_state = None         # Snapshot (copy) of the best parameters

# Training loop
for epoch in range(1000):
    # Visit the subtrain sequences in a fresh random order each epoch
    order = list(range(len(train_seqs)))
    random.shuffle(order)

    total_train_loss = 0
    nan_flag = False  # Set when a NaN loss aborts training

    # Train on subtrain data, one sequence per optimizer step
    model.train()
    for step, idx in enumerate(order):
        target = y_train[idx].unsqueeze(0)

        optimizer.zero_grad()

        # Forward pass on a (1, seq_length, 1) input
        seq_input = train_seqs[idx].unsqueeze(0).unsqueeze(-1)
        output_seq = model(seq_input)
        loss = criterion(output_seq, target.unsqueeze(-1))

        # Abort on NaN loss, keeping the best snapshot taken so far (if any)
        if torch.isnan(loss).any():
            print(f"NaN loss detected at Epoch [{epoch}], Step [{step}]. Saving model before NaN occurred.")
            if best_model_state is not None:
                torch.save(best_model_state, 'model_before_nan.pth')
            nan_flag = True
            break

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    if nan_flag:
        break  # Stop training if NaN was encountered

    # Evaluate on the validation split
    model.eval()
    avg_train_loss = total_train_loss / len(train_seqs)
    avg_val_loss = get_loss_value(model, val_seqs, y_val, criterion)
    if epoch % 10 == 0:
        # Test loss is only reported, never used for model selection, so it is
        # computed only on the epochs that print it.
        avg_test_loss = get_loss_value(model, test_seqs, y_test, criterion)
        print(f'Test fold {test_fold} \t Epoch [{epoch:3d}] \t Avg Train Loss: {avg_train_loss:.8f} \t Avg Val Loss: {avg_val_loss:.8f} \t Avg Test Loss: {avg_test_loss:.8f}')

    # Early stopping logic based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0

        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # really holds the weights of the best epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        patience_counter += 1

    # Stop training if patience is exceeded
    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs.')
        break

# Restore the best parameters found during training
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    model.eval()

# Predict on the test fold
pred_lldas = test_model(model, test_seqs)

# Save to CSV, pairing each prediction with the ID of the sequence it came from
lldas_df = pd.DataFrame({'sequenceID': test_seq_ids, 'llda': pred_lldas})
lldas_df.to_csv(f'predictions/proposed.{dataset}.{input_size}.{hidden_size}.{num_layers}.{patience}.{test_fold}.csv', index=False)

Test fold 1 	 Epoch [  0] 	 Avg Train Loss: 0.45618700 	 Avg Val Loss: 0.45895361 	 Avg Test Loss: 0.46192850
Test fold 1 	 Epoch [ 10] 	 Avg Train Loss: 0.43973569 	 Avg Val Loss: 0.45077602 	 Avg Test Loss: 0.46565868
Test fold 1 	 Epoch [ 20] 	 Avg Train Loss: 0.34781923 	 Avg Val Loss: 0.37272225 	 Avg Test Loss: 0.30911975
Test fold 1 	 Epoch [ 30] 	 Avg Train Loss: 0.30251557 	 Avg Val Loss: 0.34693116 	 Avg Test Loss: 0.31927545
Test fold 1 	 Epoch [ 40] 	 Avg Train Loss: 0.27155423 	 Avg Val Loss: 0.33058497 	 Avg Test Loss: 0.31131509
Test fold 1 	 Epoch [ 50] 	 Avg Train Loss: 0.26768925 	 Avg Val Loss: 0.39693023 	 Avg Test Loss: 0.35160881
Test fold 1 	 Epoch [ 60] 	 Avg Train Loss: 0.25422475 	 Avg Val Loss: 0.43541726 	 Avg Test Loss: 0.38995159
Test fold 1 	 Epoch [ 70] 	 Avg Train Loss: 0.23759217 	 Avg Val Loss: 0.43385056 	 Avg Test Loss: 0.39511338
Test fold 1 	 Epoch [ 80] 	 Avg Train Loss: 0.23073565 	 Avg Val Loss: 0.42392989 	 Avg Test Loss: 0.37668638
Early stop