In [1]:
import pandas as pd
import lzma
import torch
import torch.nn as nn
import numpy as np
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [2]:
dataset = 'detailed'
test_fold = 1

nonlinearity='relu'
hidden_size = 8
num_layers = 2

max_epoch = 1000
patience = 50

In [3]:
# Hinged Square Loss
class SquaredHingeLoss(nn.Module):
    def __init__(self, margin=1):
        super(SquaredHingeLoss, self).__init__()
        self.margin = margin

    def forward(self, predicted, y):
        low, high = y[:, 0:1], y[:, 1:2]
        loss_low = torch.relu(low - predicted + self.margin)
        loss_high = torch.relu(predicted - high + self.margin)
        loss = loss_low + loss_high
        return torch.mean(torch.square(loss))

In [4]:
# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, nonlinearity=nonlinearity)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, lengths):
        # x is PackedSequence
        packed_output, hidden = self.rnn(x)
        # Get the last hidden state from the last layer
        last_hidden = hidden[-1]  # Shape: (batch_size, hidden_size)
        x = self.fc(last_hidden)
        x = torch.relu(x + 10) - torch.relu(x - 10) - 10
        return x

In [5]:
# Custom Dataset
class SequenceDataset(torch.utils.data.Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

In [6]:
# Collate function for DataLoader
def collate_fn(batch):
    sequences, targets = zip(*batch)
    # Get lengths of each sequence
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    # Pad sequences to the same length
    sequences_padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    # Stack targets
    targets = torch.stack(targets)
    return sequences_padded.unsqueeze(-1), targets, lengths

In [7]:
# Function to compute loss value
def get_loss_value(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    total_samples = 0
    with torch.no_grad():
        for seq_inputs, targets, lengths in data_loader:
            seq_inputs_packed = torch.nn.utils.rnn.pack_padded_sequence(seq_inputs, lengths, batch_first=True, enforce_sorted=False)
            outputs = model(seq_inputs_packed, lengths)
            loss = criterion(outputs, targets.unsqueeze(-1))
            batch_size = seq_inputs.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size
    average_loss = total_loss / total_samples
    return average_loss

In [8]:
# Function to test the model
def test_model(model, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for seq_inputs, _, lengths in data_loader:
            seq_inputs_packed = torch.nn.utils.rnn.pack_padded_sequence(seq_inputs, lengths, batch_first=True, enforce_sorted=False)
            outputs = model(seq_inputs_packed, lengths)
            predictions.extend(outputs.squeeze(-1).tolist())
    return predictions

In [9]:
# Load data
file_path = f'../../sequence_data/{dataset}/profiles.csv.xz'
with lzma.open(file_path, 'rt') as file:
    signal_df = pd.read_csv(file)

seqs = tuple(signal_df.groupby('sequenceID'))

# Load data
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Split data into training and test sets
train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

# Train sequences
train_seqs = [torch.tensor(seq[1]['signal'].to_numpy(), dtype=torch.float32) for seq in seqs if seq[0] in list(train_ids)]

# Test sequences
test_seqs = [torch.tensor(seq[1]['signal'].to_numpy(), dtype=torch.float32) for seq in seqs if seq[0] in list(test_ids)]

# Targets
target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]
target_df_test = target_df[target_df['sequenceID'].isin(test_ids)]
y_train = torch.tensor(target_df_train.iloc[:, 1:].to_numpy(), dtype=torch.float32)
y_test = torch.tensor(target_df_test.iloc[:, 1:].to_numpy(), dtype=torch.float32)

# Split train into subtrain and validation (80% subtrain, 20% validation)
train_seqs, val_seqs, y_train, y_val = train_test_split(train_seqs, y_train, test_size=0.2, random_state=42)

# Create datasets and dataloaders
batch_size = 64

train_dataset = SequenceDataset(train_seqs, y_train)
val_dataset = SequenceDataset(val_seqs, y_val)
test_dataset = SequenceDataset(test_seqs, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [10]:
# Initialize the model, loss function, and optimizer
model = RNNModel(1, hidden_size, num_layers)
criterion = SquaredHingeLoss()
optimizer = torch.optim.Adam(model.parameters())

# Initialize variables for best validation loss and best model
best_val_loss = float('inf')    # Set best validation loss to infinity initially
patience_counter = 0            # Counter for early stopping
best_model_state = None         # Variable to store the best model parameters

# Training loop
for epoch in range(max_epoch):
    model.train()
    total_loss = 0
    total_samples = 0

    for seq_inputs, targets, lengths in train_loader:
        optimizer.zero_grad()
        seq_inputs_packed = torch.nn.utils.rnn.pack_padded_sequence(seq_inputs, lengths, batch_first=True, enforce_sorted=False)
        outputs = model(seq_inputs_packed, lengths)
        loss = criterion(outputs, targets.unsqueeze(-1))

        # Stop training if loss is NaN
        if torch.isnan(loss):
            print(f'Stopping training at epoch {epoch} due to NaN loss.')
            break

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        batch_size = seq_inputs.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    # Calculate average loss for this epoch (training)
    average_loss = total_loss / total_samples

    # Validation step
    average_val_loss  = get_loss_value(model, val_loader, criterion)
    average_test_loss = get_loss_value(model, test_loader, criterion)

    if epoch % 1 == 0:
        print(f'Test fold {test_fold} \t Epoch [{epoch:3d}] \t Train Loss: {average_loss:.8f} \t Val Loss: {average_val_loss:.8f} \t Test Loss: {average_test_loss:.8f}')

    # Early stopping logic based on validation loss
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss  # Update best validation loss
        patience_counter = 0  # Reset patience counter

        # Save the best model parameters in memory
        best_model_state = model.state_dict()  # Store the model parameters
    else:
        patience_counter += 1  # Increment patience counter

    # Stop training if patience is exceeded or NaN is detected
    if patience_counter >= patience or torch.isnan(loss):
        if torch.isnan(loss):
            print(f'Training stopped due to NaN loss at epoch {epoch}.')
        else:
            print(f'Early stopping triggered after {epoch + 1} epochs.')
        break

# After training, you can restore the best model parameters if needed
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    model.eval()  # Set the model to evaluation mode

# Test the model and collect outputs
pred_lldas = test_model(model, test_loader)

# Save to CSV
lldas_df = pd.DataFrame(list(zip(test_ids, pred_lldas)), columns=['sequenceID', 'llda'])
lldas_df.to_csv(f'predictions/proposed.{dataset}.{hidden_size}.{num_layers}.{nonlinearity}.{patience}.{test_fold}.csv', index=False)

Test fold 1 	 Epoch [  0] 	 Train Loss: 0.49186422 	 Val Loss: 0.49797019 	 Test Loss: 0.47026852
Test fold 1 	 Epoch [  1] 	 Train Loss: 0.46137233 	 Val Loss: 0.47267466 	 Test Loss: 0.45940787
Test fold 1 	 Epoch [  2] 	 Train Loss: 0.44998411 	 Val Loss: 0.46306698 	 Test Loss: 0.46127726
Test fold 1 	 Epoch [  3] 	 Train Loss: 0.44781604 	 Val Loss: 0.46370550 	 Test Loss: 0.46049200
Test fold 1 	 Epoch [  4] 	 Train Loss: 0.44895272 	 Val Loss: 0.46373084 	 Test Loss: 0.46036127
Test fold 1 	 Epoch [  5] 	 Train Loss: 0.44869851 	 Val Loss: 0.46319897 	 Test Loss: 0.46083957
Test fold 1 	 Epoch [  6] 	 Train Loss: 0.44789864 	 Val Loss: 0.46275606 	 Test Loss: 0.46141661
Test fold 1 	 Epoch [  7] 	 Train Loss: 0.44825847 	 Val Loss: 0.46402535 	 Test Loss: 0.46001744
Test fold 1 	 Epoch [  8] 	 Train Loss: 0.44774062 	 Val Loss: 0.46303700 	 Test Loss: 0.46092446
Test fold 1 	 Epoch [  9] 	 Train Loss: 0.44784771 	 Val Loss: 0.46285608 	 Test Loss: 0.46111336
Test fold 1 	 Epoch 