In [8]:
import copy
import lzma
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [9]:
# Name of the dataset sub-directory; it is interpolated into the input paths
# (sequence_data/ and training_data/) and into the prediction file names below.
dataset = 'detailed'

In [10]:
# Squared hinge loss for interval-valued targets
class SquaredHingeLoss(nn.Module):
    """Squared hinge penalty for predictions measured against a (low, high) interval.

    A prediction incurs no loss while it lies at least ``margin`` inside the
    interval; otherwise the violation on either side is summed and squared.
    """

    def __init__(self, margin=1):
        super().__init__()
        self.margin = margin

    def forward(self, predicted, y):
        """Return the mean squared hinge violation as a scalar tensor.

        Args:
            predicted: Tensor of predictions; it is broadcast against the
                bound columns sliced from ``y``.
            y: Tensor whose dimension 1 holds the lower bound at index 0 and
                the upper bound at index 1.

        Returns:
            The mean over all elements of
            ``(relu(low - predicted + margin) + relu(predicted - high + margin)) ** 2``.
        """
        lower_bound = y[:, :1]
        upper_bound = y[:, 1:2]

        # Amount by which the prediction undershoots / overshoots the interval
        # once the safety margin is taken into account (zero when satisfied).
        undershoot = torch.relu(lower_bound - predicted + self.margin)
        overshoot = torch.relu(predicted - upper_bound + self.margin)

        return (undershoot + overshoot).square().mean()

In [11]:
# LSTM regressor: one scalar output per input sequence
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step."""

    def __init__(self, input_size, hidden_size, num_layers):
        """Build the recurrent encoder and the scalar output head.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)  # Output layer

    def forward(self, x):
        """Map ``x`` of shape (batch_size, seq_length, input_size) to (batch_size, 1)."""
        hidden_per_step, _ = self.lstm(x)           # (batch_size, seq_length, hidden_size)
        final_hidden = hidden_per_step[:, -1]       # (batch_size, hidden_size): last time step only
        return self.fc(final_hidden)                # (batch_size, 1)

In [12]:
# Function to test the model
def test_model(model, inputs):
    model.eval()
    predictions = []

    with torch.no_grad():
        for seq_input in inputs:
            seq_input = seq_input.unsqueeze(0).unsqueeze(-1)
            output_seq = model(seq_input)
            predictions.append(output_seq.item())

    return predictions

In [13]:
# Location of the xz-compressed signal profiles for the selected dataset
file_path = f'../../sequence_data/{dataset}/profiles.csv.xz'

# Decompress on the fly in text mode and let pandas parse the CSV stream
with lzma.open(file_path, mode='rt') as file:
    signal_df = pd.read_csv(file)

# One (sequenceID, rows-of-that-sequence) pair per sequence, materialised once
# so that later cells can iterate over the sequences repeatedly
seqs = tuple(signal_df.groupby(by='sequenceID'))

In [None]:
# Cross-validated training: for every fold, train an LSTM on the remaining
# folds, restore the best weights seen during training, predict the held-out
# sequences and write the predictions to predictions/proposed.<dataset>.<fold>.csv

# Load data
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

# Training configuration (kept in one place so the loop below has no magic numbers)
seed = 0            # Fixes weight initialisation and shuffling so a re-run reproduces the predictions
hidden_size = 16    # Width of the LSTM hidden state
num_layers = 2      # Number of stacked LSTM layers
max_epochs = 1000   # Upper bound on the number of epochs per fold
patience = 50       # Number of epochs to wait for improvement before stopping
log_every = 20      # Print the running loss every this many epochs

random.seed(seed)
torch.manual_seed(seed)

# to_csv does not create missing directories; make sure the output location
# exists before spending time on training.
os.makedirs('predictions', exist_ok=True)

# Targets are looked up by sequenceID so that every training sequence is paired
# with its own row, whatever the row order of target.csv is.
# `target_columns` are the same columns the positional slice iloc[:, 1:] selects.
target_columns = target_df.columns[1:]
targets_by_id = target_df.set_index('sequenceID', drop=False)

# Iterate over the fold labels actually present instead of assuming 1..K
for test_fold in sorted(folds_df['fold'].unique()):
    # Split data into training and test sets
    is_test_fold = folds_df['fold'] == test_fold
    train_ids = folds_df.loc[~is_test_fold, 'sequenceID']
    test_ids = folds_df.loc[is_test_fold, 'sequenceID']

    # Sets give O(1) membership tests while scanning all sequences
    train_id_set = set(train_ids)
    test_id_set = set(test_ids)

    # Keep each sequence together with its ID so that targets and predictions
    # can be matched by ID rather than by position.
    train_pairs = [(seq_id, seq_df) for seq_id, seq_df in seqs if seq_id in train_id_set]
    test_pairs = [(seq_id, seq_df) for seq_id, seq_df in seqs if seq_id in test_id_set]

    # train sequences
    train_seq_ids = [seq_id for seq_id, _ in train_pairs]
    train_seqs = [torch.tensor(seq_df['signal'].to_numpy(), dtype=torch.float32) for _, seq_df in train_pairs]

    # test sequences
    test_seq_ids = [seq_id for seq_id, _ in test_pairs]
    test_seqs = [torch.tensor(seq_df['signal'].to_numpy(), dtype=torch.float32) for _, seq_df in test_pairs]

    if not train_seqs:
        raise ValueError(f'No training sequences available for test fold {test_fold}')

    # train target, one row per entry of train_seqs and in the same order.
    # .loc raises KeyError if a training sequence has no row in target.csv,
    # which is preferable to silently training on mismatched pairs.
    target_df_train = targets_by_id.loc[train_seq_ids, target_columns]
    y_train = torch.tensor(target_df_train.to_numpy(), dtype=torch.float32)
    assert len(y_train) == len(train_seqs), 'expected exactly one target row per training sequence'

    # Initialize the model, loss function, and optimizer
    model = LSTMModel(1, hidden_size, num_layers)
    criterion = SquaredHingeLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Initialize variables for best loss and best model
    best_loss = float('inf')    # Set best_loss to infinity initially
    patience_counter = 0        # Counter for early stopping
    best_model_state = None     # Snapshot of the best model parameters

    # Training loop
    # NOTE(review): early stopping monitors the training loss, not a held-out
    # validation loss -- confirm this is intended.
    for epoch in range(max_epochs):
        # Visit the training sequences in a fresh random order every epoch;
        # shuffling indices keeps train_seqs and y_train themselves untouched.
        order = list(range(len(train_seqs)))
        random.shuffle(order)

        total_loss = 0.0
        for i in order:
            # Prepare input and target for the current sequence (batch size 1)
            seq_input = train_seqs[i].unsqueeze(0).unsqueeze(-1)    # Shape: (1, seq_length, input_size)
            target = y_train[i].unsqueeze(0)                        # Shape: (1, n_target_columns)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            output_seq = model(seq_input)           # Shape: (1, 1)
            loss = criterion(output_seq, target)    # Calculate loss

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Calculate average loss for this epoch
        average_loss = total_loss / len(train_seqs)
        if epoch % log_every == 0:
            print(f'Test fold {test_fold}, Epoch [{epoch}], Average Loss: {average_loss:.8f}')

        # Early stopping logic
        if average_loss < best_loss:
            best_loss = average_loss  # Update best loss
            patience_counter = 0      # Reset patience counter

            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite in place; deep-copy them so
            # the snapshot really holds the best parameters.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1  # Increment patience counter

        # Stop training if patience is exceeded
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch + 1} epochs.')
            break

    # Restore the best parameters seen during training
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model.eval()  # Set the model to evaluation mode

    # Test the model and collect outputs
    pred_lldas = test_model(model, test_seqs)

    # Save to CSV; the IDs come from the same list the predictions were
    # computed from, so every row pairs a prediction with its own sequence.
    lldas_df = pd.DataFrame({'sequenceID': test_seq_ids, 'llda': pred_lldas})
    lldas_df.to_csv(f'predictions/proposed.{dataset}.{test_fold}.csv', index=False)