In [1]:
import pandas as pd
import lzma
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
file_path = '../sequence_data/detailed/profiles.csv.xz'
with lzma.open(file_path, 'rt') as file:
    signal_df = pd.read_csv(file)

seqs = tuple(signal_df.groupby('sequenceID'))[0:1]

In [3]:
sequences = [torch.tensor(seq[1]['signal'].to_numpy(), dtype=torch.float32) for seq in seqs]

In [4]:
# Create a Dataset class
class SequenceDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx]

# Create a DataLoader
dataset = SequenceDataset(sequences)
data_loader = DataLoader(dataset, batch_size=2, collate_fn=lambda x: pad_sequence(x, batch_first=True))

In [5]:
# Define Encoder
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        _, (h_n, _) = self.rnn(x)
        return h_n[-1]  # Take the last hidden state

# Define Decoder
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, seq_length):
        # Repeat the context vector for each time step in the output
        x = x.unsqueeze(1).repeat(1, seq_length, 1)  # Shape: (batch_size, seq_length, hidden_size)
        output, _ = self.rnn(x)
        return self.fc(output)

# Define the full model
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_size, hidden_size)
        self.decoder = Decoder(hidden_size, output_size)

    def forward(self, x):
        seq_length = x.size(1)  # Get the length of the input sequence
        hidden = self.encoder(x)
        output = self.decoder(hidden, seq_length)
        return output

In [6]:
# Hyperparameters
input_size = 1   # Since your signal is one-dimensional
hidden_size = 2048  # The size of the vector for each sequence
output_size = 1  # Output size should match the input size

# Initialize the model, loss function, and optimizer
model = Seq2Seq(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Early stopping parameters
patience = 20  # Number of epochs to wait for improvement
best_loss = float('inf')
patience_counter = 0

# Training loop
num_epochs = 10000
for epoch in range(num_epochs):
    for batch in data_loader:
        batch = batch.unsqueeze(-1)  # Add input dimension
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch)  # Compute loss
        loss.backward()
        optimizer.step()
    
    # Check for early stopping
    if loss.item() < best_loss:
        best_loss = loss.item()
        patience_counter = 0  # Reset patience counter
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.8f} - Saving model...')
        # You can save the model here using `torch.save(model.state_dict(), 'model.pth')`
    else:
        patience_counter += 1
    
    if patience_counter >= patience:
        print(f'Early stopping at epoch {epoch + 1} with loss: {best_loss:.8f}')
        break

Epoch [1/10000], Loss: 0.00949133 - Saving model...
Epoch [16/10000], Loss: 0.00947811 - Saving model...
Early stopping at epoch 36 with loss: 0.00947811


In [7]:
# Testing the trained model
def test_model(model, test_sequence):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        # Prepare the input sequence (add batch dimension)
        input_seq = test_sequence.unsqueeze(0).unsqueeze(-1)  # Shape: (1, seq_length, 1)
        # Get the output from the model
        output_seq = model(input_seq)
    return output_seq

# Example test sequence
test_sequence = sequences[0]

# Run the test
output_sequence = test_model(model, test_sequence)

# Print results
print("Input sequence: ", test_sequence.numpy()[:5])
print("Output sequence:", output_sequence.squeeze(0).numpy()[:,0][:5])

Input sequence:  [-0.02473668 -0.06642736  0.09491165  0.0071955   0.07587487]
Output sequence: [-0.01005963  0.01573802  0.01126526  0.0032569   0.00902159]
