In [13]:
import torch
import torch.nn as nn

class TimeSeriesTransformer(nn.Module):
    def __init__(self, num_features, d_model, nhead, num_encoder_layers, dim_feedforward, dropout, n_classes):
        super(TimeSeriesTransformer, self).__init__()

        # Input projection from num_features to d_model
        self.input_projection = nn.Linear(num_features, d_model)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.zeros(1, 187, d_model))  # Assuming max seq_len of 1000

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Output MLP for classification
        self.fc = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        # Input: (batch_size, seq_len, num_features)

        batch_size, seq_len, _ = x.size()

        # Input projection (embedding)
        x = self.input_projection(x)  # (batch_size, seq_len, d_model)

        # Add positional encoding
        x = x + self.positional_encoding[:, :seq_len, :]

        # Transpose for Transformer (needed by PyTorch's nn.Transformer)
        x = x.transpose(0, 1)  # (seq_len, batch_size, d_model)

        # Transformer Encoder
        x = self.transformer_encoder(x)  # (seq_len, batch_size, d_model)

        # Pooling: Take the mean over the sequence dimension
        x = x.mean(dim=0)  # (batch_size, d_model)

        # Classification MLP
        out = self.fc(x)  # (batch_size, n_classes)

        return out

# Example usage:
# Parameters
num_features = 1     # Number of features in the time series
seq_len = 187        # Length of each time series sequence
d_model = 200        # Transformer model dimension
nhead = 2            # Number of attention heads
num_encoder_layers = 1 # Number of Transformer encoder layers
dim_feedforward = 128 # Feedforward dimension
dropout = 0.1        # Dropout rate
n_classes = 5        # Number of output classes
device = "cuda" if torch.cuda.is_available() else "cpu"


## Datasets and dataloader classes for training, validation and testing
I asked our friend Chad about this one too, giving him input with an excerpt from the training data file for information

In [15]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split
import pandas as pd

df_train = pd.read_csv('data/mitbih_train.csv')
df_test = pd.read_csv('data/mitbih_test.csv')

# Custom Dataset class for time series data
class TimeSeriesDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        # Add another dimension at the end if we later on wants to work on multivariate time-series
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.features = self.features.unsqueeze(-1)  

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

batch_size = 100

#### TEST data #####
# Separate features and labels
labels = df_test.iloc[:, -1].values     # Last column as the label
features = df_test.iloc[:, :-1].values  # All columns except the last one

# Create Dataset and DataLoader for testing
dataset = TimeSeriesDataset(features, labels)
test_loader = DataLoader(dataset, batch_size, shuffle=True)

#### TRAIN/VALIDATION data ####
# Separate features and labels
labels = df_train.iloc[:, -1].values     # Last column as the label
features = df_train.iloc[:, :-1].values  # All columns except the last one

# Create Dataset and DataLoader
dataset = TimeSeriesDataset(features, labels)

# Dataset splitting for training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

for batch_idx, (inputs, targets) in enumerate(train_loader):
    print(f"Batch {batch_idx+1}:")
    print(f"Inputs: {inputs.shape}")
    print(f"Targets: {targets.shape}")
    break  # Just to show the first batch


Batch 1:
Inputs: torch.Size([100, 187, 1])
Targets: torch.Size([100])


## Training
Now, running through all epochs and batches

In [18]:
import torch.optim as optim

# Initialize model, loss function, and optimizer
model = TimeSeriesTransformer(num_features, d_model, nhead, num_encoder_layers, dim_feedforward, dropout, n_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model = model.to(device)

# Training and Validation Loop
n_epochs = 20

for epoch in range(n_epochs):
    print(f'Epoch {epoch+1}/{n_epochs}')

    # Training phase
    model.train()  # Set model to training mode
    running_train_loss = 0.0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # Move data to device
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)
    print(f'Training Loss: {avg_train_loss:.4f}')

    # Validation phase
    model.eval()  # Set model to evaluation mode
    running_val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for validation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()

            # Accuracy calculation
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = running_val_loss / len(val_loader)
    val_accuracy = correct / total
    print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    torch.save(model.state_dict(), 'myModel')
    


Epoch 1/20
Training Loss: 0.5255
Validation Loss: 0.4302, Validation Accuracy: 0.8786
Epoch 2/20
Training Loss: 0.3977
Validation Loss: 0.3600, Validation Accuracy: 0.8944
Epoch 3/20
Training Loss: 0.3556
Validation Loss: 0.3339, Validation Accuracy: 0.9043
Epoch 4/20
Training Loss: 0.3269
Validation Loss: 0.3295, Validation Accuracy: 0.9067
Epoch 5/20
Training Loss: 0.3050
Validation Loss: 0.2948, Validation Accuracy: 0.9142
Epoch 6/20
Training Loss: 0.2925
Validation Loss: 0.2825, Validation Accuracy: 0.9191
Epoch 7/20
Training Loss: 0.2824
Validation Loss: 0.3053, Validation Accuracy: 0.9182
Epoch 8/20
Training Loss: 0.2743
Validation Loss: 0.2644, Validation Accuracy: 0.9250
Epoch 9/20
Training Loss: 0.2621
Validation Loss: 0.2700, Validation Accuracy: 0.9232
Epoch 10/20
Training Loss: 0.2551
Validation Loss: 0.2534, Validation Accuracy: 0.9267
Epoch 11/20
Training Loss: 0.2443
Validation Loss: 0.2549, Validation Accuracy: 0.9263
Epoch 12/20
Training Loss: 0.2360
Validation Loss: 0