# Modeling #

## Import APIs ##

In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import wfdb
import ast

## Load data ##

### Metadata ###

In [31]:
ptbxl_df = pd.read_csv('./cleaned_data/cleaned_ptbxl_metadata.csv', index_col='ecg_id')

In [3]:
ptbxl_df.head()

Unnamed: 0_level_0,age,sex,device,validated_by_human,diagnostic_superclass,strat_fold,filename_lr,filename_hr
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,56.0,1,CS-12 E,True,['NORM'],3,records100/00000/00001_lr,records500/00000/00001_hr
2,19.0,0,CS-12 E,True,['NORM'],2,records100/00000/00002_lr,records500/00000/00002_hr
3,37.0,1,CS-12 E,True,['NORM'],5,records100/00000/00003_lr,records500/00000/00003_hr
4,24.0,0,CS-12 E,True,['NORM'],3,records100/00000/00004_lr,records500/00000/00004_hr
5,19.0,1,CS-12 E,True,['NORM'],4,records100/00000/00005_lr,records500/00000/00005_hr


In [32]:
metadata = ptbxl_df.loc[:, ['age', 'sex', 'device', 'validated_by_human']].copy()
metadata.head()

Unnamed: 0_level_0,age,sex,device,validated_by_human
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,56.0,1,CS-12 E,True
2,19.0,0,CS-12 E,True
3,37.0,1,CS-12 E,True
4,24.0,0,CS-12 E,True
5,19.0,1,CS-12 E,True


### Waveform data ###

In [33]:
waveform_data = []
for idx in ptbxl_df.index:
    record_path = ptbxl_df.loc[idx]['filename_hr']
    waveform_df = pd.read_csv('./cleaned_data/waveform_data/' + record_path + '.csv', index_col='Time (s)')
    waveform_data.append(waveform_df)
waveform_data = np.array(waveform_data)
waveform_data.shape

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [33]:
#np.save("./cleaned_data/waveform_np.npy", waveform_data)

In [34]:
waveform_data = np.load("./cleaned_data/waveform_np.npy")

## Create recommended train-test split ##

visualization (not completely accurate) of waveform_data
| Scaled Age | Male | Female | Device A | Device B | Validated True | Validated False |
|------------|------|--------|----------|----------|----------------|-----------------|
| -0.5       | 1    | 0      | 1        | 0        | 1              | 0               |
| 1.2        | 0    | 1      | 0        | 1        | 0              | 1               |
| -0.7       | 1    | 0      | 1        | 0        | 1              | 0               |


chatgpt rec
https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#column-transformer-with-mixed-types

This recommended train-test split code was obtained from the downloaded folder with the dataset: https://physionet.org/content/ptb-xl/1.0.3/.

In [35]:
#make tensors
transformers = [
    ('num', StandardScaler(), ['age']),
    ('cat', OneHotEncoder(), ['sex', 'device', 'validated_by_human'])
]

ct = ColumnTransformer(transformers, remainder='passthrough')
normalized_metadata = ct.fit_transform(metadata)

#make dense array
normalized_metadata  = normalized_metadata.toarray()

In [56]:
# Split data into train and test
test_fold = 10

# Train
waveform_train = waveform_data[np.where(ptbxl_df.strat_fold != test_fold)]
normalized_metadata_train = normalized_metadata[ptbxl_df.strat_fold != test_fold]
y_train = ptbxl_df[ptbxl_df.strat_fold != test_fold].diagnostic_superclass

print(waveform_train.shape)
print(normalized_metadata_train.shape)


# Test
waveform_test = waveform_data[np.where(ptbxl_df.strat_fold == test_fold)]
normalized_metadata_test = normalized_metadata[ptbxl_df.strat_fold == test_fold]
y_test = ptbxl_df[ptbxl_df.strat_fold == test_fold].diagnostic_superclass

(19601, 1000, 12)
(19601, 16)


## Normalize data ##

Only waveform train data is normalized here; may need to normalize other data too.

In [57]:
# Code generated from Bing Copilot

# Initialize a new array for the normalized data
normalized_waveform_train = np.empty_like(waveform_train)

# Iterate over each record
for i in range(waveform_train.shape[0]):
    # Iterate over each lead in the record
    for j in range(waveform_train.shape[2]):
        # Compute the minimum and maximum of the lead data
        min_val = np.min(waveform_train[i, :, j])
        max_val = np.max(waveform_train[i, :, j])

        # Check if max_val equals to min_val
        if max_val == min_val:
            # If they are equal, then all the values are the same in this lead
            # We can set the normalized values to 0 (or any constant value)
            normalized_waveform_train[i, :, j] = 0
        else:
            # Perform normalization
            normalized_waveform_train[i, :, j] = (waveform_train[i, :, j] - min_val) / (max_val - min_val)

In [42]:
normalized_waveform_train.shape

(19601, 1000, 12)

## Initialize train dataloader ##

In [58]:
#TODO: do y
normalized_waveform_train = torch.from_numpy(normalized_waveform_train).float() 
normalized_waveform_train = normalized_waveform_train.permute(0, 2, 1)
print(normalized_waveform_train.shape)


normalized_metadata_train = torch.tensor(normalized_metadata_train, dtype=torch.float32)
print(normalized_metadata_train.shape)


waveform_test = torch.tensor(waveform_test).float() 
waveform_test = waveform_test.permute(0, 2, 1)

metadata_test = torch.tensor(normalized_metadata_test, dtype=torch.float32)
print(metadata_test.shape)

torch.Size([19601, 12, 1000])
torch.Size([19601, 16])
torch.Size([2198, 16])


In [59]:
batch_size = 32
train_dataset = TensorDataset(normalized_waveform_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## Modeling ##

### CNN Autoencoder ###

debugged with chatgpt (maxpool return indicies)

In [101]:
class CNNAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.conv1 = nn.Conv1d(in_channels=12, out_channels=32, kernel_size=5, stride=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2, return_indices=True)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5, stride=2)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2, return_indices=True)
        self.relu2 = nn.ReLU()

        # Decoder
        self.unpool1 = nn.MaxUnpool1d(kernel_size=2, stride=2)
        # Adjust output_padding for proper dimension match
        self.deconv1 = nn.ConvTranspose1d(in_channels=64, out_channels=32, kernel_size=5, stride=2, output_padding=1)
        self.relu3 = nn.ReLU()
        self.unpool2 = nn.MaxUnpool1d(kernel_size=2, stride=2)
        # Adjust output_padding for proper dimension match
        self.deconv2 = nn.ConvTranspose1d(in_channels=32, out_channels=12, kernel_size=5, stride=1, output_padding=0)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.conv1(x)
        x, indices1 = self.pool1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x, indices2 = self.pool2(x)
        x = self.relu2(x)

        x = self.unpool1(x, indices2)
        x = self.deconv1(x)
        x = self.relu3(x)
        x = self.unpool2(x, indices1)
        x = self.deconv2(x)
        x = self.sigmoid(x)
        return x


In [102]:
cnn_autoencoder_model = CNNAutoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cnn_autoencoder_model.parameters(), lr=0.001)

In [103]:
def train_model_cnn(model, train_loader, criterion, optimizer, nepoch=20):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on device: {device}")
    model.to(device)
    
    for epoch in range(nepoch):
        total_loss = 0
        for batch in train_loader:
            waveforms = batch[0].to(device)
            optimizer.zero_grad()
            outputs = model(waveforms)
            loss = criterion(outputs, waveforms)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{nepoch}, Loss: {total_loss/len(train_loader)}')
    
    print("Training complete.")

In [104]:
# Train the model
train_model_cnn(cnn_autoencoder_model, train_loader, criterion, optimizer)

Training on device: cuda


RuntimeError: Expected shape of indices to be: [32, 64, 123, 1] but got: [32, 32, 498, 1]

### TCN Autoencoder ###

Model Card for the Hybrid Autoencoder
Model Name: Hybrid Autoencoder for ECG and Metadata

Description: This model is designed to learn compressed representations of combined ECG waveform and patient metadata. It utilizes separate pathways for waveform data and metadata, merging them into a dense representation which is then used to reconstruct both types of data.

Model Architecture:

Waveform Pathway: Convolutional layers followed by pooling and flattening.
Metadata Pathway: Dense layers.
Combined Encoding and Decoding: Dense layers.
Intended Use: Intended for anomaly detection in ECG data where additional patient metadata is available and considered relevant.

Data Used for Training: Assumes a dataset comprising ECG waveform data aligned with patient metadata such as age, sex, and device information.

Limitations: The model's effectiveness is highly dependent on the quality and preprocessing of the input data. The architecture needs fine-tuning and validation using real-world data to ensure robustness.

Ethical Considerations: Care should be taken to avoid biases that may arise from imbalanced data across different demographic groups. Privacy concerns should be addressed when handling patient data.

This framework sets up the foundation of your model; further tuning, training, and validation steps are needed to adapt it to specific tasks or datasets.

In [None]:
from pytorch_tcn import TCN

class TCNAutoencoder(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size, dropout, metadata_dims):
        super(TCNAutoencoder, self).__init__()
        self.encoder = TCN(
            num_inputs=num_inputs,
            num_channels=num_channels,
            kernel_size=kernel_size,
            dropout=dropout,
            causal=True,
        )
        self.age_embedding = nn.Embedding(120, metadata_dims[0])  # Assuming age range from 0 to 119
        self.sex_embedding = nn.Embedding(2, metadata_dims[1])  # Assuming sex is binary (0 or 1)
        self.device_embedding = nn.Embedding(num_devices, metadata_dims[2])  # num_devices is the number of unique devices
        
        decoder_input_dim = num_channels[-1] + sum(metadata_dims)
        self.decoder = TCN(
            num_inputs=decoder_input_dim,
            num_channels=num_channels[::-1],
            kernel_size=kernel_size,
            dropout=dropout,    
            causal=True,
            output_projection=num_inputs,
        )
        
    def forward(self, x, age, sex, device):
        encoded = self.encoder(x)
        
        age_emb = self.age_embedding(age)
        sex_emb = self.sex_embedding(sex)
        device_emb = self.device_embedding(device)
        
        metadata_emb = torch.cat([age_emb, sex_emb, device_emb], dim=-1)
        metadata_emb = metadata_emb.unsqueeze(2).expand(-1, -1, encoded.size(2))
        
        concatenated = torch.cat([encoded, metadata_emb], dim=1)
        decoded = self.decoder(concatenated)
        return decoded