# Interpreting ECGs Using LLMs and ML

## 0. Importing the Data

To download the ECG recordings for one patient (`p00000`) from the Icentia11k ECG dataset, I ran the following command in a terminal. It recursively fetches everything under that patient's directory:
```bash
wget -r -N -c -np https://physionet.org/files/icentia11k-continuous-ecg/1.0/p00/p00000/
```

## 1. Data Loading and Preprocessing

In [1]:
# import necessary packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wfdb
import neurokit2 as nk

In [2]:
# adding path to python scripts
import sys
sys.path.append("src")

# python script containing methods to extract relevant clinical features from an ECG signal
from ecg_processor import ECGProcessor

In [3]:
# folder containing all the ECG signals from the Icentia11k dataset
# (relative path; matches the directory tree that the recursive wget in section 0 creates)
path = "physionet.org/files/icentia11k-continuous-ecg/1.0"

# load in one ECG signal from one patient (patient 0, segment 20)
# NOTE(review): the loading and cleaning happen inside ECGProcessor (src/ecg_processor.py), which is not shown here
processor = ECGProcessor(data_path=path, patient_id=0, segment_id=20)

  mrrs /= th2
  mrrs /= th2


In [4]:
# segment the ECG signal into heartbeats
# the result is a mapping from a string key (e.g. "1") to a per-beat table
beats = processor.segment_by_beats()
# preview the beat stored under key "1" (last expression, so it renders as a table)
beats["1"]

Unnamed: 0,ECG_Raw,ECG_Clean,ECG_Rate,ECG_Quality,ECG_R_Peaks,ECG_P_Peaks,ECG_P_Onsets,ECG_P_Offsets,ECG_Q_Peaks,ECG_R_Onsets,...,ECG_S_Peaks,ECG_T_Peaks,ECG_T_Onsets,ECG_T_Offsets,ECG_Phase_Atrial,ECG_Phase_Completion_Atrial,ECG_Phase_Ventricular,ECG_Phase_Completion_Ventricular,Index,Label
-0.200000,-0.134472,-0.263427,101.321191,0.634454,0,0,0,0,0,0,...,0,0,0,0,,0.000000,,0.000000,34,1
-0.195973,-0.121025,-0.260753,101.321191,0.634454,0,0,0,0,0,0,...,0,0,0,0,,0.000000,,0.000000,35,1
-0.191946,-0.121025,-0.257546,101.321191,0.634454,0,0,0,0,0,0,...,0,0,0,0,,0.000000,,0.000000,36,1
-0.187919,-0.121025,-0.254344,101.321191,0.634454,0,0,0,0,0,0,...,0,0,0,0,,0.000000,,0.000000,37,1
-0.183893,-0.121025,-0.251685,101.321191,0.634454,0,0,0,0,0,0,...,0,0,0,0,,0.000000,,0.000000,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.383893,-0.121025,-0.088032,106.517060,0.739873,0,0,1,0,0,0,...,0,0,0,0,0.0,0.935185,0.0,0.464789,179,1
0.387919,-0.121025,-0.085442,106.547155,0.740636,0,0,0,0,0,0,...,0,0,0,0,0.0,0.944444,0.0,0.478873,180,1
0.391946,-0.107578,-0.078020,106.576594,0.741395,0,0,0,0,0,0,...,0,0,0,0,0.0,0.953704,0.0,0.492958,181,1
0.395973,-0.080683,-0.066840,106.605370,0.742147,0,0,0,0,0,0,...,0,0,0,0,0.0,0.962963,0.0,0.507042,182,1


## 2. Feature Extraction and Machine Learning Modelling

In [5]:
# analyze heart rate variability metrics
# the result is a one-row table holding the mean heart rate and the HRV_* metrics shown below
beat_analysis = processor.analyze_beats()
beat_analysis

Unnamed: 0,ECG_Rate_Mean,HRV_SDNN,HRV_RMSSD,HRV_pNN50,HRV_SD1,HRV_SampEn
0,101.325478,[[42.46137677892019]],[[12.259400331319126]],[[0.6918949449308105]],[[8.66931695130756]],[[0.48534839953746406]]


In [13]:
# inspect the same metrics as a nested dict ({column: {row_index: value}});
# note in the output that the HRV values come back wrapped in 1x1 arrays rather than plain floats
beat_analysis.to_dict()

{'ECG_Rate_Mean': {0: 101.32547776957566},
 'HRV_SDNN': {0: array([[42.46137678]])},
 'HRV_RMSSD': {0: array([[12.25940033]])},
 'HRV_pNN50': {0: array([[0.69189494]])},
 'HRV_SD1': {0: array([[8.66931695]])},
 'HRV_SampEn': {0: array([[0.4853484]])}}

In [6]:
# gather the cleaned ECG trace of every segmented heartbeat
ecg_list = [beat_df["ECG_Clean"].values for beat_df in beats.values()]

# stack into a 3D array where each feature vector represents one heartbeat,
# with a trailing axis of size 1 for the single feature channel
ecg_array = np.expand_dims(np.array(ecg_list), axis=-1)
ecg_array.shape

(5835, 150, 1)

In [7]:
# integer class id for each beat-type letter
label_mapping = {
    'N': 0,
    'S': 1,
    'V': 2,
}

# translate every beat's letter label into its integer class
# (a label that is not a key of label_mapping raises KeyError)
labels_int = np.array(
    [label_mapping[beat_label] for beat_label in processor.labels]
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [None]:
# LSTM input dimensions, read from axes 1 and 2 of the beat array
num_timesteps = ecg_array.shape[1]
num_features = ecg_array.shape[2]

# prepare labels for model: one-hot encode the integer classes to match the categorical cross-entropy loss
labels_onehot = to_categorical(labels_int, num_classes=3)

# a single LSTM layer followed by a softmax over the three beat classes
model = Sequential([
    LSTM(32, input_shape=(num_timesteps, num_features)),
    Dense(3, activation="softmax")  # beat classification (0 = normal, 1 = PAC, 2 = PVC), matching label_mapping
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

# train model
# NOTE(review): the model is fit on every beat with no validation or held-out split,
# so the accuracy reported during training is training accuracy only
model.fit(ecg_array, labels_onehot, epochs=5, batch_size=16)

In [None]:
# get probabilities for each class (one row per beat, one column per class)
# NOTE(review): these are predictions on the same beats the model was trained on
beat_label_probs = model.predict(ecg_array)

# get predicted class index: the most probable class for each beat
# (the original called `p.argmax`, but `p` is never defined; NumPy is imported as `np`)
beat_pred = np.argmax(beat_label_probs, axis=1)

In [16]:
# human-readable name for each integer beat class
int2beat = {0: "Normal contraction", 1: "ESSV (PAC) contraction", 2: "ESV (PVC) contraction"}

# share of each predicted beat type, as a percentage of all beats
beat_percentages = pd.Series(beat_pred).value_counts() / len(beat_pred) * 100

# swap the integer classes for their names, then convert to a plain dict
beat_percentages.index = beat_percentages.index.map(int2beat)
beat_distribution = beat_percentages.to_dict()
beat_distribution

{'Normal contraction': 50.0,
 'ESSV (PAC) contraction': 25.0,
 'ESV (PVC) contraction': 25.0}

## 3. Large Language Model Interpretation

In [None]:
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# load the bloom model (doesn't require key)
model_name = "bigscience/bloom-560m"
text_generator = pipeline("text-generation", model=model_name)

# create prompt for model: the HRV metrics table and the predicted beat
# distribution are interpolated as their default string representations
prompt = f"""
You are a cardiology assistant. 
Based on the following patient ECG and heart rate variability (HRV) features:

Features:
{beat_analysis}

Predicted beat distribution (% of each type):
{beat_distribution}

Explain to a clinician in simple terms:
- Key insights about the patient's arrhythmia
- Any patterns or risks the clinician should be aware of
- Suggestions for further investigation or monitoring
"""


# generate health insights
# max_new_tokens limits only the generated continuation. The previous
# max_length=300 also counted the prompt tokens, so a long prompt left
# little or no room for the model to generate anything.
output = text_generator(
    prompt,
    max_new_tokens=300,    # generated tokens only, independent of prompt length
    num_return_sequences=1)

# generated_text contains the prompt followed by the model's continuation
print(output[0]["generated_text"])