<a href="https://colab.research.google.com/github/kiril-buga/Neural-Network-Training-Project/blob/main/Deeplearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load the dataset from the shared Google Drive

In [14]:
# ===== Install required libraries (run once per session) =====
!pip install wfdb pandas==2.2.2



In [15]:

# ===== Imports =====
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import wfdb

import matplotlib.pyplot as plt # for plotting

# ===== Detect if running in Google Colab and mount Drive =====
IN_COLAB = False
try:
    from google.colab import drive  # type: ignore
    IN_COLAB = True
except Exception:
    drive = None
    IN_COLAB = False

if IN_COLAB:
    drive.mount('/content/drive/')

# ===== Define paths =====
if IN_COLAB:
    # Case 1: You manually placed the dataset in MyDrive
    DATA_PATH = "/content/drive/MyDrive/DeepLearningECG/data/"
    ARTIFACT_DIR = "/content/drive/MyDrive/DeepLearningECG/artifacts/"

else:
    # Case 3: Local fallback (if running outside Colab)
    DATA_PATH = "../data/"
    ARTIFACT_DIR = "../artifacts/"

print("DATA_PATH:", DATA_PATH)
print("ARTIFACT_DIR:", ARTIFACT_DIR)
print("Files in DATA_PATH:", os.listdir(DATA_PATH))

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
DATA_PATH: /content/drive/MyDrive/DeepLearningECG/data/
ARTIFACT_DIR: /content/drive/MyDrive/DeepLearningECG/artifacts/
Files in DATA_PATH: ['ECGCode.csv', 'DiseaseCode.csv', 'ExampleReadingCode.ipynb', 'AttributesDictionary.csv', 'Child_ecg.zip', 'Child_ecg.z01', 'Child_ecg']


2. ECG Waveform and Diagnosis Loader (WFDB-based)

In [16]:
# Path where the WFDB ECG files (.hea/.dat) live.
ECG_PATH = os.path.join(DATA_PATH, "Child_ecg")

# Load the raw ECG signal data
def load_raw_data(df, path):
    filenames = df['Filename']
    data = [wfdb.rdsamp(path + f)[0] for f in filenames]
    return data

# Load diagnostic comments from WFDB metadata
def load_Diag(df, path):
    Dissess_Diag = []
    ECG_Diag = []
    for filename in df['Filename']:
        record = wfdb.rdrecord(path + filename)
        message = record.comments
        Dissess_Diag.append(message[1])
        ECG_Diag.append(message[2])
    return Dissess_Diag, ECG_Diag


3. Load the CSV metadata

In [17]:
# ===== Load CSV metadata =====
ATTR_PATH = os.path.join(DATA_PATH, "AttributesDictionary.csv")
DISEASE_PATH = os.path.join(DATA_PATH, "DiseaseCode.csv")
ECGCODE_PATH = os.path.join(DATA_PATH, "ECGCode.csv")

df_attr = pd.read_csv(ATTR_PATH)
df_disease = pd.read_csv(DISEASE_PATH)
df_ecgcode = pd.read_csv(ECGCODE_PATH)

print("Attributes shape:", df_attr.shape)
display(df_attr.head())
print("DiseaseCode shape:", df_disease.shape)
display(df_disease.head())
print("ECGCode shape:", df_ecgcode.shape)
display(df_ecgcode.head())

Attributes shape: (14190, 14)


Unnamed: 0,Filename,ECG_ID,Patient_ID,Age,Gender,Acquisition_date,Sampling_point,Lead,AHA_code,CHN_code,ICD-10 code,pSQI,basSQI,bSQI
0,P00/P00001/P00001_E01,P00001_E01,P00001,572d,'Female',2017-11-22 10:46:08,9000,9,'Left ventricular high voltage';'L147','J106';'L123','I34.0';'Q21.0';'Q24.9','I':0.288;'II':0.323;'III':0.346;'aVR':0.312;'...,'I':0.994;'II':0.996;'III':0.991;'aVR':0.997;'...,'I':1.000;'II':1.000;'III':1.000;'aVR':1.000;'...
1,P00/P00002/P00002_E01,P00002_E01,P00002,4327d,'Male',2017-11-28 21:59:47,15000,12,'C21','C13','I51.4';'J18.9','I':0.472;'II':0.446;'III':0.449;'aVR':0.484;'...,'I':0.995;'II':0.980;'III':0.992;'aVR':0.992;'...,'I':1.000;'II':1.000;'III':1.000;'aVR':1.000;'...
2,P00/P00003/P00003_E01,P00003_E01,P00003,1087d,'Female',2017-11-29 16:04:57,10000,12,'C21','C13','Q21.0';'Q24.9','I':0.495;'II':0.347;'III':0.340;'aVR':0.382;'...,'I':0.915;'II':0.895;'III':0.882;'aVR':0.908;'...,'I':1.000;'II':1.000;'III':1.000;'aVR':1.000;'...
3,P00/P00004/P00004_E01,P00004_E01,P00004,2465d,'Male',2017-11-30 15:21:27,13000,9,'C21','C13','Q21.1';'Q24.9','I':0.340;'II':0.405;'III':0.409;'aVR':0.350;'...,'I':0.981;'II':0.988;'III':0.974;'aVR':0.986;'...,'I':1.000;'II':1.000;'III':1.000;'aVR':1.000;'...
4,P00/P00004/P00004_E02,P00004_E02,P00004,2461d,'Male',2017-11-26 19:19:48,15000,9,'A1','A1','Q21.1';'Q24.9','I':0.501;'II':0.494;'III':0.389;'aVR':0.525;'...,'I':0.993;'II':0.993;'III':0.989;'aVR':0.995;'...,'I':1.000;'II':1.000;'III':1.000;'aVR':1.000;'...


DiseaseCode shape: (20, 4)


Unnamed: 0,Disease Type,Disease Category,ICD-10 Code,ICD-10 Description
0,Myocarditis,Fulminant myocarditis,(F) I40.0,Infective myocarditis;Septic myocarditis;Use a...
1,Myocarditis,Viral myocarditis,(V) I40.0,Infective myocarditis;Septic myocarditis;Use a...
2,Myocarditis,Acute myocarditis,I40.9,"Acute myocarditis, unspecified"
3,Myocarditis,Myocarditis,I51.4,"Myocarditis, unspecified;Myocardial fibrosis;M..."
4,Cardiomyopathy,Dilated cardiomyopathy,I42.0,Dilated cardiomyopathy;Congestive cardiomyopathy


ECGCode shape: (105, 3)


Unnamed: 0,Description,AHA(Category&Code),CHN(Category&Code)
0,Normal ECG,A1,A1
1,Otherwise normal ECG,A2,A2
2,Sinus tachycardia,C21,C13
3,Sinus bradycardia,C22,C14
4,Sinus arrhythmia,C23,C15


____________ BIS HIER FUNKTIONIERT ES ____________________

In [21]:
import matplotlib.pyplot as plt
# load row signal data
X = load_raw_data(df_attr, ECG_PATH) # load all ECG signals
data = X[0].T # take first record, transpose -> (leads, samples)
for i in range(data.shape[0]):
    plt.plot(data[i]) # plot each lead on the same figure

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DeepLearningECG/data/Child_ecgP00/P00001/P00001_E01.hea'

In [None]:
import warnings
warnings.filterwarnings("ignore")
from sqi_score.ecg_qc.ecg_qc import EcgQc  # https://github.com/Aura-healthcare/ecg_qc
from biosppy.signals import ecg
import neurokit2 as nk
def get_rpeaks(signal, sample_rate=100, method="nabian2018"):
    _, info = nk.ecg_peaks(signal, sampling_rate=sample_rate, method=method, correct_artifacts=False)
    rpeaks = info['ECG_R_Peaks']
    return rpeaks
sqi = []
bsqi = []
for item in range(data.shape[0]):
    rpeaks1 = get_rpeaks(data[item], sample_rate=100, method='nabian2018')
    rpeaks2 = get_rpeaks(data[item], sample_rate=100, method='promac')
    bSQI = ecg.bSQI(rpeaks1,rpeaks2,fs=100)
    bsqi.append(bSQI)
    ecg_list = data[item].tolist()
    flattened_ecg = np.ravel(ecg_list)
    warnings.filterwarnings("ignore")
    ecg_qc = EcgQc('rfc_norm_2s.pkl', sampling_frequency=500, normalized=True)
    sqi_scores = np.array(ecg_qc.compute_sqi_scores(flattened_ecg))
    sqi.append(sqi_scores[0])
sqi = np.array(sqi)
bsqi = np.array(bsqi)
mean_sqi = np.mean(sqi,axis=0)
mean_bsqi = np.mean(bsqi,axis=0)
print('The quality of signal: pSQI = {:.3f}, basSQI = {:.3f}, bSQI = {:.3f}'.format(mean_sqi[4], mean_sqi[5], mean_bsqi))

In [None]:
# load Disease_Diag and ECG_Diag
Disease_Diag, ECG_Diag = load_Diag(Y, path)
Disease_Diag, ECG_Diag

In [None]:
# Split data into train val and test
# Get unique Patient_ID values
patient_ids = Y['Patient_ID'].unique()

# Split Patient_IDs into training, validation, and test sets
train_ids, test_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)
train_ids, val_ids = train_test_split(train_ids, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Get indices for training, validation, and test sets based on Patient_ID
train_idx = Y[Y['Patient_ID'].isin(train_ids)].index
val_idx = Y[Y['Patient_ID'].isin(val_ids)].index
test_idx = Y[Y['Patient_ID'].isin(test_ids)].index

# Split X into training, validation, and test sets based on the indices
# Train
X_train = [X[i] for i in train_idx]
Y_train = [Disease_Diag[i] for i in train_idx]
# val
X_val = [X[i] for i in val_idx]
Y_val = [Disease_Diag[i] for i in val_idx]
# test
X_test = [X[i] for i in test_idx]
Y_test = [Disease_Diag[i] for i in test_idx]
print(f'The training set has {len(X_train)} records')
print(f'The val set has {len(X_val)} records')
print(f'The test set has {len(X_test)} records')