In [1]:
!pip install wfdb matplotlib pandas numpy

Collecting wfdb
  Using cached wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting aiohttp>=3.10.11 (from wfdb)
  Using cached aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl.metadata (8.1 kB)
Collecting aiosignal>=1.4.0 (from aiohttp>=3.10.11->wfdb)
  Using cached aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Using cached wfdb-4.3.0-py3-none-any.whl (163 kB)
Using cached aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl (499 kB)
Using cached aiosignal-1.4.0-py3-none-any.whl (7.5 kB)
Installing collected packages: aiosignal, aiohttp, wfdb
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [wfdb]━━━━━━[0m [32m1/3[0m [aiohttp]
[1A[2KSuccessfully installed aiohttp-3.13.3 aiosignal-1.4.0 wfdb-4.3.0


In [3]:
import os

def record_id_to_path(base_dir, record_id, resolution="lr"):
    """
    Build the path (without file extension) of a record inside `base_dir`.

    base_dir: root folder of the records, e.g. "ptb-xl/records100"
    record_id: int or numeric str, e.g. 1 or "00001"
    resolution: suffix appended to the file name, "lr" or "hr"

    Returns `<base_dir>/<folder>/<id>_<resolution>`, where both the folder and
    the id are zero-padded to five digits.
    """
    numeric_id = int(record_id)
    # Records are grouped in folders named after the lower bound of their
    # block of one thousand ids: 0-999 -> "00000", 1000-1999 -> "01000", ...
    bucket_start = numeric_id - numeric_id % 1000
    return os.path.join(
        base_dir,
        f"{bucket_start:05d}",
        f"{numeric_id:05d}_{resolution}",
    )

In [4]:
import wfdb

def load_ecg_signal(base_dir, record_id, resolution="lr"):
    """
    Read one record with wfdb and return its signal, sampling rate and lead names.

    base_dir, record_id, resolution: forwarded to `record_id_to_path` to locate
    the record on disk.

    Returns a tuple `(signal, fs, leads)` taken from the wfdb record:
    `p_signal`, `fs` and `sig_name` respectively.
    """
    rec = wfdb.rdrecord(record_id_to_path(base_dir, record_id, resolution))
    # p_signal is expected to be (n_samples, 12) for these records; sig_name
    # holds the lead names in the same column order.
    return rec.p_signal, rec.fs, rec.sig_name

In [5]:
import matplotlib.pyplot as plt
import numpy as np

def plot_ecg_image(signal, fs, leads, out_path,
                   seconds=10, dpi=150):
    """
    Render the first `seconds` of a multi-lead ECG as stacked traces and save it.

    One subplot row is drawn per signal column (lead), all rows share a time
    axis in seconds, and the figure is written to `out_path`.

    signal: array-like of shape (n_samples, n_leads); 12 leads in the usual case
    fs: sampling rate in samples per second, used to turn indices into seconds
    leads: sequence of lead names, one per signal column, used as y-axis labels
    out_path: destination passed to `Figure.savefig`
    seconds: length of the plotted window, counted from the start of the signal
    dpi: resolution of the saved image

    Raises ValueError if `signal` is not two-dimensional or has no leads.
    """
    signal = np.asarray(signal)
    if signal.ndim != 2 or signal.shape[1] == 0:
        raise ValueError(
            f"signal must have shape (n_samples, n_leads), got {signal.shape}"
        )

    n_samples = int(seconds * fs)
    signal = signal[:n_samples]
    n_leads = signal.shape[1]

    time = np.arange(signal.shape[0]) / fs

    # Height scales with the lead count; 12 leads keep the original 10x14 canvas.
    # squeeze=False always returns a 2-D axes array, so a single lead works too.
    fig, axes = plt.subplots(
        n_leads, 1,
        figsize=(10, max(2.0, 14 * n_leads / 12)),
        sharex=True,
        squeeze=False,
    )
    try:
        for i, ax in enumerate(axes[:, 0]):
            ax.plot(time, signal[:, i], linewidth=0.8)
            ax.set_ylabel(leads[i], rotation=0, labelpad=20)
            ax.grid(True, linestyle="--", alpha=0.3)

        axes[-1, 0].set_xlabel("Time (s)")
        fig.suptitle(f"{n_leads}-lead ECG", fontsize=16)
        # Leave the top 3% of the canvas free for the suptitle.
        fig.tight_layout(rect=[0, 0, 1, 0.97])
        fig.savefig(out_path, dpi=dpi)
    finally:
        # Close this specific figure even if plotting/saving failed, so batch
        # runs over many records do not accumulate open figures.
        plt.close(fig)

In [6]:
def generate_ecg_png(base_dir, record_id, out_dir,
                     resolution="lr", seconds=10):
    """
    Load one record and save its ECG plot as `<out_dir>/<id>.png`.

    base_dir, record_id, resolution: identify the record to load
    out_dir: output folder, created if it does not exist yet
    seconds: length of the plotted window, forwarded to `plot_ecg_image`

    Returns the path of the written PNG; the id in the file name is
    zero-padded to five digits.
    """
    os.makedirs(out_dir, exist_ok=True)

    ecg, sampling_rate, lead_names = load_ecg_signal(base_dir, record_id, resolution)

    png_name = f"{int(record_id):05d}.png"
    target = os.path.join(out_dir, png_name)
    plot_ecg_image(ecg, sampling_rate, lead_names, target, seconds=seconds)

    return target

In [8]:
# Input folder with the records and output folder for the rendered PNGs;
# both constants are reused by later cells.
BASE_DIR = "ecg/records100"
OUT_DIR = "ecg_images"

# Smoke test: render record 1 before running the whole batch.
png_path = generate_ecg_png(BASE_DIR, 1, OUT_DIR, resolution="lr", seconds=10)
print("Saved:", png_path)

Saved: ecg_images/00001.png


In [9]:
import pandas as pd

# Load the PTB-XL metadata table; the cells below read its `scp_codes` and
# `ecg_id` columns.
df = pd.read_csv("ptbxl_database.csv")

# PTB-XL → triage mapping
def map_to_triage(scp_codes):
    """
    Map the raw `scp_codes` text of a record to a triage colour.

    The text is lower-cased and searched for plain substrings:
    "mi" -> "RED"; otherwise "sttc", "cd" or "arr" -> "YELLOW";
    otherwise "GREEN". The first matching rule wins.

    NOTE(review): this is substring matching on the whole string, not on
    individual codes. "sttc" and "cd" look like PTB-XL superclass names rather
    than SCP statement codes — confirm against scp_statements.csv that the raw
    `scp_codes` values can actually contain them.
    """
    text = scp_codes.lower()
    # Ordered by priority: earlier rules take precedence over later ones.
    rules = (
        ("RED", ("mi",)),
        ("YELLOW", ("sttc", "cd", "arr")),
    )
    for label, needles in rules:
        if any(needle in text for needle in needles):
            return label
    return "GREEN"

# Label every record, then draw a fixed-size, seeded sample per triage class
# (GREEN, YELLOW, RED — concatenated in that order).
df["triage"] = df["scp_codes"].apply(map_to_triage)

sample_df = pd.concat([
    df[df["triage"] == label].sample(count, random_state=42)
    for label, count in (("GREEN", 70), ("YELLOW", 70), ("RED", 60))
])

ids = sample_df["ecg_id"].tolist()
print(len(ids))

200


In [11]:
# Render one PNG per sampled record and report where each one was written.
for id_ in ids:
    png_path = generate_ecg_png(BASE_DIR, id_, OUT_DIR, resolution="lr", seconds=10)
    print("Saved:", png_path)

Saved: ecg_images/02950.png
Saved: ecg_images/21319.png
Saved: ecg_images/02070.png
Saved: ecg_images/21437.png
Saved: ecg_images/16163.png
Saved: ecg_images/08283.png
Saved: ecg_images/02100.png
Saved: ecg_images/14693.png
Saved: ecg_images/10906.png
Saved: ecg_images/02732.png
Saved: ecg_images/19659.png
Saved: ecg_images/11351.png
Saved: ecg_images/13325.png
Saved: ecg_images/20482.png
Saved: ecg_images/17724.png
Saved: ecg_images/19818.png
Saved: ecg_images/00993.png
Saved: ecg_images/11092.png
Saved: ecg_images/12529.png
Saved: ecg_images/20129.png
Saved: ecg_images/14067.png
Saved: ecg_images/07407.png
Saved: ecg_images/04582.png
Saved: ecg_images/04247.png
Saved: ecg_images/03928.png
Saved: ecg_images/10534.png
Saved: ecg_images/04998.png
Saved: ecg_images/01131.png
Saved: ecg_images/03285.png
Saved: ecg_images/16750.png
Saved: ecg_images/08691.png
Saved: ecg_images/20463.png
Saved: ecg_images/21334.png
Saved: ecg_images/08117.png
Saved: ecg_images/06758.png
Saved: ecg_images/02

In [14]:
# Per triage class, count the non-null values in every column of the sample
# (quick check of the class split and of which metadata fields are sparse).
sample_df.groupby('triage').count()

Unnamed: 0_level_0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
triage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GREEN,70,70,70,70,25,34,64,70,70,70,...,70,2,8,3,0,5,0,70,70,70
RED,60,60,60,60,13,14,53,60,60,60,...,60,3,11,1,1,11,0,60,60,60
YELLOW,70,70,70,70,20,31,67,70,70,70,...,70,3,11,2,0,9,0,70,70,70


In [15]:
def build_core_dataset_df(sample_df, image_dir="ecg_images"):
    """
    Build the core dataset table from the sampled metadata rows.

    Works on a copy of `sample_df` (the input is not modified) and:
      * renames `triage` -> `triage_label` and, if present,
        `super_class` -> `ptbxl_superclass`;
      * adds `image_path` = `<image_dir>/<ecg_id zero-padded to 5 digits>.png`;
      * adds empty-string columns `ecg_summary`, `symptoms` and
        `triage_target_json`;
      * sorts by `ecg_id` and resets the index.

    Returns the new DataFrame.
    """
    # Unify column names (identity entries are kept to list the core columns).
    column_aliases = {
        "ecg_id": "ecg_id",
        "super_class": "ptbxl_superclass",
        "scp_codes": "scp_codes",
        "triage": "triage_label",
    }

    def png_path_for(ecg_id):
        # Path only; the image does not have to exist on disk yet.
        return os.path.join(image_dir, f"{int(ecg_id):05d}.png")

    renamed = sample_df.copy().rename(columns=column_aliases)

    return (
        renamed
        .assign(
            image_path=renamed["ecg_id"].apply(png_path_for),
            # Placeholders filled in by later pipeline stages.
            ecg_summary="",
            symptoms="",
            triage_target_json="",
        )
        # Sort for reproducibility.
        .sort_values("ecg_id")
        .reset_index(drop=True)
    )

In [16]:
# Reuse OUT_DIR so the recorded image paths always point at the directory the
# PNGs were actually written to, instead of repeating the literal folder name.
core_df = build_core_dataset_df(sample_df, image_dir=OUT_DIR)

core_df.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,triage_label,image_path,ecg_summary,symptoms,triage_target_json
0,146,13447.0,45.0,0,182.0,90.0,,3.0,CS-12 E,1986-02-23 11:41:43,...,,,7,records100/00000/00146_lr,records500/00000/00146_hr,RED,ecg_images/00146.png,,,
1,177,21551.0,73.0,0,,,,3.0,AT-6 C,1986-03-15 08:11:15,...,,,4,records100/00000/00177_lr,records500/00000/00177_hr,RED,ecg_images/00177.png,,,
2,223,16039.0,82.0,0,,,1.0,2.0,CS-12,1986-06-08 17:24:13,...,,,9,records100/00000/00223_lr,records500/00000/00223_hr,RED,ecg_images/00223.png,,,
3,544,13772.0,67.0,0,,,,3.0,AT-6 C,1987-03-13 16:32:49,...,,,9,records100/00000/00544_lr,records500/00000/00544_hr,RED,ecg_images/00544.png,,,
4,631,5043.0,83.0,0,183.0,70.0,11.0,1.0,AT-6 6,1987-04-17 10:10:03,...,1ES,,4,records100/00000/00631_lr,records500/00000/00631_hr,GREEN,ecg_images/00631.png,,,


In [19]:
# Persist the core dataset table; index=False leaves out the positional index
# that build_core_dataset_df reset.
core_df.to_csv("core_db.csv", index=False)