<a href="https://colab.research.google.com/github/kiril-buga/Neural-Network-Training-Project/blob/main/1D_CNN_Multilabel_V2/Y_Preprocessing_10sWindow_OneHot_4Classes_v02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ECG Preprocessing to Windows using Smote

In [2]:
!pip install wfdb neurokit2 scikit-learn scipy matplotlib pandas numpy huggingface-hub h5py imbalanced-learn -q
!apt-get update && apt-get install -y p7zip-full

import os
import gc
import json
import numpy as np
import pandas as pd
import wfdb
import h5py
from scipy.signal import butter, filtfilt, welch, resample
import neurokit2 as nk
from datetime import datetime

np.random.seed(42)
print("✓ Dependencies installed")
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 https://cli.github.com/packages stable InRelease
Hit:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acq

## Environment Setup

In [3]:
from huggingface_hub import snapshot_download

print("Downloading preprocessed data from Hugging Face...")
local_dir = snapshot_download(
    repo_id="kiril-buga/ECG-database",
    repo_type="dataset",
    local_dir="./ECG-database/",
    allow_patterns="data/*"
)



DATA_PATH = f"{local_dir}/data/"
ARTIFACT_DIR = f"{local_dir}/artifacts/"



ECG_DIR = os.path.join(DATA_PATH, "Child_ecg/")
OUT_DIR = os.path.join(ARTIFACT_DIR, "multilabel_v4")
os.makedirs(OUT_DIR, exist_ok=True)

print(f"DATA: {DATA_PATH}")
print(f"OUTPUT: {OUT_DIR}")

Downloading preprocessed data from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

DATA: /content/ECG-database/data/
OUTPUT: /content/ECG-database/artifacts/multilabel_v4


In [4]:
zip_path = os.path.join(DATA_PATH, "Child_ecg.zip")

if os.path.exists(zip_path):
    # -aos : Skip extracting of existing files
    !7z x "{zip_path}" -o"{DATA_PATH}" -aos
    print("✓ Extraction check complete (existing files were skipped).")
else:
    print(f"Error: {zip_path} not found.")



7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/ECG-database/data/                                     1 file, 1908237306 bytes (1820 MiB)

Extracting archive: /content/ECG-database/data/Child_ecg.zip
 10% 4096 Open              --
Path = /content/ECG-database/data/Child_ecg.zip
Type = zip
Physical Size = 1908237306
Embedded Stub Size = 4
Total Physical Size = 4055720954
Multivolume = +
Volume Index = 1
Volumes = 2

  0%      1% 121          2% 224          3% 314          3% 396          4% 464          5% 538          6% 606          6% 693          7%

## Load CSV Metadata

In [5]:
csv_path = os.path.join(DATA_PATH, 'AttributesDictionary.csv')

if os.path.exists(csv_path):
    df_attr = pd.read_csv(csv_path)
    print(f"Loaded CSV: {df_attr.shape}")
else:
    from huggingface_hub import hf_hub_download
    print("Downloading CSV from Hugging Face...")
    csv_file = hf_hub_download(
        repo_id="kiril-buga/ECG-database",
        filename="AttributesDictionary.csv",
        repo_type="dataset"
    )
    df_attr = pd.read_csv(csv_file)
    print(f"Loaded CSV: {df_attr.shape}")

Loaded CSV: (14190, 14)


## Signal Processing Functions

In [6]:
def apply_bandpass(x, fs, lowcut=0.5, highcut=40.0):
    if x.ndim == 1:
        x = x[:, None]
    nyq = 0.5 * fs
    b, a = butter(4, [lowcut/nyq, highcut/nyq], btype="band")
    return np.column_stack([filtfilt(b, a, x[:, i]) for i in range(x.shape[1])])

def band_power(f, Pxx, fmin, fmax):
    mask = (f >= fmin) & (f <= fmax)
    return np.trapezoid(Pxx[mask], f[mask]) if np.any(mask) else 0.0

HAS_NK = True
try:
    import neurokit2
except:
    HAS_NK = False

def compute_qc(sig, meta, pSQI_mean, bSQI_mean):
    """Compute QC metrics."""
    qc = {"pSQI_mean": pSQI_mean, "bSQI_mean": bSQI_mean}

    fs = meta.get("fs", None)
    if fs is None:
        fs = getattr(meta, "fs", None)
    if fs is None:
        raise ValueError("Missing fs")

    if sig.ndim == 1:
        sig = sig[:, None]

    n_samples, n_leads = sig.shape
    qc["n_samples"] = int(n_samples)
    qc["n_leads"] = int(n_leads)
    qc["duration_sec"] = n_samples / fs

    lead = sig[:, 0]
    n_nans = np.isnan(lead).sum()
    qc["nan_fraction"] = float(n_nans / len(lead))

    lead_clean = lead.copy()
    if n_nans > 0:
        not_nan = ~np.isnan(lead_clean)
        if not np.any(not_nan):
            return {**qc, "qc_pass": False, "fail_reason": "all_nan"}
        lead_clean[~not_nan] = np.interp(np.flatnonzero(~not_nan),
                                          np.flatnonzero(not_nan), lead_clean[not_nan])

    amp = lead_clean
    qc["amp_mean"] = float(np.mean(amp))
    qc["amp_std"] = float(np.std(amp))
    q1, q99 = np.percentile(amp, [1, 99])
    qc["amp_robust_range"] = float(q99 - q1)

    f, Pxx = welch(amp, fs=fs, nperseg=min(4096, len(amp)))
    qc["baseline_wander_ratio"] = band_power(f, Pxx, 0.0, 0.5) / (band_power(f, Pxx, 0.5, 40.0) + 1e-8)
    qc["powerline_ratio"] = band_power(f, Pxx, 48.0, 52.0) / (band_power(f, Pxx, 40.0, 60.0) + 1e-8)

    reasons = []
    if qc["duration_sec"] < 8.0: reasons.append("too_short")
    if qc["nan_fraction"] > 0.01: reasons.append("too_many_nans")
    if not (0.05 < qc["amp_robust_range"] < 10.0): reasons.append("amp_out_of_range")
    if qc["baseline_wander_ratio"] > 0.5: reasons.append("baseline_wander")
    if qc["powerline_ratio"] > 0.5: reasons.append("powerline_noise")
    if pSQI_mean < 0.2: reasons.append("low_pSQI")
    if bSQI_mean < 0.8: reasons.append("low_bSQI")

    qc["qc_pass"] = len(reasons) == 0
    qc["fail_reason"] = ";".join(reasons) if reasons else ""

    return qc

print("✓ Processing functions defined")

✓ Processing functions defined


## Preprocessing & Windowing

In [7]:
def preprocess_record(sig, meta, target_fs=500.0):
    fs = meta.get("fs", None) or getattr(meta, "fs", None)
    if sig.ndim == 1:
        sig = sig[:, None]
    sig_bp = apply_bandpass(sig, fs=fs)
    if fs == target_fs:
        return sig_bp, fs
    n_samples = sig_bp.shape[0]
    n_new = int(round(n_samples / fs * target_fs))
    sig_res = np.column_stack([resample(sig_bp[:, i], n_new) for i in range(sig_bp.shape[1])])
    return sig_res, target_fs

def window_record(sig, fs, window_sec=10.0, step_sec=5.0, target_samples=None):
    """
    Create windows from preprocessed ECG signal.
    Pads/truncates to fixed sample length to ensure consistent shapes.
    Parameters:
    - sig: (n_samples, n_leads) signal array
    - fs: sampling frequency (Hz)
    - window_sec: window duration in seconds
    - step_sec: step duration in seconds
    - target_samples: target number of samples per window (default: window_sec * fs)
    Returns:
    - List of windows with shape (target_samples, n_leads)
    """
    if sig.ndim == 1:
        sig = sig[:, None]
    n_samples = sig.shape[0]
    n_leads = sig.shape[1]
    win_len = int(window_sec * fs)
    step_len = int(step_sec * fs)
    if target_samples is None:
        target_samples = win_len
    windows = []
    start = 0
    while start + win_len <= n_samples:
        segment = sig[start:start + win_len, :]
        # Skip windows with too many NaNs
        if np.isnan(segment).mean() > 0.05:
            start += step_len
            continue
        # Normalize each channel
        seg_norm = segment.copy()
        for ch in range(seg_norm.shape[1]):
            x = seg_norm[:, ch]
            m, s = np.nanmean(x), np.nanstd(x)
            seg_norm[:, ch] = (x - m) / (s if s > 1e-6 else 1.0)
        # Pad or truncate to target_samples
        if seg_norm.shape[0] < target_samples:
            pad_len = target_samples - seg_norm.shape[0]
            seg_norm = np.pad(seg_norm, ((0, pad_len), (0, 0)), mode='constant', constant_values=0)
        elif seg_norm.shape[0] > target_samples:
            seg_norm = seg_norm[:target_samples, :]
        windows.append(seg_norm.astype(np.float32))
        start += step_len
    return windows

print("✓ Preprocessing functions defined")

✓ Preprocessing functions defined


## ICD Code Parsing & Disease Mapping

In [8]:
ICD_TO_DISEASE = {
    'I40.0': 'Myocarditis', 'I40.9': 'Myocarditis', 'I41.4': 'Myocarditis',
    'I42.0': 'Cardiomyopathy', 'I42.2': 'Cardiomyopathy', 'I42.9': 'Cardiomyopathy', 'Q28.4': 'Cardiomyopathy',
    'M30.3': 'Kawasaki',
    'Q21.1': 'CHD', 'Q21.2': 'CHD', 'Q21.3': 'CHD', 'Q22.1': 'CHD', 'Q25.0': 'CHD', 'Q25.6': 'CHD', 'I27.9': 'CHD',
}

DISEASE_CLASSES = ['Myocarditis', 'Cardiomyopathy', 'Kawasaki', 'CHD', 'Healthy']
CLASS_IDX = {c: i for i, c in enumerate(DISEASE_CLASSES)}

def parse_icd(s):
    if pd.isna(s):
        return []
    return [p.strip().replace("'", "") for p in str(s).split(";") if p.strip()]

def clean_icd(code):
    if pd.isna(code):
        return None
    code_str = str(code).strip()
    if ')' in code_str:
        code_str = code_str.split(')')[-1].strip()
    return code_str or None

def parse_sqi(s):
    if pd.isna(s):
        return {}
    out = {}
    for item in str(s).split(";"):
        if ":" in item:
            k, v = item.split(":")
            try:
                out[k.replace("'", "").strip()] = float(v)
            except:
                pass
    return out

# Parse ICD codes
df_attr["ICD_list"] = df_attr["ICD-10 code"].apply(parse_icd)
df_attr["ICD_primary"] = df_attr["ICD_list"].apply(lambda x: x[0] if x else None)
df_attr["ICD_primary_clean"] = df_attr["ICD_primary"].apply(clean_icd)
df_attr["disease"] = df_attr["ICD_primary_clean"].apply(lambda x: ICD_TO_DISEASE.get(x, 'Healthy') if x else 'Healthy')

# Parse SQI
for col in ["pSQI", "basSQI", "bSQI"]:
    df_attr[f"{col}_dict"] = df_attr[col].apply(parse_sqi)
    df_attr[f"{col}_mean"] = df_attr[f"{col}_dict"].apply(lambda d: np.mean(list(d.values())) if d else np.nan)

print("Disease distribution:")
print(df_attr['disease'].value_counts())

Disease distribution:
disease
Healthy           12452
CHD                1173
Myocarditis         440
Kawasaki             67
Cardiomyopathy       58
Name: count, dtype: int64


## Main Processing Pipeline

In [25]:
def process_windows_to_hdf5(df, ecg_dir, max_records=None, target_samples=5000, target_channels=12, output_file=None):
    """
    Step 1: Process ECG records and save windows to HDF5 (no SMOTE yet).
    """
    if output_file is None:
        output_file = os.path.join(OUT_DIR, "ecg_windows_raw.h5")

    print("Step 1: Processing and saving windows to HDF5...")

    X_list = []
    y_list = []
    qc_list = []

    iterator = df.iloc[:max_records].iterrows() if max_records else df.iterrows()
    total_records = max_records if max_records else len(df)
    window_count = 0

    for idx, row in iterator:
        if (idx + 1) % 100 == 0:
            print(f"  Processing [{idx + 1}/{total_records}] - Windows: {window_count}")

        fname = row["Filename"]
        disease = row["disease"]
        path = os.path.join(ecg_dir, fname)

        try:
            sig, meta = wfdb.rdsamp(path)
        except Exception as e:
            qc_list.append({"Filename": fname, "disease": disease, "qc_pass": False, "fail_reason": str(e)})
            continue

        meta_dict = meta if isinstance(meta, dict) else meta.__dict__
        sig = np.asarray(sig)

        qc = compute_qc(sig, meta_dict, float(row["pSQI_mean"]), float(row["bSQI_mean"]))
        qc["Filename"] = fname
        qc["disease"] = disease
        qc["original_channels"] = sig.shape[1]

        if not qc["qc_pass"]:
            qc_list.append(qc)
            continue

        sig_proc, fs = preprocess_record(sig, meta_dict)
        windows = window_record(sig_proc, fs, target_samples=target_samples)

        for window in windows:
            if window.shape[1] < target_channels:
                pad_channels = target_channels - window.shape[1]
                window = np.pad(window, ((0, 0), (0, pad_channels)), mode='constant', constant_values=0)
            elif window.shape[1] > target_channels:
                window = window[:, :target_channels]

            X_list.append(window)
            label = np.zeros(len(DISEASE_CLASSES), dtype=np.int32)
            label[CLASS_IDX[disease]] = 1
            y_list.append(label)
            window_count += 1

            if len(X_list) >= 10000:
                print(f"    Batch checkpoint at {window_count} windows...")
                if not os.path.exists(output_file):
                    with h5py.File(output_file, 'w') as h5f:
                        h5f.create_dataset('X', data=np.array(X_list, dtype=np.float32),
                                         maxshape=(None, target_samples, target_channels), dtype=np.float32)
                        h5f.create_dataset('y', data=np.array(y_list, dtype=np.int32),
                                         maxshape=(None, len(DISEASE_CLASSES)), dtype=np.int32)
                else:
                    with h5py.File(output_file, 'a') as h5f:
                        h5f['X'].resize((h5f['X'].shape[0] + len(X_list)), axis=0)
                        h5f['X'][-len(X_list):] = np.array(X_list, dtype=np.float32)
                        h5f['y'].resize((h5f['y'].shape[0] + len(y_list)), axis=0)
                        h5f['y'][-len(y_list):] = np.array(y_list, dtype=np.int32)
                X_list = []
                y_list = []

        qc["n_windows"] = len(windows)
        qc_list.append(qc)

    # Save remaining
    if X_list:
        print(f"  Saving final batch ({len(X_list)} windows)...")
        if not os.path.exists(output_file):
            with h5py.File(output_file, 'w') as h5f:
                h5f.create_dataset('X', data=np.array(X_list, dtype=np.float32), dtype=np.float32)
                h5f.create_dataset('y', data=np.array(y_list, dtype=np.int32), dtype=np.int32)
        else:
            with h5py.File(output_file, 'a') as h5f:
                h5f['X'].resize((h5f['X'].shape[0] + len(X_list)), axis=0)
                h5f['X'][-len(X_list):] = np.array(X_list, dtype=np.float32)
                h5f['y'].resize((h5f['y'].shape[0] + len(y_list)), axis=0)
                h5f['y'][-len(y_list):] = np.array(y_list, dtype=np.int32)

    print(f"\nDone! Total windows: {window_count}")
    print(f"Saved to: {output_file}")
    return output_file, pd.DataFrame(qc_list)


# Run window generation
RAW_H5_FILE = os.path.join(OUT_DIR, "ecg_windows_raw.h5")
print("Processing ECG records...")
raw_h5_file, df_qc = process_windows_to_hdf5(df_attr, ECG_DIR, max_records=None, output_file=RAW_H5_FILE)

# Quick verification
with h5py.File(raw_h5_file, 'r') as h5f:
    print(f"\nRaw dataset: X={h5f['X'].shape}, y={h5f['y'].shape}")
    for i, cls in enumerate(DISEASE_CLASSES):
        count = h5f['y'][:, i].sum()
        print(f"  {cls}: {count}")

Processing ECG records...
Step 1: Processing and saving windows to HDF5...
  Saving final batch (40 windows)...

Done! Total windows: 40
Saved to: /content/ECG-database/artifacts/multilabel_v4/ecg_windows_raw.h5

Raw dataset: X=(40, 5000, 12), y=(40, 5)
  Myocarditis: 0
  Cardiomyopathy: 0
  Kawasaki: 0
  CHD: 15
  Healthy: 25


In [19]:
def apply_smote_and_split(raw_h5_file, output_file=None, target_samples_per_class=None, chunk_size=2000):
    """
    Memory-efficient: single pass through raw file, distributes to splits.
    """
    if output_file is None:
        output_file = os.path.join(OUT_DIR, "ecg_data_smote.h5")
    if target_samples_per_class is None:
        target_samples_per_class = {0: 1011, 1: 1011, 2: 1011, 3: None, 4: 2000}

    print("="*60)
    print("Step 1: Loading labels and creating splits...")
    print("="*60)

    with h5py.File(raw_h5_file, 'r') as h5f:
        total_windows = h5f['X'].shape[0]
        n_timesteps, n_channels = h5f['X'].shape[1], h5f['X'].shape[2]
        y_all = h5f['y'][:]

    print(f"  Total windows: {total_windows}")

    y_class = y_all.argmax(axis=1)
    indices = np.arange(total_windows)

    print("  Creating train/val/test splits...")
    idx_temp, idx_test = train_test_split(indices, test_size=0.1, stratify=y_class, random_state=42)
    idx_train, idx_val = train_test_split(idx_temp, test_size=0.111, stratify=y_all[idx_temp].argmax(axis=1), random_state=42)

    # Create lookup: index -> split assignment
    split_assignment = np.full(total_windows, -1, dtype=np.int8)  # -1 = none
    split_assignment[idx_train] = 0  # train
    split_assignment[idx_val] = 1    # val
    split_assignment[idx_test] = 2   # test

    print(f"  Train: {len(idx_train)}, Val: {len(idx_val)}, Test: {len(idx_test)}")

    print("\n" + "="*60)
    print("Step 2: Analyzing class distribution for training set...")
    print("="*60)

    y_train = y_all[idx_train]
    keep_mask_train = np.ones(len(idx_train), dtype=bool)
    smote_classes = []

    for class_idx, cls_name in enumerate(DISEASE_CLASSES):
        target_count = target_samples_per_class.get(class_idx)
        class_mask = y_train[:, class_idx] == 1
        current_count = class_mask.sum()

        if target_count is None:
            print(f"  {cls_name}: {current_count} (keep as-is)")
        elif target_count > current_count:
            print(f"  {cls_name}: {current_count} -> {target_count} (SMOTE)")
            smote_classes.append((class_idx, cls_name, current_count, target_count))
        elif target_count < current_count:
            print(f"  {cls_name}: {current_count} -> {target_count} (undersample)")
            class_indices = np.where(class_mask)[0]
            np.random.seed(42)
            remove_indices = np.random.choice(class_indices, current_count - target_count, replace=False)
            keep_mask_train[remove_indices] = False
        else:
            print(f"  {cls_name}: {current_count} (at target)")

    # Mark which training indices to keep (after undersampling)
    idx_train_set = set(idx_train[keep_mask_train])
    idx_val_set = set(idx_val)
    idx_test_set = set(idx_test)

    n_train_kept = len(idx_train_set)
    n_synthetic = sum(tc - cc for _, _, cc, tc in smote_classes)
    n_final_train = n_train_kept + n_synthetic

    print(f"\n  Training kept: {n_train_kept}, Synthetic: {n_synthetic}, Final: {n_final_train}")

    del y_class, indices, idx_temp, keep_mask_train
    gc.collect()

    print("\n" + "="*60)
    print("Step 3: Single-pass copy to output file...")
    print("="*60)

    # Create output file
    with h5py.File(output_file, 'w') as h5f:
        h5f.create_dataset('X_train', shape=(n_final_train, n_timesteps, n_channels), dtype=np.float32)
        h5f.create_dataset('y_train', shape=(n_final_train, len(DISEASE_CLASSES)), dtype=np.int32)
        h5f.create_dataset('X_val', shape=(len(idx_val), n_timesteps, n_channels), dtype=np.float32)
        h5f.create_dataset('y_val', shape=(len(idx_val), len(DISEASE_CLASSES)), dtype=np.int32)
        h5f.create_dataset('X_test', shape=(len(idx_test), n_timesteps, n_channels), dtype=np.float32)
        h5f.create_dataset('y_test', shape=(len(idx_test), len(DISEASE_CLASSES)), dtype=np.int32)

    # Single pass through raw file
    train_ptr, val_ptr, test_ptr = 0, 0, 0

    with h5py.File(raw_h5_file, 'r') as h5_raw:
        with h5py.File(output_file, 'a') as h5_out:
            for start in range(0, total_windows, chunk_size):
                end = min(start + chunk_size, total_windows)

                # Read contiguous chunk (fast!)
                X_chunk = h5_raw['X'][start:end]
                y_chunk = h5_raw['y'][start:end]

                # Distribute to splits
                for i, global_idx in enumerate(range(start, end)):
                    if global_idx in idx_train_set:
                        h5_out['X_train'][train_ptr] = X_chunk[i]
                        h5_out['y_train'][train_ptr] = y_chunk[i]
                        train_ptr += 1
                    elif global_idx in idx_val_set:
                        h5_out['X_val'][val_ptr] = X_chunk[i]
                        h5_out['y_val'][val_ptr] = y_chunk[i]
                        val_ptr += 1
                    elif global_idx in idx_test_set:
                        h5_out['X_test'][test_ptr] = X_chunk[i]
                        h5_out['y_test'][test_ptr] = y_chunk[i]
                        test_ptr += 1

                print(f"  Progress: {end}/{total_windows} | Train: {train_ptr}, Val: {val_ptr}, Test: {test_ptr}", end='\r')
            print()

    print(f"  Done! Train: {train_ptr}, Val: {val_ptr}, Test: {test_ptr}")

    del idx_train_set, idx_val_set, idx_test_set
    gc.collect()

    print("\n" + "="*60)
    print("Step 4: Applying SMOTE for minority classes...")
    print("="*60)

    write_ptr = train_ptr  # Continue after kept samples

    if smote_classes:
        # Need to reload training data for SMOTE classes
        idx_train_sorted = np.sort(idx_train)

        with h5py.File(raw_h5_file, 'r') as h5_raw:
            for class_idx, cls_name, current_count, target_count in smote_classes:
                print(f"\n  Processing {cls_name}...")

                # Get indices for this class
                class_mask = y_train[:, class_idx] == 1
                class_train_indices = idx_train_sorted[np.sort(np.where(class_mask)[0])]

                # Load class data via contiguous reads where possible
                print(f"    Loading {len(class_train_indices)} samples...")
                X_class = h5_raw['X'][class_train_indices]  # Sorted indices = faster

                print(f"    Applying SMOTE...")
                X_flat = X_class.reshape(X_class.shape[0], -1)

                n_neighbors = min(5, current_count - 1)
                smote = SMOTE(sampling_strategy={1: target_count}, k_neighbors=n_neighbors, random_state=42)
                X_with_dummy = np.vstack([X_flat, X_flat[:1]])
                y_with_dummy = np.array([1] * current_count + [0])

                X_smote, _ = smote.fit_resample(X_with_dummy, y_with_dummy)
                X_synthetic = X_smote[current_count:-1].reshape(-1, n_timesteps, n_channels)
                n_synthetic_cls = X_synthetic.shape[0]

                del X_flat, X_class, X_with_dummy, y_with_dummy, X_smote
                gc.collect()

                print(f"    Generated {n_synthetic_cls} synthetic samples, saving...")

                y_synthetic = np.zeros((n_synthetic_cls, len(DISEASE_CLASSES)), dtype=np.int32)
                y_synthetic[:, class_idx] = 1

                with h5py.File(output_file, 'a') as h5f:
                    h5f['X_train'][write_ptr:write_ptr + n_synthetic_cls] = X_synthetic
                    h5f['y_train'][write_ptr:write_ptr + n_synthetic_cls] = y_synthetic
                    write_ptr += n_synthetic_cls

                del X_synthetic, y_synthetic
                gc.collect()

                print(f"    Done. Total written: {write_ptr}")

    print("\n" + "="*60)
    print("Step 5: Adding metadata...")
    print("="*60)

    with h5py.File(output_file, 'a') as h5f:
        h5f.attrs['smote_applied'] = True
        h5f.attrs['disease_classes'] = DISEASE_CLASSES
        h5f.attrs['train_samples'] = n_final_train
        h5f.attrs['val_samples'] = len(idx_val)
        h5f.attrs['test_samples'] = len(idx_test)

    print(f"  Saved to: {output_file}")
    print("\n" + "="*60)
    print("COMPLETE!")
    print("="*60)

    return output_file


# Run
RAW_H5_FILE = os.path.join(OUT_DIR, "ecg_data_smote_temp.h5")
FINAL_H5_FILE = os.path.join(OUT_DIR, "ecg_data_smote.h5")
target_samples = {0: None, 1: 1500, 2: 1500, 3: 2500, 4: 2500}

h5_file = apply_smote_and_split(RAW_H5_FILE, output_file=FINAL_H5_FILE,
                                 target_samples_per_class=target_samples, chunk_size=2000)

# Verification
print("\nFINAL DATASET:")
with h5py.File(h5_file, 'r') as h5f:
    print(f"  X_train: {h5f['X_train'].shape}")
    print(f"  X_val: {h5f['X_val'].shape}")
    print(f"  X_test: {h5f['X_test'].shape}")
    print("\nTraining distribution:")
    for i, cls in enumerate(DISEASE_CLASSES):
        count = h5f['y_train'][:, i].sum()
        pct = (count / h5f['y_train'].shape[0]) * 100
        print(f"  {cls}: {count} ({pct:.1f}%)")


Step 1: Loading labels and creating splits...
  Total windows: 59944
  Creating train/val/test splits...
  Train: 47960, Val: 5989, Test: 5995

Step 2: Analyzing class distribution for training set...
  Myocarditis: 1661 (keep as-is)
  Cardiomyopathy: 213 -> 1500 (SMOTE)
  Kawasaki: 251 -> 1500 (SMOTE)
  CHD: 4232 -> 2500 (undersample)
  Healthy: 41603 -> 5000 (undersample)

  Training kept: 9625, Synthetic: 2536, Final: 12161

Step 3: Single-pass copy to output file...
  Progress: 59944/59944 | Train: 9625, Val: 5989, Test: 5995
  Done! Train: 9625, Val: 5989, Test: 5995

Step 4: Applying SMOTE for minority classes...

  Processing Cardiomyopathy...
    Loading 213 samples...
    Applying SMOTE...
    Generated 1287 synthetic samples, saving...
    Done. Total written: 10912

  Processing Kawasaki...
    Loading 251 samples...
    Applying SMOTE...
    Generated 1249 synthetic samples, saving...
    Done. Total written: 12161

Step 5: Adding metadata...
  Saved to: /content/ECG-databa

## Save Results

In [26]:
# Save QC summary and metadata
df_qc[['Filename', 'disease', 'qc_pass', 'n_windows', 'original_channels']].to_csv(
    os.path.join(OUT_DIR, "qc_summary.csv"), index=False
)

with open(os.path.join(OUT_DIR, "disease_classes.json"), "w") as f:
    json.dump({
        "classes": DISEASE_CLASSES,
        "class_idx": CLASS_IDX,
        "icd_map": ICD_TO_DISEASE,
        "data_format": "hdf5",
        "hdf5_files": {
            "raw": "ecg_windows_raw.h5",
            "balanced": "ecg_data_smote.h5"
        },
        "balancing": {
            "smote_classes": ["Cardiomyopathy", "Kawasaki"],
            "undersampled_classes": ["Healthy"],
            "target_samples": {
                "Myocarditis": None,
                "Cardiomyopathy": 1011,
                "Kawasaki": 1011,
                "CHD": None,
                "Healthy": 5000
            }
        }
    }, f, indent=2)

print(f"✓ Saved to {OUT_DIR}")
print(f"  - ecg_windows_raw.h5 (raw windows)")
print(f"  - ecg_data_smote.h5 (balanced train/val/test)")
print(f"  - qc_summary.csv")
print(f"  - disease_classes.json")

✓ Saved to /content/ECG-database/artifacts/multilabel_v4
  - ecg_windows_raw.h5 (raw windows)
  - ecg_data_smote.h5 (balanced train/val/test)
  - qc_summary.csv
  - disease_classes.json


## Upload to Hugging Face (Optional)

In [4]:
UPLOAD_TO_HF = True  # Set to True to upload

if UPLOAD_TO_HF:
    from huggingface_hub import HfApi, login

    print("Logging into Hugging Face...")
    login()

    api = HfApi()
    print("Uploading to HF...")
    api.upload_folder(
        folder_path=OUT_DIR,
        repo_id="Neural-Network-Project/ECG-database",
        repo_type="dataset",
        path_in_repo="multilabel_v5",
        commit_message="Multi-label preprocessed data with smote and undersampling "
    )
    print("✓ Uploaded to HF")
else:
    print("To upload: set UPLOAD_TO_HF=True and have HF write token")

Logging into Hugging Face...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading to HF...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...abel_v4/ecg_data_smote.h5:   0%|          | 12.5MB / 5.80GB            

  ...v4/ecg_data_smote_temp.h5:   0%|          |  655kB / 14.5GB            

  ...bel_v4/ecg_windows_raw.h5:  16%|#6        | 1.57MB / 9.60MB            

✓ Uploaded to HF
