In [1]:
%pip install neurokit2
import numpy as np
import neurokit2 as nk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# --- Drop-in: record loader (patient-wise split) ---
import os, re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm

def _index_signal_files(root):
    """Map filename stem -> full path, recursively (.csv/.CSV/.txt/.TXT)."""
    root = Path(root)
    base_to_path = {}
    for pat in ("*.csv","*.CSV","*.txt","*.TXT"):
        for p in root.rglob(pat):
            base_to_path[p.stem] = str(p)
    return base_to_path

def _read_signal_csv_any(path):
    """Return (T,) or (L,T) numeric array; transpose to (L,T) when needed."""
    try:
        df = pd.read_csv(path)
        num = df.select_dtypes(include=[np.number])
        if num.empty:
            raise ValueError
        arr = num.to_numpy()
    except Exception:
        df = pd.read_csv(path, header=None)
        num = df.select_dtypes(include=[np.number])
        arr = num.to_numpy()
    if arr.ndim == 1:
        return arr.astype(np.float32)
    T, C = arr.shape
    if T >= C:   # rows are time -> transpose to (L,T)
        arr = arr.T
    return arr.astype(np.float32)

def load_nguyen_dataset(label_csv, signal_dir, fs=1000, test_size=0.2, random_state=1337,
                        id_col="Id", label_col="Hyperglycemia"):
    """
    Load labelled ECG records and split them patient-wise into train/validation.

    Parameters
    - label_csv: CSV with one row per record; must contain `id_col` and `label_col`.
    - signal_dir: directory searched recursively for signal files whose filename
      stem equals the record Id.
    - fs: accepted for interface compatibility; not used by this function.
    - test_size, random_state: forwarded to GroupShuffleSplit.
    - id_col, label_col: column names in `label_csv`.

    Returns: records_train, labels_train, records_val, labels_val
    - records_* are lists of arrays as returned by `_read_signal_csv_any`;
      labels_* are lists of ints (0/1).
    - Patient-wise split using the prefix before '_' or '-' in Id
      (e.g., '003' from '003_1'), so one patient never appears in both sets.
    - A label counts as positive when its string form is one of
      "1", "1.0", "true", "yes" (case-insensitive); anything else is 0.

    Raises
    - KeyError if `id_col` or `label_col` is missing from the label CSV.
    - FileNotFoundError if no label row has a matching signal file.
    """
    import warnings  # local import so this block does not depend on other cells

    labels = pd.read_csv(label_csv)
    missing_cols = [c for c in (id_col, label_col) if c not in labels.columns]
    if missing_cols:
        raise KeyError(f"Column(s) {missing_cols} not found in '{label_csv}'.")

    labels[id_col] = labels[id_col].astype(str)
    labels["patient"] = labels[id_col].apply(lambda s: re.split(r"[_\-]", s)[0])

    idx = _index_signal_files(signal_dir)
    labels["path"] = labels[id_col].map(idx.get)
    n_listed = len(labels)
    labels = labels.loc[labels["path"].notna()].reset_index(drop=True)
    if len(labels) == 0:
        raise FileNotFoundError(
            f"No signal files in '{signal_dir}' matched Id stems from '{label_csv}'. "
            "Expected files like '003_1.csv'."
        )
    if len(labels) < n_listed:
        warnings.warn(
            f"{n_listed - len(labels)} of {n_listed} label rows have no matching "
            f"signal file in '{signal_dir}' and were dropped."
        )

    # "1.0" covers label columns that pandas parsed as float (e.g. because of NaNs).
    positive_tokens = ["1", "1.0", "true", "yes"]
    y = (labels[label_col].astype(str).str.strip().str.lower()
         .isin(positive_tokens).astype(int).to_numpy())
    groups = labels["patient"].to_numpy()
    paths  = labels["path"].tolist()

    print("Reading ECG files ...")
    ecgs = [_read_signal_csv_any(p) for p in tqdm(paths)]

    # patient-wise split; try to ensure both classes appear in both sets
    idxs = np.arange(len(ecgs))
    ok = False
    for k in range(10):
        gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state+k)
        tr_idx, va_idx = next(gss.split(idxs, groups=groups))
        both_nonempty = len(tr_idx) > 0 and len(va_idx) > 0
        if both_nonempty and len(np.unique(y[tr_idx])) >= 2 and len(np.unique(y[va_idx])) >= 2:
            ok = True
            break
    if not ok:
        # No seed produced both classes on both sides; fall back to a fixed
        # split, but say so because downstream metrics may be undefined.
        warnings.warn(
            "Could not find a patient-wise split with both classes in train and "
            "validation after 10 attempts; using a fallback split."
        )
        gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state+99)
        tr_idx, va_idx = next(gss.split(idxs, groups=groups))

    records_train = [ecgs[i] for i in tr_idx]
    labels_train  = [int(y[i]) for i in tr_idx]
    records_val   = [ecgs[i] for i in va_idx]
    labels_val    = [int(y[i]) for i in va_idx]

    print(f"Loaded {len(records_train)} train / {len(records_val)} val "
          f"(positives: {sum(labels_train)} / {sum(labels_val)})")
    return records_train, labels_train, records_val, labels_val


In [3]:

def detect_rpeaks_neurokit(x, fs=1000):
    """
    Locate R-peaks in a single-lead ECG using NeuroKit2.

    x:  1-D numpy array holding the lead used for detection. For multi-lead
        recordings, pass one lead with clear R waves (e.g., lead II).
    fs: sampling rate, forwarded to NeuroKit2 as `sampling_rate`.
    returns: np.ndarray of R-peak sample indices (dtype int), sorted ascending.
             Empty if NeuroKit2 reports no "ECG_R_Peaks" entry.
    """
    # Clean first; peak detection then runs on the cleaned 1-D signal.
    cleaned = nk.ecg_clean(x, sampling_rate=fs, method="neurokit")
    _signals, peak_info = nk.ecg_peaks(cleaned, sampling_rate=fs)
    peak_positions = peak_info.get("ECG_R_Peaks", [])
    return np.sort(np.asarray(peak_positions, dtype=int))

def segment_beats(x, rpeaks, fs=1000, pre_ms=200, post_ms=400, discard_first_last=True):
    """
    Cut fixed-length windows around each R-peak.

    x: signal with time on the last axis, shape (T,) or (L, T)
    rpeaks: sequence of R-peak sample indices (None or empty -> no beats)
    fs: sampling rate in Hz, used to convert pre_ms/post_ms to samples
    discard_first_last: drop the first and last peak before windowing; this is
        applied only when at least two peaks are given
    returns: (beats, spans)
        beats - list of copies of x[..., start:stop], one per kept peak
        spans - list of matching (start, stop) sample indices
    Peaks whose window would extend past either end of the signal are skipped.
    """
    beats, spans = [], []
    if rpeaks is None or len(rpeaks) == 0:
        return beats, spans

    trim_ends = discard_first_last and len(rpeaks) >= 2
    peaks = rpeaks[1:-1] if trim_ends else rpeaks

    n_before = int(round(fs * pre_ms  / 1000.0))
    n_after  = int(round(fs * post_ms / 1000.0))
    n_samples = x.shape[-1]

    for peak in peaks:
        start, stop = peak - n_before, peak + n_after
        if start >= 0 and stop <= n_samples:
            # Ellipsis indexing slices the time axis for both (T,) and (L,T).
            beats.append(x[..., start:stop].copy())
            spans.append((start, stop))
    return beats, spans


In [4]:
from torch.utils.data import Dataset
import torch

class ECGBeatsImageDataset(Dataset):
    """
    Beat-level dataset: every item is one beat window cut from a record.

    Construction detects R-peaks in each record, cuts a window around each
    peak, and stores the raw beat segments. The image conversion is deferred
    to `__getitem__`.

    - records: list of np.ndarrays, each (T,) or (L,T)
    - labels:  list of ints (0/1), one per record; every beat inherits the
               label of the record it was cut from
    - fs: sampling rate passed to peak detection, segmentation and the image step
    - detect_lead: lead index used for R detection on multi-lead records
                   (clamped to the last available lead)
    - pre_ms / post_ms: window around each R-peak, forwarded to `segment_beats`
    - max_beats_per_record: cap to avoid over-representing long records
                            (None disables the cap)
    - discard_first_last: forwarded to `segment_beats`
    - leads_idx: forwarded to `ecg_to_logspec` when an item is fetched

    Records that yield no beats contribute no items.
    """
    def __init__(self, records, labels, fs=1000, detect_lead=0,
                 pre_ms=200, post_ms=400, max_beats_per_record=20,
                 discard_first_last=True, leads_idx=(0,1,2)):
        assert len(records) == len(labels)
        self.fs = fs
        self.labels = labels
        self.items = []      # (record index, beat segment as np.ndarray)
        self.rec_index = []  # record index of each item, parallel to self.items
        self.leads_idx = leads_idx

        for rec_no, record in enumerate(records):
            signal = np.asarray(record)

            # Pick the single lead that R-peak detection runs on.
            if signal.ndim == 1:
                reference = signal
            else:
                n_leads = signal.shape[0]
                reference = signal[min(detect_lead, n_leads - 1)]

            peaks = detect_rpeaks_neurokit(reference, fs=fs)
            segments, _ = segment_beats(signal, peaks, fs=fs, pre_ms=pre_ms, post_ms=post_ms,
                                        discard_first_last=discard_first_last)
            if not segments:
                continue

            # Over the cap: keep an evenly spaced subset across the record.
            over_cap = max_beats_per_record is not None and len(segments) > max_beats_per_record
            if over_cap:
                keep = np.linspace(0, len(segments)-1, max_beats_per_record, dtype=int)
                segments = [segments[k] for k in keep]

            self.items.extend((rec_no, segment) for segment in segments)
            self.rec_index.extend([rec_no] * len(segments))

        self.y_per_record = np.asarray(labels, dtype=int)

    def __len__(self):
        """Number of beat items across all records."""
        return len(self.items)

    def __getitem__(self, i):
        """Return (image, label, record index) for beat item `i`."""
        rec_no, segment = self.items[i]
        # NOTE(review): `ecg_to_logspec` is not defined in the cells shown here;
        # it must exist in the kernel. Presumably it returns a (3,H,W) tensor - confirm.
        image = ecg_to_logspec(segment, fs=self.fs, leads_idx=self.leads_idx)
        label = int(self.y_per_record[rec_no])  # record label -> beat label
        return image, label, rec_no  # record index enables per-record aggregation


In [5]:
# --- Model builder (ResNet-18, ImageNet-pretrained) ---
import torch, torch.nn as nn
import torchvision as tv

# Compute device used for the model and the loss weights: "cuda" when PyTorch
# reports an available GPU, otherwise "cpu".
# Safe to redefine; keeps your notebook self-contained
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def make_model(num_classes: int = 2, freeze_backbone: bool = True) -> nn.Module:
    """Build an ImageNet-pretrained ResNet-18 with a fresh classification head.

    Args:
        num_classes: output size of the new final linear layer.
        freeze_backbone: if True, every pretrained parameter gets
            ``requires_grad = False``. The replacement head is created
            afterwards, so it stays trainable.

    Returns:
        The model (not moved to any device).
    """
    # Detect the weights API explicitly instead of catching every exception:
    # a failed download or I/O error should surface as itself rather than
    # being retried through the deprecated `pretrained=True` path.
    weights_enum = getattr(tv.models, "ResNet18_Weights", None)
    if weights_enum is not None:
        # Newer torchvision (>=0.13)
        model = tv.models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        # Fallback for older torchvision
        model = tv.models.resnet18(pretrained=True)

    if freeze_backbone:
        for p in model.parameters():
            p.requires_grad = False

    # Replace the head; a freshly constructed layer has requires_grad=True.
    in_feats = model.fc.in_features
    model.fc = nn.Linear(in_feats, num_classes)
    return model


In [6]:
import torch, torch.nn as nn
from collections import defaultdict
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, classification_report

def train_one_epoch_beats(model, loader, opt, criterion, device="cuda"):
    """Run one training pass over `loader` and report beat-level metrics.

    Each batch is expected to unpack as (inputs, targets, record_idx); the
    record index is ignored here.

    Returns:
        (mean_loss, accuracy, macro_f1), where mean_loss is the batch loss
        weighted by batch size and divided by the number of samples seen.
    """
    model.train()
    n_seen = 0
    weighted_loss = 0.0
    targets_all, preds_all = [], []

    for inputs, targets, _ in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        opt.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        opt.step()

        batch_size = targets.size(0)
        weighted_loss += batch_loss.item() * batch_size
        n_seen += batch_size
        targets_all.extend(targets.cpu().tolist())
        preds_all.extend(outputs.argmax(dim=1).cpu().tolist())

    accuracy = accuracy_score(targets_all, preds_all)
    macro_f1 = f1_score(targets_all, preds_all, average='macro')
    return weighted_loss/n_seen, accuracy, macro_f1

@torch.no_grad()
def evaluate_by_record(model, loader, criterion, device="cuda", class_names=("normal","hyper")):
    """Evaluate on beats, then aggregate predictions per record.

    Each batch is expected to unpack as (inputs, targets, record_idx). For
    every record the positive-class probabilities of its beats are averaged;
    a record is predicted positive when that mean is >= 0.5.

    Prints a record-level classification report.

    Returns:
        (mean_beat_loss, record_accuracy, record_macro_f1, record_auroc, record_auprc)
        AUROC and AUPRC are NaN unless exactly two classes occur among the
        true record labels.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    model.eval()
    total, loss_sum = 0, 0.0
    rec_probs = defaultdict(list)   # record idx -> positive-class probs of its beats
    rec_true  = {}                  # record idx -> true label

    for x, y, rec_idx in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()  # positive class prob

        loss_sum += float(loss) * y.size(0)
        total += y.size(0)

        rec_idx = rec_idx.cpu().numpy()
        y_np = y.cpu().numpy()
        for r, p, t in zip(rec_idx, probs, y_np):
            rec_probs[int(r)].append(float(p))
            rec_true[int(r)] = int(t)

    if total == 0:
        # Fail with a clear message instead of a division by zero further down.
        raise ValueError("evaluate_by_record received an empty loader; nothing to evaluate.")

    # Aggregate per record (mean prob)
    rec_ids = sorted(rec_true.keys())
    y_true_rec = np.array([rec_true[r] for r in rec_ids])
    y_score_rec = np.array([np.mean(rec_probs[r]) for r in rec_ids])
    y_hat_rec = (y_score_rec >= 0.5).astype(int)

    acc  = accuracy_score(y_true_rec, y_hat_rec)
    f1m  = f1_score(y_true_rec, y_hat_rec, average='macro')
    two_classes = len(set(y_true_rec)) == 2
    auroc = roc_auc_score(y_true_rec, y_score_rec) if two_classes else float('nan')
    auprc = average_precision_score(y_true_rec, y_score_rec) if two_classes else float('nan')

    print("\nRecord-level classification report:")
    # Pin the label set so the report still works when only one class shows up
    # in the truth/predictions (otherwise target_names has the wrong length).
    report_labels = list(range(len(class_names)))
    print(classification_report(y_true_rec, y_hat_rec, labels=report_labels,
                                target_names=class_names, digits=3, zero_division=0))
    return (loss_sum/total), acc, f1m, auroc, auprc


In [None]:
# Point these at your folder
import os
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader
import numpy as np
DATA_DIR   = r"C:\Users\sarat\Desktop\ECG\x_nguyen"
LABEL_CSV  = rf"{DATA_DIR}\ecg-ForNguyenSimulation-with-label.csv"
SIGNAL_DIR = DATA_DIR

# Seed torch so head initialisation and batch shuffling are repeatable.
torch.manual_seed(1337)

# DataLoader workers are started with "spawn" on Windows and cannot import a
# Dataset class that was defined inside the notebook, so load in-process there.
NUM_WORKERS = 0 if os.name == "nt" else 2
PIN_MEMORY  = (DEVICE == "cuda")   # pinning only helps host-to-GPU copies

# 1) Load records (patient-wise)
records_train, labels_train, records_val, labels_val = load_nguyen_dataset(
    LABEL_CSV, SIGNAL_DIR, fs=1000, test_size=0.2
)

# 2) Build beat-image datasets (R-peaks via NeuroKit2; discard first/last)
#    Training caps beats per record; validation keeps every beat.
beat_train = ECGBeatsImageDataset(records_train, labels_train, fs=1000,
                                  detect_lead=0, pre_ms=200, post_ms=400,
                                  max_beats_per_record=20,
                                  discard_first_last=True, leads_idx=(0,1,2))
beat_val   = ECGBeatsImageDataset(records_val,   labels_val,   fs=1000,
                                  detect_lead=0, pre_ms=200, post_ms=400,
                                  max_beats_per_record=None,
                                  discard_first_last=True, leads_idx=(0,1,2))

tr_loader = DataLoader(beat_train, batch_size=64, sampler=None, shuffle=True,
                       num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
va_loader = DataLoader(beat_val,   batch_size=128, shuffle=False,
                       num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)


# 3) Phase 1: train only the new head on top of the frozen backbone
model = make_model(num_classes=2, freeze_backbone=True).to(DEVICE)

# class-weighted loss from record labels
classes = np.array([0,1])
cw = compute_class_weight("balanced", classes=classes, y=np.array(labels_train))
criterion = nn.CrossEntropyLoss(weight=torch.tensor(cw, dtype=torch.float32, device=DEVICE))

opt = torch.optim.AdamW(model.fc.parameters(), lr=1e-3, weight_decay=1e-4)
for ep in range(1, 5+1):
    tr_loss, tr_acc, tr_f1 = train_one_epoch_beats(model, tr_loader, opt, criterion, device=DEVICE)
    # evaluate_by_record returns exactly five values (loss, acc, f1, auroc, auprc)
    va_loss, va_acc, va_f1, va_auc, va_aupr = evaluate_by_record(model, va_loader, criterion, device=DEVICE)
    print(f"[Head] Ep{ep:02d} | tr_acc {tr_acc:.3f} f1 {tr_f1:.3f} | val_acc {va_acc:.3f} f1 {va_f1:.3f} auroc {va_auc:.3f} auprc {va_aupr:.3f}")


# 4) Phase 2: unfreeze last ResNet block + head with discriminative LRs
for n, p in model.named_parameters():
    if n.startswith("layer4"):
        p.requires_grad = True
opt = torch.optim.AdamW([
    {"params": model.fc.parameters(), "lr": 1e-3},
    {"params": [p for n,p in model.named_parameters() if n.startswith("layer4") and p.requires_grad], "lr": 1e-4},
], weight_decay=1e-4)

for ep in range(1, 10+1):
    tr_loss, tr_acc, tr_f1 = train_one_epoch_beats(model, tr_loader, opt, criterion, device=DEVICE)
    va_loss, va_acc, va_f1, va_auc, va_aupr = evaluate_by_record(model, va_loader, criterion, device=DEVICE)
    print(f"[FT]   Ep{ep:02d} | tr_acc {tr_acc:.3f} f1 {tr_f1:.3f} | "
          f"val_rec_acc {va_acc:.3f} f1 {va_f1:.3f} auroc {va_auc:.3f} auprc {va_aupr:.3f}")


Reading ECG files ...


100%|█████████████████████████████████████████████████████████████████████████████| 1670/1670 [00:08<00:00, 185.71it/s]


Loaded 1335 train / 335 val (positives: 671 / 150)


