# 01 — Feature Baseline (ECG-only)

This notebook:
1. Loads the beat-level table produced in `00_preview_data`.
2. Builds ECG-only features per beat (HR/HRV + simple morphology around R).
3. Trains Ridge/XGBoost with **GroupKFold** by `subject_id`.
4. Reports MAE/RMSE/Pearson r and makes Bland–Altman plots.

In [None]:
# Optional installs — uncomment when running locally (`%pip` installs into the active kernel's environment)
# %pip install pandas numpy scipy pyarrow scikit-learn xgboost matplotlib neurokit2

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Data locations, relative to the directory the notebook is run from.
DATA_INTERIM = Path('data/interim')
DATA_PROCESSED = Path('data/processed')

# Create the output directory (and any missing parents); no error if it is already there.
DATA_PROCESSED.mkdir(exist_ok=True, parents=True)

# Beat-level table produced by the `00_preview_data` notebook.
beat_table_path = DATA_INTERIM.joinpath('beat_table.parquet')
beat_df = pd.read_parquet(beat_table_path)
beat_df.head()

## 1) Simple ECG-only features
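Here we derive a few simple per-beat features from R-peak timings: the RR interval, the instantaneous heart rate, and short rolling heart-rate statistics as a proxy for HRV. For a richer feature set, you would extract morphology features from ECG windows around each R peak.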

In [None]:
# For demo: RR-interval-derived heart rate plus short rolling statistics.
# Rows are ordered by subject and beat index so that diff()/rolling() below
# always operate on consecutive beats of the same subject.
beat_df = beat_df.sort_values(['subject_id', 'beat_idx']).reset_index(drop=True)

# RR interval = difference between consecutive r_time values within a subject.
# The first beat of each subject has no predecessor, so it borrows the next RR
# of the *same* subject. The back-fill is done per group: an ungrouped
# back-fill would copy an RR from the following subject whenever a subject has
# only one beat. (fillna(method=...) is also deprecated in pandas >= 2.1.)
beat_df['RR'] = beat_df.groupby('subject_id')['r_time'].diff()
beat_df['RR'] = beat_df.groupby('subject_id')['RR'].bfill()

# Beats that still have no RR (e.g. a subject with a single beat) cannot yield
# a heart rate; drop them rather than passing NaN features to the model.
beat_df = beat_df.dropna(subset=['RR']).reset_index(drop=True)

# Instantaneous heart rate. Assumes r_time is in seconds, so HR is in beats
# per minute — TODO confirm. RR is floored at 1e-3 to avoid division by zero.
beat_df['HR'] = 60.0 / beat_df['RR'].clip(lower=1e-3)

# Rolling stats over the current and up to four preceding beats (proxy HRV).
# The grouped rolling result is indexed by (subject_id, row index); dropping
# the subject level restores alignment with beat_df's own index.
hr_roll = beat_df.groupby('subject_id')['HR'].rolling(5, min_periods=1)
beat_df['HR_mean_5'] = hr_roll.mean().reset_index(level=0, drop=True)
# The std of a single observation is NaN; treat it as zero variability.
beat_df['HR_std_5'] = hr_roll.std().reset_index(level=0, drop=True).fillna(0.0)

# Design matrix, targets, and the grouping key used for subject-wise CV.
features = ['HR', 'HR_mean_5', 'HR_std_5']
X = beat_df[features].to_numpy()
y_sbp = beat_df['SBP'].to_numpy()
y_dbp = beat_df['DBP'].to_numpy()
groups = beat_df['subject_id'].to_numpy()

## 2) Subject-wise CV (GroupKFold) + Ridge

In [None]:
# Subject-wise cross-validation: the split is grouped by subject_id, so a
# subject's beats never appear in both the training and validation indices.
gkf = GroupKFold(n_splits=5)

metrics = []
for target, y in [('SBP', y_sbp), ('DBP', y_dbp)]:
    # Out-of-fold predictions: every beat is predicted by the model of the
    # fold in which its subject was held out.
    preds = np.zeros_like(y, dtype=float)
    for tr, va in gkf.split(X, y, groups):
        m = Ridge(alpha=1.0)
        m.fit(X[tr], y[tr])
        preds[va] = m.predict(X[va])
    mae = mean_absolute_error(y, preds)
    # RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y, preds)))
    # Index the result instead of unpacking into `_`, which would shadow
    # IPython's last-output variable.
    r = pearsonr(y, preds)[0]
    metrics.append((target, mae, rmse, float(r)))

# One row per target; displayed as the cell output.
pd.DataFrame(metrics, columns=['Target','MAE','RMSE','Pearson r'])

## 3) Bland–Altman plots

In [None]:
def bland_altman(y_true, y_pred):
    """Draw a Bland–Altman plot and return the bias and SD of the differences.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted values. Both are converted to float arrays
        and flattened to 1-D, so lists and column vectors are accepted.

    Returns
    -------
    (md, sd) : tuple of float
        Mean of ``y_pred - y_true`` and its sample standard deviation
        (``ddof=1``). The plotted limits of agreement are ``md ± 1.96 * sd``.
    """
    # Coerce up front: with plain lists `+` would concatenate, and an (n, 1)
    # column against an (n,) vector would silently broadcast to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    pair_mean = (y_true + y_pred) / 2.0
    diff = y_pred - y_true
    md = float(np.mean(diff))
    sd = float(np.std(diff, ddof=1))
    loa_lo = md - 1.96 * sd
    loa_hi = md + 1.96 * sd

    fig, ax = plt.subplots()
    ax.scatter(pair_mean, diff, s=5)
    # Style the bias line differently from the limits of agreement so the
    # three horizontal lines can be told apart.
    ax.axhline(md, color='k', linestyle='-', label=f'Bias = {md:.2f}')
    ax.axhline(loa_lo, color='r', linestyle='--', label=f'-1.96 SD = {loa_lo:.2f}')
    ax.axhline(loa_hi, color='r', linestyle='--', label=f'+1.96 SD = {loa_hi:.2f}')
    ax.set_title('Bland–Altman')
    ax.set_xlabel('Mean of true & pred (mmHg)')
    ax.set_ylabel('Pred - True (mmHg)')
    ax.legend(loc='best')
    plt.show()
    return md, sd

# Example for SBP. The `preds` array left over from the CV cell belongs to the
# last target of that loop (DBP), so the out-of-fold SBP predictions are
# recomputed here with the same splitter and model settings.
preds = np.zeros_like(y_sbp, dtype=float)
for tr, va in gkf.split(X, y_sbp, groups):
    m = Ridge(alpha=1.0)
    m.fit(X[tr], y_sbp[tr])
    preds[va] = m.predict(X[va])

# Keep the returned bias / SD under explicit names; assigning to `_` would
# shadow IPython's last-output variable.
sbp_bias, sbp_sd = bland_altman(y_sbp, preds)