# In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (LEAK-FREE & PRODUCTION-READY)
# Audio (HeAR + LogReg) + Metadata (LightGBM)
# ============================================================================

# ── CELL 1: Imports & Seeds ───────────────────────────────────────────────────
import os, sys, json, warnings, random, hashlib, zipfile, shutil
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# Seed the Python and NumPy global RNGs so repeated runs are comparable.
SEED = 42
random.seed(SEED); np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
# Log the library versions this run was produced with.
print(f"Python    : {sys.version}")
print(f"sklearn   : {sklearn.__version__}")
print(f"librosa   : {librosa.__version__}")
print(f"numpy     : {np.__version__}")
print(f"pandas    : {pd.__version__}")

# LightGBM is optional: HAS_LGB records whether it imported, and
# build_meta_clf falls back to GradientBoostingClassifier when it did not.
try:
    import lightgbm as lgb; HAS_LGB = True
    print(f"lightgbm  : {lgb.__version__}")
except ImportError:
    HAS_LGB = False
    print("lightgbm  : NOT FOUND — using GradientBoostingClassifier")

# TensorFlow is only needed to call the HeAR serving signature; when it is
# missing the name `tf` stays undefined.
try:
    import tensorflow as tf
    tf.random.set_seed(SEED)
    print(f"tensorflow: {tf.__version__}")
except ImportError:
    print("tensorflow: NOT FOUND")

from sklearn.model_selection import StratifiedGroupKFold

# ── CELL 2: Configuration ─────────────────────────────────────────────────────
# Dataset layout on the Kaggle input mount.
BASE   = "/kaggle/input/tb-audio/Tuberculosis"
META   = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"   # walked recursively by resolve_audio_paths

# Metadata files
CLINICAL_CSV   = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV  = f"{META}/CODA_TB_Solicited_Meta_Info.csv"   # cough-level manifest

# ---- Audio ----
SR          = 16_000                 # sample rate passed to librosa.load
WIN_SECS    = 2.0                    # length of the clip handed to the embedding model
WIN_SAMPLES = int(SR * WIN_SECS)
ENERGY_THRESH_S = 2.2       # clips no longer than this (seconds) skip the energy search in select_best_window

# ---- Training ----
N_SPLITS     = 5             # Replacing predefined folds with 5 dynamic folds
TARGET_SENS  = [0.85, 0.90, 0.95]   # sensitivity targets used by full_eval for threshold tuning
MISS_AUG_P   = 0.20          # per-cell probability used by MissingnessAugmenter
CALIBRATE    = True          # calibrate on the inner validation split when it holds both classes
LGB_N_ITER   = 500
LGB_LR       = 0.05
INNER_VAL_FRAC = 0.20        # inner_val_split turns this into round(1/frac) group folds

# ---- Output ----
OUT_ROOT  = "/kaggle/working/outputs"
AUDIO_OUT = os.path.join(OUT_ROOT, "audio_model")
META_OUT  = os.path.join(OUT_ROOT, "metadata_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create every output directory up front (idempotent).
for d in [AUDIO_OUT, META_OUT, CACHE_DIR,
          f"{AUDIO_OUT}/plots", f"{META_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is mixed into every cache key by _path_key, so changing it
# makes previously cached embeddings unreachable.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_embeddings.parquet")
print(f"Output root: {OUT_ROOT}")

# ── CELL 3: Build the master cough-level manifest ─────────────────────────────
def harmonise_fold_df(df):
    """Rename the id, filename and label columns of ``df`` to canonical names.

    Column names are matched case-insensitively against a prioritised list
    of aliases. For each canonical name the first alias present wins and
    only that one column is renamed. Returns the renamed frame produced by
    ``DataFrame.rename``; ``df`` itself is not modified.
    """
    by_lower = {col.lower(): col for col in df.columns}
    aliases = (
        ("participant_id", ("participant_id", "participant", "subject_id")),
        ("filename", ("filename", "file_name", "audio_file", "wav_file", "cough_file")),
        ("label_raw", ("tb_status", "tb", "label", "target", "tb_result", "gold_standard")),
    )
    mapping = {}
    for canonical, candidates in aliases:
        hit = next((cand for cand in candidates if cand in by_lower), None)
        if hit is not None:
            mapping[by_lower[hit]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map a column of raw TB labels to 1, 0 or NaN.

    Recognised positive / negative spellings (case- and
    whitespace-insensitive) map to 1 / 0. Numeric strings are accepted only
    when they are exactly 0 or 1 (e.g. ``"1.0"``). Anything else — missing
    values, free text, other numeric codes such as ``"2"`` or ``"0.7"`` —
    becomes NaN so the caller can drop the row instead of training on a
    non-binary or silently truncated label.
    """
    positives = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negatives = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positives:
            return 1
        if s in negatives:
            return 0
        try:
            num = float(s)
        except ValueError:
            return np.nan
        # Only exact 0/1 are valid binary codes; nan/inf/other codes are unknown.
        return int(num) if num in (0.0, 1.0) else np.nan

    return series.apply(_b)

def resolve_audio_paths(filenames, audio_dir=AUDIO_BASE):
    """Resolve manifest filenames to audio files found under ``audio_dir``.

    The directory tree is walked once and every .wav/.ogg/.flac/.mp3 file is
    indexed both by its basename and by its basename without extension. When
    two files share a name, the one visited last by ``os.walk`` wins.

    Returns ``(paths, lookup)``: ``paths`` is ``filenames`` mapped to a full
    path (NaN when nothing matches), ``lookup`` is the name -> path index.
    """
    audio_exts = (".wav", ".ogg", ".flac", ".mp3")
    lookup = {}
    for folder, _subdirs, names in os.walk(audio_dir):
        for name in names:
            if not name.lower().endswith(audio_exts):
                continue
            path = os.path.join(folder, name)
            lookup[name] = path
            lookup[os.path.splitext(name)[0]] = path

    def _resolve(fn):
        if pd.isna(fn):
            return np.nan
        text = str(fn)
        # Try the exact name first, then the name without its extension.
        for candidate in (text, os.path.splitext(text)[0]):
            if candidate in lookup:
                return lookup[candidate]
        # Last resort: the manifest entry is itself a usable path.
        return text if os.path.isfile(text) else np.nan

    return filenames.apply(_resolve), lookup

# Read the cough-level manifest and normalise its id / filename / label
# column names.
print("Loading raw solicited data manifest …")
raw_audio_df = pd.read_csv(SOLICITED_CSV)
raw_audio_df = harmonise_fold_df(raw_audio_df)

# If label isn't in audio manifest, we will merge it from clinical
# (this branch only reports the situation; the merge happens after the
# clinical table is loaded).
if "label_raw" not in raw_audio_df.columns:
    print("Label not in audio manifest, will extract from clinical data.")

# ── CELL 4: Join clinical metadata ────────────────────────────────────────────
# Substrings that mark a clinical column as post-diagnostic / label-derived
# or as bookkeeping. get_meta_cols drops any column whose lower-cased name
# contains one of these.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","drug_",
                 "microscopy","molecular","confirmatory","reference_test",
                 "gold_standard","diagnosis","tb_status","tb_result",
                 "label","label_raw","_fold","_split","filename","audio_path"]

print("\nLoading clinical metadata …")
clinical_df = pd.read_csv(CLINICAL_CSV)
clinical_df = harmonise_fold_df(clinical_df)

# Ensure raw_audio_df gets labels if missing
# NOTE(review): this left join assumes one clinical row per participant;
# duplicate participant rows would duplicate cough rows — confirm.
if "label_raw" not in raw_audio_df.columns and "label_raw" in clinical_df.columns:
    raw_audio_df = raw_audio_df.merge(clinical_df[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise, drop rows whose label could not be interpreted, and cast to int.
# If neither table provided a label column the next line raises KeyError.
raw_audio_df["label"] = binarise_label(raw_audio_df["label_raw"])
raw_audio_df = raw_audio_df.dropna(subset=["label"]).reset_index(drop=True)
raw_audio_df["label"] = raw_audio_df["label"].astype(int)

def get_meta_cols(df):
    """Split the usable clinical columns of ``df`` into numeric and categorical.

    A column is skipped when it is ``participant_id`` or when its lower-cased
    name contains any POST_DIAG_KW substring (leak guard). Remaining columns
    with a NumPy integer, unsigned or float dtype of any width are numeric;
    everything else (object, bool, pandas extension dtypes, …) is
    categorical.

    Returns:
        (num_cols, cat_cols): two lists of column names in frame order.
    """
    num_cols, cat_cols = [], []
    for c in df.columns:
        # str() keeps non-string column labels from crashing the filter.
        name = str(c).lower()
        if c == "participant_id" or any(kw in name for kw in POST_DIAG_KW):
            continue
        dtype = df[c].dtype
        # dtype.kind covers int8..int64, uint* and float16..float64; the
        # original whitelist silently sent narrower widths to one-hot.
        if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
            num_cols.append(c)
        else:
            cat_cols.append(c)
    return num_cols, cat_cols

# Pick the leak-filtered clinical feature columns.
clinical_num, clinical_cat = get_meta_cols(clinical_df)

# Attach those clinical features to every cough row (left join keeps coughs
# whose participant has no clinical record; their features are NaN).
cough_df = raw_audio_df.merge(
    clinical_df[["participant_id"] + clinical_num + clinical_cat],
    on="participant_id", how="left"
)

# Map manifest filenames to files on disk and drop rows without audio.
# The reset_index here is what lets later code use .loc with positional
# fold indices.
print("\nResolving audio file paths …")
cough_df["audio_path"], audio_lookup = resolve_audio_paths(cough_df["filename"])
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── CELL 5: Sanity assertions ─────────────────────────────────────────────────
# Print cough-level class counts and prevalence. Despite the cell title,
# nothing is asserted here — the values are only reported.
print("\n── Sanity checks ──")
n_pos = cough_df["label"].sum()
n_neg = (cough_df["label"] == 0).sum()
prev = n_pos / len(cough_df)   # fraction of cough rows (not participants) labelled TB+

print(f"  ✓ Total valid cough rows : {len(cough_df)}")
print(f"  ✓ Participants         : {cough_df['participant_id'].nunique()}")
print(f"  ✓ TB+ coughs           : {n_pos} ({100*prev:.1f}%)")
print(f"  ✓ TB- coughs           : {n_neg}")

# ── CELL 6: Fold assignment (LEAK-FREE DYNAMIC SPLIT) ─────────────────────────
# Outer CV: stratify on the cough label while keeping all coughs of a
# participant in the same fold. `folds` is a list of (train_idx, test_idx)
# positional index arrays that both model loops reuse.
print("\nBuilding Custom Stratified Group K-Folds (Leak-Free) ...")
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# Per-fold size report. `.loc` with positional indices is valid here only
# because cough_df carries a fresh 0..N-1 index.
for fold_i, (tr_idx, te_idx) in enumerate(folds):
    y_te = cough_df.loc[te_idx, "label"].values
    n_tr_parts = cough_df.loc[tr_idx, "participant_id"].nunique()
    n_te_parts = cough_df.loc[te_idx, "participant_id"].nunique()
    print(f"  Fold {fold_i}: train={len(tr_idx)} rows/{n_tr_parts} subjects  "
          f"test={len(te_idx)} rows/{n_te_parts} subjects  "
          f"TB+_test={int(y_te.sum())}/{len(y_te)}")

# ── CELL 7: Audio loading & window selection ──────────────────────────────────
def compute_audio_quality(audio, sr=SR):
    """Return simple quality descriptors for a mono waveform.

    Keys of the returned dict:
        duration_s   -- length in seconds, rounded to 3 decimals.
        clip_ratio   -- fraction of samples with |x| > 0.99, 4 decimals.
        snr_proxy_db -- 20*log10(max frame RMS / min frame RMS) over
                        400-sample frames with hop 160, 2 decimals.

    ``librosa.util.frame`` is called with frame_length=400, so very short
    inputs are expected to raise there.
    """
    seconds = len(audio) / sr
    clipped_fraction = float(np.mean(np.abs(audio) > 0.99))

    frame_len, hop = 400, 160
    framed = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop)
    # The small offset keeps the ratio finite for silent frames.
    frame_rms = np.sqrt(np.mean(framed**2, axis=0)) + 1e-9
    dynamic_range_db = float(20 * np.log10(frame_rms.max() / frame_rms.min()))

    return {
        "duration_s": round(seconds, 3),
        "clip_ratio": round(clipped_fraction, 4),
        "snr_proxy_db": round(dynamic_range_db, 2),
    }

def select_best_window(audio, sr=SR):
    """Return a peak-normalised float32 clip of exactly WIN_SAMPLES samples.

    Empty input yields silence. Input no longer than ENERGY_THRESH_S seconds
    is taken from its start (zero-padded or truncated). Longer input is
    searched for the frame with the highest smoothed RMS energy and the
    window is centred there, shifted left if it would run past the end.
    """
    total = len(audio)
    if total == 0:
        return np.zeros(WIN_SAMPLES, np.float32)

    amp_max = np.max(np.abs(audio))
    if amp_max > 0:
        audio = audio / amp_max

    def _fit(clip):
        # Zero-pad on the right or truncate to exactly WIN_SAMPLES.
        short_by = WIN_SAMPLES - len(clip)
        if short_by > 0:
            clip = np.pad(clip, (0, short_by))
        return clip[:WIN_SAMPLES].astype(np.float32)

    if total / sr <= ENERGY_THRESH_S:
        return _fit(audio)

    frame_len, hop = 400, 160
    framed = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop)
    energy = np.sqrt(np.mean(framed**2, axis=0))
    # Moving average over roughly 0.2 s of frames.
    k = max(1, int(0.2 * sr / hop))
    smoothed = np.convolve(energy, np.ones(k) / k, mode="same")

    centre = int(np.argmax(smoothed)) * hop + frame_len // 2
    start = max(0, centre - WIN_SAMPLES // 2)
    if start + WIN_SAMPLES > total:
        # Window would overrun the clip: anchor it to the end instead.
        start = max(0, total - WIN_SAMPLES)
    stop = min(start + WIN_SAMPLES, total)
    return _fit(audio[start:stop])

def load_and_select(path):
    """Load ``path`` at SR (mono) and return ``(segment, quality)``.

    ``segment`` is the window chosen by select_best_window and ``quality``
    the dict from compute_audio_quality. Loading stays best-effort: on any
    failure the function returns ``(None, None)`` so one bad file cannot
    abort a run. The reason is printed to stderr so dropped clips are
    visible instead of silently turning into all-zero embeddings.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        qual = compute_audio_quality(audio)
        seg = select_best_window(audio)
    except Exception as e:
        print(f"  WARNING: could not process audio {path!r}: "
              f"{type(e).__name__}: {e}", file=sys.stderr)
        return None, None
    return seg, qual

# ── CELL 8: HeAR model + disk cache ──────────────────────────────────────────
# Load the HeAR embedding model from the Hugging Face hub, using the
# HF_TOKEN Kaggle secret. Any failure (missing packages, secret or network)
# is caught and leaves HEAR_SERVING = None, in which case _infer_batch
# returns all-zero embeddings.
# NOTE(review): the repo id loaded here is "google/hear" while the cache
# keys use HEAR_VERSION ("google/hear-v1") — confirm they refer to the same
# model.
print("\nLoading HeAR model …")
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL   = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    EMBED_DIM    = 512
    print("✓ HeAR loaded")
except Exception as e:
    print(f"  ⚠ HeAR load failed: {e}")
    HEAR_MODEL = HEAR_SERVING = None
    EMBED_DIM  = 512

def _path_key(path):
    """Return the cache key for ``path``: MD5 hex of ``"<HEAR_VERSION>::<path>"``."""
    tagged = f"{HEAR_VERSION}::{path}"
    digest = hashlib.md5(tagged.encode())
    return digest.hexdigest()

def _load_cache():
    """Load the embedding cache from EMBED_CACHE.

    Returns a DataFrame indexed by ``key`` with an ``embedding`` column. A
    missing file yields an empty cache. An unreadable file also yields an
    empty cache, but the reason is printed to stderr instead of being
    swallowed by a bare ``except`` (which also trapped KeyboardInterrupt).
    """
    empty = pd.DataFrame(columns=["key", "embedding"]).set_index("key")
    if not os.path.isfile(EMBED_CACHE):
        return empty
    try:
        df = pd.read_parquet(EMBED_CACHE)
    except Exception as e:
        print(f"  WARNING: ignoring unreadable embedding cache {EMBED_CACHE!r}: "
              f"{type(e).__name__}: {e}", file=sys.stderr)
        return empty
    return df.set_index("key") if "key" in df.columns else df

def _infer_batch(segments):
    """Embed a list of equal-length waveforms with HeAR.

    Returns a float32 array with one row per segment. When the model failed
    to load (HEAR_SERVING is None) an all-zero (len(segments), EMBED_DIM)
    placeholder is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)
    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    # The signature returns a dict; its first value holds the embeddings.
    first_output = list(outputs.values())[0]
    return first_output.numpy().astype(np.float32)

def get_embeddings(df_rows, batch_size=64, desc=""):
    """Return a (len(df_rows), EMBED_DIM) float32 embedding matrix.

    Embeddings are looked up in the parquet cache by _path_key(audio_path);
    missing ones are computed in batches of ``batch_size`` and appended to
    the cache. Rows with a missing ``audio_path`` or an unloadable file keep
    an all-zero embedding.

    Changes from the previous implementation:
      * Placeholder zeros produced while HeAR is unavailable are no longer
        written to disk, so a later run with the model loaded recomputes
        them instead of reusing zeros.
      * Each distinct audio path is embedded once. Duplicate paths used to
        create duplicate cache keys, which made ``cache.loc`` return a
        Series and broke the row assignment.
      * The parquet file is rewritten only when something new was computed.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.
        batch_size: number of segments per inference call.
        desc: label shown in the progress bar.
    """
    from tqdm.auto import tqdm

    cache = _load_cache()
    # Tolerate duplicate keys left behind by earlier runs.
    cache = cache[~cache.index.duplicated(keep="last")]
    lookup = dict(zip(cache.index, cache["embedding"]))

    paths = df_rows["audio_path"].tolist()
    keys = [_path_key(str(p)) if pd.notna(p) else None for p in paths]

    # Unique, not-yet-cached keys in first-seen order.
    todo = {}
    for k, p in zip(keys, paths):
        if k is not None and k not in lookup and k not in todo:
            todo[k] = p

    fresh = {}
    buf_segs, buf_keys = [], []

    def flush():
        if not buf_segs:
            return
        for k, e in zip(buf_keys, _infer_batch(buf_segs)):
            fresh[k] = e
        buf_segs.clear()
        buf_keys.clear()

    for k, p in tqdm(list(todo.items()), desc=f"HeAR [{desc}]", leave=False):
        seg, _ = load_and_select(p)
        if seg is not None:
            buf_segs.append(seg)
            buf_keys.append(k)
        if len(buf_segs) >= batch_size:
            flush()
    flush()

    # Persist only real model output, never the zero placeholders.
    if fresh and HEAR_SERVING is not None:
        new_rows = pd.DataFrame(
            {"embedding": [e.tolist() for e in fresh.values()]},
            index=pd.Index(list(fresh), name="key"),
        )
        cache = pd.concat([cache, new_rows])
        cache.reset_index().to_parquet(EMBED_CACHE, index=False)

    lookup.update(fresh)
    embeddings = np.zeros((len(paths), EMBED_DIM), np.float32)
    for i, k in enumerate(keys):
        vec = lookup.get(k) if k is not None else None
        if vec is not None:
            embeddings[i] = np.asarray(vec, np.float32)
    return embeddings

# ── CELL 9: Metadata preprocessing ───────────────────────────────────────────
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Feature column lists used by the metadata model: the leak-filtered
# clinical columns selected by get_meta_cols, numeric first.
META_COLS_NUM = clinical_num
META_COLS_CAT = clinical_cat
ALL_META_COLS = META_COLS_NUM + META_COLS_CAT

class MissingnessAugmenter(BaseEstimator, TransformerMixin):
    """Training-time augmentation that blanks random cells to NaN.

    ``fit_transform`` returns a copy of the input in which each cell is
    replaced by NaN with probability ``p``. The mask comes from a
    ``RandomState`` seeded with ``seed`` on every call, so equal-shaped
    inputs get the same mask. ``fit`` does nothing and ``transform`` is the
    identity, so data passed only through ``transform`` is never altered.
    """

    def __init__(self, p=MISS_AUG_P, seed=SEED):
        self.p = p
        self.seed = seed

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None, **kw):
        frame = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        drop = np.random.RandomState(self.seed).random(frame.shape) < self.p
        frame[drop] = np.nan
        return frame

    def transform(self, X):
        return X

def build_meta_preprocessor(num_cols, cat_cols):
    """Build the unfitted ColumnTransformer for the clinical features.

    Numeric columns: median imputation then standard scaling. Categorical
    columns: constant imputation with ``"__missing__"`` then dense one-hot
    encoding that ignores unseen categories. A branch is added only when its
    column list is non-empty; all other columns are dropped.
    """
    def _numeric_branch():
        return Pipeline([("imp", SimpleImputer(strategy="median")),
                         ("sc", StandardScaler())])

    def _categorical_branch():
        return Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="__missing__")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])

    branches = []
    if num_cols:
        branches.append(("num", _numeric_branch(), num_cols))
    if cat_cols:
        branches.append(("cat", _categorical_branch(), cat_cols))
    return ColumnTransformer(branches, remainder="drop")

def preprocess_meta_fold(df_train, df_val, df_test, num_cols, cat_cols):
    """Fit the metadata preprocessing on one training split and apply it.

    Steps:
      1. Augment the training features by blanking random cells
         (MissingnessAugmenter with MISS_AUG_P and SEED).
      2. Fit the imputer / scaler / one-hot pipeline on the augmented
         training features only.
      3. For every split, append one 0/1 missingness indicator per feature.

    The indicators are computed from the exact features fed to the
    preprocessor. Previously the training indicators came from the
    un-augmented frame, so cells blanked by augmentation were imputed but
    flagged as present — the opposite of what the model sees at inference
    for a genuinely missing value.

    Returns:
        (X_train, X_val, X_test, prep): three 2-D arrays laid out as
        [transformed features | indicators in ``num_cols + cat_cols``
        order] and the fitted ColumnTransformer.
    """
    all_cols = num_cols + cat_cols

    aug = MissingnessAugmenter(p=MISS_AUG_P, seed=SEED)
    X_tr_feat = aug.fit_transform(df_train[all_cols])

    prep = build_meta_preprocessor(num_cols, cat_cols)
    prep.fit(X_tr_feat)

    def transform_and_stack(feat_df):
        # Indicator and imputed value always describe the same cell state.
        indicators = feat_df.isna().to_numpy(dtype=np.float32)
        return np.hstack([prep.transform(feat_df), indicators])

    return (
        transform_and_stack(X_tr_feat),
        transform_and_stack(df_val[all_cols]),
        transform_and_stack(df_test[all_cols]),
        prep,
    )

# ── CELL 10: Model builders ───────────────────────────────────────────────────
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

def build_audio_clf():
    """Return the unfitted audio classifier.

    A pipeline with steps ``"sc"`` (StandardScaler) and ``"clf"``
    (class-balanced L-BFGS logistic regression, C=1.0, up to 5000
    iterations, seeded with SEED).
    """
    scaler = StandardScaler()
    logreg = LogisticRegression(
        C=1.0,
        class_weight="balanced",
        solver="lbfgs",
        max_iter=5000,
        random_state=SEED,
    )
    return Pipeline(steps=[("sc", scaler), ("clf", logreg)])

def build_meta_clf(n_pos, n_neg):
    """Return the unfitted metadata classifier.

    With LightGBM available this is a shallow LGBMClassifier whose
    ``scale_pos_weight`` is ``n_neg / max(n_pos, 1)``. Otherwise it is a
    scikit-learn GradientBoostingClassifier, which receives no class
    weighting from this function.
    """
    if not HAS_LGB:
        from sklearn.ensemble import GradientBoostingClassifier
        return GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=LGB_LR,
            max_depth=4,
            subsample=0.8,
            random_state=SEED,
        )

    params = dict(
        n_estimators=LGB_N_ITER,
        learning_rate=LGB_LR,
        # Small trees to limit overfitting.
        num_leaves=15,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_samples=10,
        scale_pos_weight=n_neg / max(n_pos, 1),
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**params)

def calibrate(clf, X_cal, y_cal):
    """Wrap an already-fitted ``clf`` in a sigmoid calibrator.

    The calibrator is fitted on ``(X_cal, y_cal)`` with ``cv="prefit"``, so
    ``clf`` itself is not refitted. Sigmoid is used rather than isotonic for
    stability on small calibration sets. Returns the fitted
    CalibratedClassifierCV.
    """
    calibrator = CalibratedClassifierCV(clf, cv="prefit", method="sigmoid")
    return calibrator.fit(X_cal, y_cal)

# ── CELL 11 & 12: Evaluation & Plotting ───────────────────────────────────────
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score, f1_score, confusion_matrix, brier_score_loss, roc_curve, precision_recall_curve)
from scipy.stats import spearmanr
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Plot colours: C_POS draws the single-model curves and the audio curve in
# the comparison plot; C_NEG draws the metadata curve.
C_POS, C_NEG = "#e63946", "#457b9d"

def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Return confusion-matrix metrics for the rule ``y_prob >= t``.

    The dict holds, in order: threshold, accuracy, sensitivity,
    specificity, precision, npv and f1. The four ratio metrics add 1e-9 to
    the denominator so an empty class gives 0 instead of a division error.
    """
    eps = 1e-9
    predicted = (np.array(y_prob) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, predicted, labels=[0, 1]).ravel()

    result = {
        "threshold": float(t),
        "accuracy": float(accuracy_score(y_true, predicted)),
    }
    result["sensitivity"] = tp / (tp + fn + eps)
    result["specificity"] = tn / (tn + fp + eps)
    result["precision"] = tp / (tp + fp + eps)
    result["npv"] = tn / (tn + fn + eps)
    result["f1"] = float(f1_score(y_true, predicted, zero_division=0))
    return result

def find_thresh_for_sens(y_true, y_prob, target):
    """Return the threshold with the best specificity at sensitivity >= target.

    Candidate thresholds are the distinct probabilities rounded to 4
    decimals, scanned from high to low; a sample is predicted positive when
    ``y_prob >= t``. Among thresholds that tie on specificity the lowest one
    wins. Returns 0.0 when no candidate reaches the target (e.g. there are
    no positives).

    Sensitivity and specificity are computed as exact ratios here. The
    previous version reused metrics_at_thresh, whose 1e-9 denominator offset
    pushed an exactly-met target (9 of 10 positives vs 0.90) just below the
    ``>=`` test, so a lower threshold with worse specificity was chosen.

    Args:
        y_true: binary labels (1 = positive, 0 = negative).
        y_prob: predicted probabilities, same length as ``y_true``.
        target: minimum sensitivity required.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    pos = y_true == 1
    neg = y_true == 0
    n_pos = int(pos.sum())
    n_neg = int(neg.sum())

    thresholds = np.sort(np.unique(np.round(y_prob, 4)))[::-1]
    best_t, best_spec = 0.0, 0.0
    for t in thresholds:
        pred = y_prob >= t
        sens = np.count_nonzero(pred & pos) / n_pos if n_pos else 0.0
        spec = np.count_nonzero(~pred & neg) / n_neg if n_neg else 0.0
        if sens >= target and spec >= best_spec:
            best_spec, best_t = spec, t
    return float(best_t)

def full_eval(y_true, y_prob, val_true=None, val_prob=None):
    """Compute ranking, calibration and threshold metrics for one score set.

    ROC-AUC and PR-AUC are NaN when ``y_true`` holds a single class. For
    each target in TARGET_SENS a threshold is tuned on
    ``(val_true, val_prob)`` when given, otherwise on the evaluation data
    itself (which is optimistic), and the metrics at that threshold are
    reported on ``(y_true, y_prob)`` under
    ``m["tuned_thresholds"]["sens_<pct>"]``.
    """
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)
    has_both_classes = len(np.unique(y_true)) > 1

    m = {
        "roc_auc": float(roc_auc_score(y_true, y_prob)) if has_both_classes else np.nan,
        "pr_auc": float(average_precision_score(y_true, y_prob)) if has_both_classes else np.nan,
        "brier": float(brier_score_loss(y_true, y_prob)),
        "spearman_rho": float(spearmanr(y_prob, y_true).statistic),
    }
    m.update(metrics_at_thresh(y_true, y_prob, 0.5))

    # Tune on the validation split if one was supplied.
    tune_t = y_true if val_true is None else val_true
    tune_p = y_prob if val_prob is None else val_prob

    tuned = {}
    for ts in TARGET_SENS:
        t = find_thresh_for_sens(tune_t, tune_p, ts)
        tuned[f"sens_{int(ts*100)}"] = {"threshold": t, **metrics_at_thresh(y_true, y_prob, t)}
    m["tuned_thresholds"] = tuned
    return m

def _save(fig, path):
    """Tighten the layout, write ``fig`` to ``path`` at 150 dpi, and close it."""
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

def save_all_plots(y_true, y_prob, plot_dir, prefix, best_t=0.5):
    """Write ``<prefix>_roc.png`` and ``<prefix>_pr.png`` into ``plot_dir``.

    Nothing is written when ``y_true`` holds fewer than two classes.
    ``best_t`` is accepted for interface compatibility but is not used by
    either plot.
    """
    if len(np.unique(y_true)) < 2:
        return
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)

    def _curve(xs, ys, legend, title, out_path, diagonal=False):
        fig, ax = plt.subplots(figsize=(5, 4))
        ax.plot(xs, ys, color=C_POS, lw=2, label=legend)
        if diagonal:
            # Chance line for the ROC plot.
            ax.plot([0, 1], [0, 1], "--", color="gray", lw=1)
        ax.set(title=title)
        ax.legend()
        _save(fig, out_path)

    # ROC
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    _curve(fpr, tpr, f"AUC={auc:.3f}", f"{prefix} ROC",
           f"{plot_dir}/{prefix}_roc.png", diagonal=True)

    # PR
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    _curve(recall, precision, f"AP={ap:.3f}", f"{prefix} PR",
           f"{plot_dir}/{prefix}_pr.png")

# ── CELL 13: Inner-fold val split helper ─────────────────────────────────────
def inner_val_split(df_sub, val_frac=INNER_VAL_FRAC):
    """Split ``df_sub`` into (train, validation) frames without participant overlap.

    A seeded StratifiedGroupKFold with ``max(2, round(1 / val_frac))`` folds
    is built on ``label`` / ``participant_id`` and its first fold is used as
    the validation set. Both returned frames have a fresh 0..N-1 index.
    """
    n_parts = max(2, int(round(1 / val_frac)))
    splitter = StratifiedGroupKFold(n_splits=n_parts, shuffle=True, random_state=SEED)
    fold_iter = splitter.split(df_sub, df_sub["label"], df_sub["participant_id"])
    train_pos, val_pos = next(fold_iter)
    train_part = df_sub.iloc[train_pos].reset_index(drop=True)
    val_part = df_sub.iloc[val_pos].reset_index(drop=True)
    return train_part, val_part

# ── CELL 14 & 15: AUDIO MODEL CV & OOF ────────────────────────────────────────
# Audio model: outer group CV over `folds`. Each outer training part is
# split again into inner train / validation; the validation part is used
# for calibration and threshold tuning, the outer test part for reporting.
print("\n" + "="*70)
print("AUDIO MODEL  —  HeAR + LogisticRegression")
print("="*70)

# One row per test cough across all folds (out-of-fold predictions), plus
# the per-fold metric dicts.
audio_oof_rows, fold_metrics_aud = [], []

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n── Audio Fold {fold_i+1} ──")
    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)
    df_tr, df_val = inner_val_split(df_tr_full)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # Embeddings come from the disk cache when available.
    X_tr_emb  = get_embeddings(df_tr,  desc=f"F{fold_i}_tr")
    X_val_emb = get_embeddings(df_val, desc=f"F{fold_i}_val")
    X_te_emb  = get_embeddings(df_te,  desc=f"F{fold_i}_te")

    clf_a = build_audio_clf().fit(X_tr_emb, y_tr)

    # Calibration wraps only the logistic-regression step, so inputs are
    # scaled manually with the pipeline's fitted scaler first. The lambdas
    # are used within this iteration only, so late binding of cal_a / clf_a
    # is harmless.
    if CALIBRATE and len(np.unique(y_val)) >= 2:
        inner_lr = clf_a.named_steps["clf"]
        cal_a    = calibrate(inner_lr, clf_a.named_steps["sc"].transform(X_val_emb), y_val)
        _proba_a = lambda X: cal_a.predict_proba(clf_a.named_steps["sc"].transform(X))[:,1]
    else:
        cal_a = None
        _proba_a = lambda X: clf_a.predict_proba(X)[:,1]

    val_prob, te_prob = _proba_a(X_val_emb), _proba_a(X_te_emb)
    
    # Thresholds are tuned on validation scores and evaluated on test scores.
    fm = full_eval(y_te, te_prob, val_true=y_val, val_prob=val_prob)
    fold_metrics_aud.append(fm)
    print(f"  AUC={fm['roc_auc']:.3f}  PR-AUC={fm['pr_auc']:.3f}")

    for pid, p, lbl in zip(df_te["participant_id"], te_prob, y_te):
        audio_oof_rows.append({"participant_id":pid,"fold":fold_i,"prob":p,"label":lbl})

# Pooled out-of-fold evaluation. No validation scores are passed here, so
# full_eval tunes thresholds on the same pooled data it evaluates.
aud_oof = pd.DataFrame(audio_oof_rows)
yt_a, yp_a = aud_oof["label"].values, aud_oof["prob"].values
m_aud_cough = full_eval(yt_a, yp_a)
# Participant-level score = maximum cough probability of that participant.
part_a = aud_oof.groupby("participant_id").agg(prob=("prob","max"), label=("label","first")).reset_index()
m_aud_part = full_eval(part_a["label"].values, part_a["prob"].values)
# best_t is computed and passed, but save_all_plots does not use it.
save_all_plots(yt_a, yp_a, f"{AUDIO_OUT}/plots", "audio", best_t=find_thresh_for_sens(yt_a, yp_a, 0.90))

# ── CELL 16 & 17: METADATA MODEL CV & OOF ──────────────────────────────────────
# Metadata model: same outer folds and inner split as the audio model, but
# trained on the clinical columns attached to each cough row.
print("\n" + "="*70)
print("METADATA MODEL  —  LightGBM + missingness augmentation")
print("="*70)

meta_oof_rows, fold_metrics_meta = [], []

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n── Meta Fold {fold_i+1} ──")
    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)
    df_tr, df_val = inner_val_split(df_tr_full)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # Preprocessing is fitted on the inner training split only.
    X_tr, X_val_m, X_te_m, prep = preprocess_meta_fold(df_tr, df_val, df_te, META_COLS_NUM, META_COLS_CAT)

    clf_m = build_meta_clf(int(y_tr.sum()), int((y_tr==0).sum()))
    # Early stopping (patience 50) needs a validation set with both classes.
    # NOTE(review): the training set is also listed in eval_set; presumably
    # LightGBM excludes it from the stopping criterion — confirm.
    if HAS_LGB and len(np.unique(y_val)) >= 2:
        clf_m.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr),(X_val_m, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    else:
        clf_m.fit(X_tr, y_tr)

    # The same validation split serves early stopping, calibration and
    # threshold tuning.
    if CALIBRATE and len(np.unique(y_val)) >= 2:
        cal_m    = calibrate(clf_m, X_val_m, y_val)
        val_prob = cal_m.predict_proba(X_val_m)[:,1]
        te_prob  = cal_m.predict_proba(X_te_m)[:,1]
    else:
        val_prob = clf_m.predict_proba(X_val_m)[:,1]
        te_prob  = clf_m.predict_proba(X_te_m)[:,1]

    fm = full_eval(y_te, te_prob, val_true=y_val, val_prob=val_prob)
    fold_metrics_meta.append(fm)
    print(f"  AUC={fm['roc_auc']:.3f}  PR-AUC={fm['pr_auc']:.3f}")

    for pid, p, lbl in zip(df_te["participant_id"], te_prob, y_te):
        meta_oof_rows.append({"participant_id":pid,"fold":fold_i,"prob":p,"label":lbl})

# Pooled out-of-fold evaluation, at cough level and at participant level
# (maximum probability per participant).
meta_oof = pd.DataFrame(meta_oof_rows)
yt_m, yp_m = meta_oof["label"].values, meta_oof["prob"].values
m_meta_cough = full_eval(yt_m, yp_m)
part_m = meta_oof.groupby("participant_id").agg(prob=("prob","max"), label=("label","first")).reset_index()
m_meta_part = full_eval(part_m["label"].values, part_m["prob"].values)
# best_t is computed and passed, but save_all_plots does not use it.
save_all_plots(yt_m, yp_m, f"{META_OUT}/plots", "meta", best_t=find_thresh_for_sens(yt_m, yp_m, 0.90))

# ── CELL 18-20: Reporting & Zipping ──────────────────────────────────────────
# Overlay the pooled out-of-fold ROC curves of both models in one figure
# (audio in C_POS, metadata in C_NEG) and save it under OUT_ROOT.
if m_aud_cough and m_meta_cough:
    fig, ax = plt.subplots(figsize=(6,5))
    fpr, tpr, _ = roc_curve(yt_a, yp_a); ax.plot(fpr, tpr, lw=2, color=C_POS, label=f"Audio AUC={roc_auc_score(yt_a,yp_a):.3f}")
    fpr, tpr, _ = roc_curve(yt_m, yp_m); ax.plot(fpr, tpr, lw=2, color=C_NEG, label=f"Meta AUC={roc_auc_score(yt_m,yp_m):.3f}")
    ax.plot([0,1],[0,1],"--",color="gray",lw=1); ax.set(title="ROC Comparison"); ax.legend(); _save(fig, f"{OUT_ROOT}/roc_comparison.png")

def make_row(name, cough_m, part_m):
    """Build one summary-table row for a model.

    ``cough_m`` and ``part_m`` are metric dicts in the shape returned by
    full_eval (cough level and participant level). Every number is
    formatted to three decimals; a missing entry is shown as ``0.000``.
    """
    at_sens_90 = cough_m.get("tuned_thresholds", {}).get("sens_90", {})

    def _fmt(value):
        return f"{value:.3f}"

    return {
        "Model": name,
        "ROC-AUC (cough)": _fmt(cough_m.get("roc_auc", 0)),
        "ROC-AUC (participant)": _fmt(part_m.get("roc_auc", 0)),
        "Sens@90%": _fmt(at_sens_90.get("sensitivity", 0)),
        "Spec@90%": _fmt(at_sens_90.get("specificity", 0)),
    }

# Two-row comparison table (audio vs metadata) built from the pooled
# out-of-fold metrics.
summary_df = pd.DataFrame([make_row("Audio (HeAR+LR)", m_aud_cough, m_aud_part), 
                           make_row("Metadata (LightGBM)", m_meta_cough, m_meta_part)])

print("\n" + "="*100)
print("REPORT-READY SUMMARY (LEAK-FREE)")
print("="*100)
print(summary_df.to_string(index=False))

# Bundle everything under OUT_ROOT (this includes the cache directory) into
# one archive, with member paths relative to /kaggle/working.
zip_path = "/kaggle/working/outputs.zip"
with zipfile.ZipFile(zip_path,"w",zipfile.ZIP_DEFLATED) as zf:
    for root,_,files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root,fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

# In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 2 - SCIENTIFIC BEST PRACTICE)
# Leak-Free CV + Mean-Pooled HeAR + MNAR-Aware LightGBM + Ensemble
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve, precision_recall_curve)
try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset layout on the Kaggle input mount.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000            # sample rate passed to librosa.load
WIN_SECS = 2.0         # chunk length used by load_and_chunk
EMBED_DIM = 512        # width of the embedding matrix / zero placeholder
N_SPLITS = 5          
TARGET_SENS = [0.85, 0.90, 0.95]

# Output Directories (V2)
OUT_ROOT = "/kaggle/working/outputs_v2"
AUDIO_OUT = os.path.join(OUT_ROOT, "audio_model")
META_OUT  = os.path.join(OUT_ROOT, "metadata_model")
ENS_OUT   = os.path.join(OUT_ROOT, "ensemble_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create every output directory up front (idempotent).
for d in [AUDIO_OUT, META_OUT, ENS_OUT, CACHE_DIR, 
          f"{AUDIO_OUT}/plots", f"{META_OUT}/plots", f"{ENS_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is part of every cache key. This cell uses its own parquet
# file for the mean-pooled embeddings.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_mean_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
def harmonise_cols(df):
    """Rename the id, filename and label columns of ``df`` to canonical names.

    Column names are matched case-insensitively against a prioritised list
    of aliases. For each canonical name the first alias present wins and
    only that one column is renamed. Returns the renamed frame produced by
    ``DataFrame.rename``; ``df`` itself is not modified.
    """
    by_lower = {col.lower(): col for col in df.columns}
    aliases = (
        ("participant_id", ("participant_id", "participant", "subject_id")),
        ("filename", ("filename", "file_name", "audio_file", "wav_file", "cough_file")),
        ("label_raw", ("tb_status", "tb", "label", "target", "tb_result")),
    )
    mapping = {}
    for canonical, candidates in aliases:
        hit = next((cand for cand in candidates if cand in by_lower), None)
        if hit is not None:
            mapping[by_lower[hit]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map a column of raw TB labels to 1, 0 or NaN.

    Recognised positive / negative spellings (case- and
    whitespace-insensitive) map to 1 / 0. Numeric strings are accepted only
    when they are exactly 0 or 1 (e.g. ``"1.0"``). Anything else — missing
    values, free text, other numeric codes such as ``"2"`` or ``"0.7"`` —
    becomes NaN so the caller can drop the row instead of training on a
    non-binary or silently truncated label.
    """
    positives = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negatives = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positives:
            return 1
        if s in negatives:
            return 0
        try:
            num = float(s)
        except ValueError:
            return np.nan
        # Only exact 0/1 are valid binary codes; nan/inf/other codes are unknown.
        return int(num) if num in (0.0, 1.0) else np.nan

    return series.apply(_b)

# Load both tables and normalise their id / filename / label column names.
print("Loading data...")
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# Take the label from the clinical table when the audio manifest has none.
# NOTE(review): this left join assumes one clinical row per participant —
# confirm.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise, drop rows whose label could not be interpreted, and cast to int.
# If neither table provided a label column the next line raises KeyError.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Identify safe metadata columns (excluding post-diag leaks).
# A clinical column is dropped when its lower-cased name contains any of
# these substrings. The list matches the filter used by the first pipeline
# in this file: the shorter V2 list had lost "gold_standard", "diagnosis",
# "tb_result", "reference_test", "drug_" and the bookkeeping keywords, so
# such columns — if present — would have entered the feature set and
# leaked the outcome.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","drug_",
                "microscopy","molecular","confirmatory","reference_test",
                "gold_standard","diagnosis","tb_status","tb_result",
                "label","label_raw","_fold","_split","filename","audio_path"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# Attach the selected clinical features to every cough row (left join keeps
# coughs whose participant has no clinical record).
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Map physical audio files
# Index every audio file under AUDIO_BASE by basename and by basename
# without extension. When two files share a name, the one visited last by
# os.walk wins.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve manifest filenames (exact name first, then without extension) and
# drop rows with no matching file. The reset_index is what lets the fold
# check below use .loc with positional indices.
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"Total valid audio files: {len(cough_df)} | Participants: {cough_df['participant_id'].nunique()}")

# ── 3. STRATIFIED GROUP K-FOLD (LEAK-FREE SPLITS) ───────────────────────────
# Outer CV: stratified on the cough label, grouped by participant.
print("\nBuilding Custom Stratified Group K-Folds...")
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# Guard: no participant may appear on both sides of a fold.
# NOTE: `assert` statements are removed when Python runs with -O.
for i, (tr, te) in enumerate(folds):
    tr_p = set(cough_df.loc[tr, "participant_id"])
    te_p = set(cough_df.loc[te, "participant_id"])
    assert len(tr_p & te_p) == 0, f"LEAK DETECTED IN FOLD {i}!"

# ── 4. AUDIO FEATURE EXTRACTION (MEAN-POOLING) ──────────────────────────────
# Load the HeAR embedding model from the Hugging Face hub, using the
# HF_TOKEN Kaggle secret. Any failure leaves HEAR_SERVING = None, in which
# case _infer_batch returns all-zero embeddings. TensorFlow is imported
# inside the try, so `tf` may be undefined after a failure.
print("\nLoading HeAR Model...")
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("✓ HeAR loaded")
except Exception as e:
    print(f"⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a list of equal-length waveforms with HeAR.

    Returns a float32 array with one row per segment. When the model failed
    to load (HEAR_SERVING is None) an all-zero (len(segments), EMBED_DIM)
    placeholder is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)
    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    # The signature returns a dict; its first value holds the embeddings.
    first_output = list(outputs.values())[0]
    return first_output.numpy().astype(np.float32)

def _split_into_windows(audio, win_samples):
    """Cut ``audio`` into consecutive float32 windows, zero-padding the last one."""
    windows = []
    for start in range(0, len(audio), win_samples):
        seg = audio[start:start + win_samples]
        if len(seg) < win_samples:
            seg = np.pad(seg, (0, win_samples - len(seg)))
        windows.append(seg.astype(np.float32))
    return windows


def load_and_chunk(path):
    """Slice an entire recording into 2-second chunks for mean pooling.

    The audio is loaded at SR (mono), peak-normalised and cut into
    consecutive windows of ``SR * WIN_SECS`` samples; the last window is
    zero-padded. An empty recording yields one silent window.

    Loading stays best-effort: on any failure the function returns ``[]`` so
    the caller skips the file. The reason is printed to stderr, and the
    handler no longer traps KeyboardInterrupt / SystemExit as the bare
    ``except`` did.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        win_samples = int(SR * WIN_SECS)
        if len(audio) == 0:
            return [np.zeros(win_samples, np.float32)]

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # A clip shorter than one window comes out as a single padded window,
        # so no separate short-clip branch is needed.
        return _split_into_windows(audio, win_samples)
    except Exception as e:
        print(f"  WARNING: could not process audio {path!r}: "
              f"{type(e).__name__}: {e}", file=sys.stderr)
        return []

def get_mean_embeddings(df_rows):
    """Return one mean-pooled HeAR embedding per row of ``df_rows``.

    Embeddings are cached in the parquet file EMBED_CACHE, keyed by the MD5 of
    ``"{HEAR_VERSION}::{audio_path}"``. Rows missing from the cache are chunked
    with load_and_chunk, embedded with _infer_batch, averaged over chunks and
    appended to the cache file.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        float32 array of shape ``(len(df_rows), EMBED_DIM)``. Rows whose audio
        could not be loaded, or that cannot be embedded because the HeAR model
        is unavailable, are left as zeros.
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as exc:
            # An unreadable cache is rebuilt from scratch rather than aborting.
            print(f"⚠ Ignoring unreadable embedding cache {EMBED_CACHE}: {exc}")

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    embeddings = np.zeros((len(paths), EMBED_DIM), np.float32)
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    new_entries = []
    # Without a loaded model _infer_batch only returns placeholder zeros.
    # Writing those to the cache would make a later run with a working model
    # silently reuse them, so extraction is skipped and the rows stay zero.
    if HEAR_SERVING is not None:
        # A dict keeps one entry per key, so a repeated path is embedded once.
        need = {k: p for k, p in zip(keys, paths) if k not in cached_keys}
        for key, path in tqdm(need.items(), desc="Extracting Audio (Mean Pooled)", leave=False):
            chunks = load_and_chunk(path)
            if chunks:
                embs = _infer_batch(chunks)
                new_entries.append({"key": key, "embedding": np.mean(embs, axis=0).tolist()})

    if new_entries:
        cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        cache["key"] = cache["key"].astype(str)  # force a string column for pyarrow
        cache.to_parquet(EMBED_CACHE, index=False)

    # Plain dict lookup avoids any pandas index type confusion.
    cache_dict = dict(zip(cache["key"], cache["embedding"]))
    for i, key in enumerate(keys):
        val = cache_dict.get(key)
        if val is not None:
            embeddings[i] = np.asarray(val, dtype=np.float32)

    return embeddings

# ── 5. MNAR-AWARE PREPROCESSING & MODEL BUILDERS ────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Build the (unfitted) ColumnTransformer for the clinical metadata.

    Numeric columns: median imputation with missing-value indicator columns,
    then standard scaling. Categorical columns: missing values replaced by the
    constant "Not_Available", then one-hot encoding that ignores unseen
    categories. A branch is only included when its column list is non-empty;
    all other columns are dropped.
    """
    numeric_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    candidates = (
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    )
    active = [spec for spec in candidates if spec[2]]
    return ColumnTransformer(active, remainder="drop")

def build_audio_clf():
    """Build the (unfitted) audio classifier: standard scaling + logistic regression.

    The pipeline steps are named "sc" and "clf"; the training loop accesses
    them by these names.
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(
        C=0.1,
        class_weight="balanced",
        max_iter=2000,
        random_state=SEED,
    )
    return Pipeline(steps=[("sc", scaler), ("clf", classifier)])

def build_meta_clf(n_pos, n_neg):
    """Build the (unfitted) metadata classifier.

    Args:
        n_pos: Number of positive training samples.
        n_neg: Number of negative training samples.

    Returns:
        An LGBMClassifier whose ``scale_pos_weight`` is ``n_neg / n_pos``
        (with ``n_pos`` floored at 1) when LightGBM is installed; otherwise a
        class-balanced LogisticRegression.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        # The fallback must also compensate for class imbalance and be given
        # enough iterations to converge; a default LogisticRegression() does
        # neither. Settings match the fallback used by the fusion models.
        return LogisticRegression(class_weight="balanced", max_iter=2000, random_state=SEED)
    return lgb.LGBMClassifier(
        n_estimators=300, learning_rate=0.03,
        num_leaves=15, max_depth=4,
        subsample=0.8, colsample_bytree=0.8,
        min_child_samples=15,
        scale_pos_weight=scale,
        random_state=SEED, verbose=-1, n_jobs=-1
    )

def calibrate(clf, X_cal, y_cal):
    """Fit a sigmoid calibrator on top of an already-fitted classifier.

    ``cv="prefit"`` means ``clf`` is used as-is and only the calibration
    mapping is learned from ``(X_cal, y_cal)``. Returns the fitted calibrator.
    """
    calibrator = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
    return calibrator.fit(X_cal, y_cal)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute binary classification metrics after thresholding ``y_prob`` at ``t``.

    A sample is predicted positive when its probability is >= ``t``. The ratio
    metrics add 1e-9 to the denominator so an empty class gives 0 rather than
    a division error.
    """
    eps = 1e-9
    y_pred = (np.array(y_prob) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()

    result = {"threshold": float(t)}
    result["accuracy"] = float(accuracy_score(y_true, y_pred))
    result["sensitivity"] = tp / (tp + fn + eps)
    result["specificity"] = tn / (tn + fp + eps)
    result["precision"] = tp / (tp + fp + eps)
    result["npv"] = tn / (tn + fn + eps)
    result["f1"] = float(f1_score(y_true, y_pred, zero_division=0))
    return result

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick a decision threshold whose sensitivity is at least ``target``.

    Candidate thresholds are the distinct probabilities rounded to 4 decimals,
    scanned from highest to lowest. A candidate replaces the current choice
    when it meets the sensitivity target and its specificity is not lower than
    the best seen so far. Returns 0.0 when no candidate qualifies.
    """
    # np.unique returns ascending values; reverse for a high-to-low scan.
    candidates = np.unique(np.round(y_prob, 4))[::-1]
    chosen, top_spec = 0.0, 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        if stats["sensitivity"] < target or stats["specificity"] < top_spec:
            continue
        top_spec, chosen = stats["specificity"], cand
    return float(chosen)

def full_eval(y_true, y_prob):
    """Summarise ranking quality and sensitivity-targeted operating points.

    Returns a dict with "roc_auc" and "pr_auc" (NaN when only one class is
    present) and "tuned_thresholds", which holds one metrics dict per entry of
    TARGET_SENS under the key ``sens_<percent>``.
    """
    labels, scores = np.array(y_true), np.array(y_prob)
    report = {}
    report["roc_auc"] = float(roc_auc_score(labels, scores)) if len(np.unique(labels)) > 1 else np.nan
    report["pr_auc"] = float(average_precision_score(labels, scores)) if len(np.unique(labels)) > 1 else np.nan

    per_target = {}
    for sens_target in TARGET_SENS:
        thr = find_thresh_for_sens(labels, scores, sens_target)
        per_target[f"sens_{int(sens_target*100)}"] = {"threshold": thr, **metrics_at_thresh(labels, scores, thr)}
    report["tuned_thresholds"] = per_target
    return report

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve with its AUC in the legend and save it as a PNG.

    The figure is written to ``<path_prefix>_roc.png`` at 150 dpi and closed.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(5,4))
    ax.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={auc:.3f}")
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0,1], [0,1], "--", color="gray", lw=1)
    ax.set(title=f"{title_prefix} ROC")
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\nStarting V2 CV Pipeline (Audio + Meta + Ensemble)...")
# Out-of-fold probabilities, one slot per row of cough_df, for each model.
oof_aud, oof_meta, oof_ens = np.zeros(len(cough_df)), np.zeros(len(cough_df)), np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Hold out roughly 20% of the training rows for calibration. The split is
    # participant-grouped and label-stratified: a plain positional 80/20 cut
    # can put recordings of one participant in both the fitting and the
    # calibration set, and can leave the calibration set class-skewed.
    inner_split = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, cal_pos = next(inner_split.split(
        df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]
    ))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[cal_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # ── AUDIO PATH ──
    X_tr_emb  = get_mean_embeddings(df_tr)
    X_val_emb = get_mean_embeddings(df_val)
    X_te_emb  = get_mean_embeddings(df_te)

    clf_a = build_audio_clf().fit(X_tr_emb, y_tr)
    # The calibrator wraps only the "clf" step, so inputs are scaled with the
    # pipeline's fitted "sc" step before calibration and prediction.
    cal_a = calibrate(clf_a.named_steps["clf"], clf_a.named_steps["sc"].transform(X_val_emb), y_val)

    te_prob_a = cal_a.predict_proba(clf_a.named_steps["sc"].transform(X_te_emb))[:,1]
    oof_aud[te_idx] = te_prob_a

    # ── META PATH ──
    # The preprocessor is fitted on the fitting rows only.
    prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m  = prep.fit_transform(df_tr)
    X_val_m = prep.transform(df_val)
    X_te_m  = prep.transform(df_te)

    clf_m = build_meta_clf(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_m, y_tr)
    cal_m = calibrate(clf_m, X_val_m, y_val)

    te_prob_m = cal_m.predict_proba(X_te_m)[:,1]
    oof_meta[te_idx] = te_prob_m

    # ── ENSEMBLE PATH ──
    # Unweighted average of the two calibrated probabilities.
    te_prob_ens = (te_prob_a + te_prob_m) / 2.0
    oof_ens[te_idx] = te_prob_ens

    print(f" Fold {fold_i+1} ROC-AUC | Audio: {roc_auc_score(y_te, te_prob_a):.3f} | Meta: {roc_auc_score(y_te, te_prob_m):.3f} | Ens: {roc_auc_score(y_te, te_prob_ens):.3f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold scores to the per-recording table.
for _col, _scores in (("pred_aud", oof_aud), ("pred_meta", oof_meta), ("pred_ens", oof_ens)):
    cough_df[_col] = _scores

# Participant-level aggregation: first label, maximum score over recordings.
part_df = (
    cough_df.groupby("participant_id")
    .agg(**{
        "label": ("label", "first"),
        "prob_aud": ("pred_aud", "max"),
        "prob_meta": ("pred_meta", "max"),
        "prob_ens": ("pred_ens", "max"),
    })
    .reset_index()
)

# Recording-level metrics.
m_aud, m_meta, m_ens = (
    full_eval(cough_df["label"], scores) for scores in (oof_aud, oof_meta, oof_ens)
)

# Participant-level metrics.
p_aud, p_meta, p_ens = (
    full_eval(part_df["label"], part_df[col]) for col in ("prob_aud", "prob_meta", "prob_ens")
)

# ROC plots from the recording-level scores.
for _scores, _prefix, _title in (
    (oof_aud, f"{AUDIO_OUT}/plots/audio", "Audio"),
    (oof_meta, f"{META_OUT}/plots/meta", "Metadata"),
    (oof_ens, f"{ENS_OUT}/plots/ensemble", "Ensemble"),
):
    plot_curves(cough_df["label"], _scores, _prefix, _title)

# Generate Summary Table
def make_row(name, cough_m, part_m):
    """Build one summary-table row from recording- and participant-level metrics.

    Missing entries fall back to 0; every number is formatted to 3 decimals.
    The sensitivity/specificity columns come from the recording-level
    "sens_90" operating point.
    """
    sens90 = cough_m.get("tuned_thresholds", {}).get("sens_90", {})
    row = {"Model": name}
    row["ROC-AUC (cough)"] = format(cough_m.get("roc_auc", 0), ".3f")
    row["ROC-AUC (participant)"] = format(part_m.get("roc_auc", 0), ".3f")
    row["Sens@90%"] = format(sens90.get("sensitivity", 0), ".3f")
    row["Spec@90%"] = format(sens90.get("specificity", 0), ".3f")
    return row

# One summary row per model: (display name, recording metrics, participant metrics).
summary_df = pd.DataFrame([
    make_row(model_name, cough_metrics, part_metrics)
    for model_name, cough_metrics, part_metrics in (
        ("Audio (HeAR Mean-Pool)", m_aud, p_aud),
        ("Metadata (MNAR LightGBM)", m_meta, p_meta),
        ("Fusion Ensemble (V2)", m_ens, p_ens),
    )
])

print("\n" + 80 * "=")
print("REPORT-READY SUMMARY (VERSION 2 - BEST PRACTICES)")
print(80 * "=")
print(summary_df.to_string(index=False))

# Zipping Outputs: every file under OUT_ROOT, stored relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v2.zip"
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root, fn)
            zf.write(fp, arcname=os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V2 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 3 - THE DeepGB-TB ADAPTATION)
# Leak-Free CV + Early Fusion + Cross-Modal LightGBM Feature Bottlenecking
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve, precision_recall_curve)
try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset layout: the metadata CSVs and the solicited-cough audio live under BASE.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000  # sample rate (Hz) passed to librosa.load
WIN_SECS = 2.0  # length in seconds of each audio chunk
EMBED_DIM = 512  # width of the per-recording embedding arrays
N_SPLITS = 5          # number of StratifiedGroupKFold folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets for threshold tuning

# Output Directories (V3)
OUT_ROOT = "/kaggle/working/outputs_v3"
FUSION_OUT = os.path.join(OUT_ROOT, "early_fusion_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create the output tree up front; existing directories are left untouched.
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# Version tag mixed into the embedding cache keys (see get_mean_embeddings).
HEAR_VERSION = "google/hear-v1"
# Parquet file holding the cached mean embeddings.
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_mean_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
def harmonise_cols(df):
    """Rename known column aliases to canonical names (case-insensitive).

    For each canonical name ("participant_id", "filename", "label_raw") the
    first alias found among the lower-cased column names is renamed; other
    columns are untouched. Returns a renamed copy of ``df``.
    """
    by_lower = {col.lower(): col for col in df.columns}
    aliases = {
        "participant_id": ("participant_id", "participant", "subject_id"),
        "filename": ("filename", "file_name", "audio_file", "wav_file", "cough_file"),
        "label_raw": ("tb_status", "tb", "label", "target", "tb_result"),
    }
    mapping = {}
    for canonical, hints in aliases.items():
        # Only the first matching alias (in priority order) is renamed.
        match = next((h for h in hints if h in by_lower), None)
        if match is not None:
            mapping[by_lower[match]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw TB labels to 1 (positive), 0 (negative) or NaN (unknown).

    Values are compared case-insensitively after stripping whitespace against
    known positive/negative spellings. Anything else is accepted only if it
    parses as the number 1 or 0 (e.g. "1.0"); other numeric codes and
    unparseable text become NaN, so the result never contains a third class.

    Args:
        series: pandas Series of raw label values.

    Returns:
        Series of the same length holding 1, 0 or NaN.
    """
    positive = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negative = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive:
            return 1
        if s in negative:
            return 0
        try:
            number = float(s)
        except ValueError:
            return np.nan
        # Truncating with int() would turn "2" into class 2 and "0.7" into 0;
        # only exact 0/1 codes are treated as labels.
        if number == 1:
            return 1
        if number == 0:
            return 0
        return np.nan

    return series.apply(_b)

print("Loading data...")
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# Borrow the raw label from the clinical table when the audio table has none.
if "label_raw" in df_clinical.columns and "label_raw" not in df_audio.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], how="left", on="participant_id")

# Binarise the label and drop recordings whose label could not be resolved.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose name contains any of these keywords are excluded
# from the feature set.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

for c in df_clinical.columns:
    if c in skip_cols or any(kw in c.lower() for kw in POST_DIAG_KW):
        continue
    # float/int columns are treated as numeric, everything else as categorical.
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32):
        num_cols.append(c)
    else:
        cat_cols.append(c)

cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], how="left", on="participant_id")

# Index every audio file under AUDIO_BASE by file name and by extension-less stem.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if not fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            continue
        lookup[fn] = os.path.join(dirpath, fn)
        lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each recording to a path (exact name first, then stem) and drop
# recordings without a matching file.
cough_df["audio_path"] = cough_df["filename"].map(
    lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan))
)
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── 3. STRATIFIED GROUP K-FOLD (LEAK-FREE SPLITS) ───────────────────────────
print("\nBuilding Custom Stratified Group K-Folds...")
# Label-stratified folds grouped by participant; materialised as a list of
# (train positions, test positions) pairs.
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(X=cough_df, y=cough_df["label"], groups=cough_df["participant_id"]))

# ── 4. AUDIO FEATURE EXTRACTION ─────────────────────────────────────────────
# Try to load the HeAR model from the Hugging Face Hub, authenticating with the
# "HF_TOKEN" Kaggle secret. Any failure (missing package, missing secret,
# download error) is reported and leaves HEAR_SERVING as None; _infer_batch
# then returns all-zero embeddings instead of raising.
print("\nLoading HeAR Model...")
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    # Callable used for inference; invoked with keyword argument x=<batch>.
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("✓ HeAR loaded")
except Exception as e:
    print(f"⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR signature.

    Returns a float32 array with one row per segment. If the model is not
    available (HEAR_SERVING is None), a zero matrix of width EMBED_DIM is
    returned instead.
    """
    if HEAR_SERVING is not None:
        batch = tf.constant(np.stack(segments), dtype=tf.float32)
        outputs = HEAR_SERVING(x=batch)
        # The signature returns a dict; its first value is taken as the embedding.
        first_output = list(outputs.values())[0]
        return first_output.numpy().astype(np.float32)
    return np.zeros((len(segments), EMBED_DIM), np.float32)

def load_and_chunk(path):
    """Load an audio file and cut it into fixed-length windows.

    The file is decoded with librosa as mono at SR Hz, peak-normalised, and
    split into consecutive non-overlapping windows of WIN_SECS seconds; the
    last window is zero-padded to full length.

    Args:
        path: Location of the audio file (converted with ``str``).

    Returns:
        A list of float32 arrays, each ``int(SR * WIN_SECS)`` samples long.
        A recording with no samples yields a single all-zero window. A file
        that cannot be processed yields an empty list after a warning is
        printed (best effort: one bad file must not abort the extraction).
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        win_samples = int(SR * WIN_SECS)
        n = len(audio)
        if n == 0:
            return [np.zeros(win_samples, np.float32)]

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # Zero-pad up to a whole number of windows, then view as rows.
        n_windows = -(-n // win_samples)  # ceiling division
        padded = np.pad(audio, (0, n_windows * win_samples - n)).astype(np.float32)
        return list(padded.reshape(n_windows, win_samples))
    except Exception as exc:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, and the message names the skipped file.
        print(f"⚠ Skipping unreadable audio {path}: {exc}")
        return []

def get_mean_embeddings(df_rows):
    """Return one mean-pooled HeAR embedding per row of ``df_rows``.

    Embeddings are cached in the parquet file EMBED_CACHE, keyed by the MD5 of
    ``"{HEAR_VERSION}::{audio_path}"``. Rows missing from the cache are chunked
    with load_and_chunk, embedded with _infer_batch, averaged over chunks and
    appended to the cache file.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        float32 array of shape ``(len(df_rows), EMBED_DIM)``. Rows whose audio
        could not be loaded, or that cannot be embedded because the HeAR model
        is unavailable, are left as zeros.
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as exc:
            # An unreadable cache is rebuilt from scratch rather than aborting.
            print(f"⚠ Ignoring unreadable embedding cache {EMBED_CACHE}: {exc}")

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    embeddings = np.zeros((len(paths), EMBED_DIM), np.float32)
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    new_entries = []
    # Without a loaded model _infer_batch only returns placeholder zeros.
    # Writing those to the cache would make a later run with a working model
    # silently reuse them, so extraction is skipped and the rows stay zero.
    if HEAR_SERVING is not None:
        # A dict keeps one entry per key, so a repeated path is embedded once.
        need = {k: p for k, p in zip(keys, paths) if k not in cached_keys}
        for key, path in tqdm(need.items(), desc="Extracting Audio", leave=False):
            chunks = load_and_chunk(path)
            if chunks:
                embs = _infer_batch(chunks)
                new_entries.append({"key": key, "embedding": np.mean(embs, axis=0).tolist()})

    if new_entries:
        cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        cache["key"] = cache["key"].astype(str)  # force a string column for pyarrow
        cache.to_parquet(EMBED_CACHE, index=False)

    cache_dict = dict(zip(cache["key"], cache["embedding"]))
    for i, key in enumerate(keys):
        val = cache_dict.get(key)
        if val is not None:
            embeddings[i] = np.asarray(val, dtype=np.float32)
    return embeddings

# ── 5. MNAR PREPROCESSING & FUSION MODEL BUILDER ────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Build the (unfitted) ColumnTransformer for the clinical metadata.

    Numeric columns: median imputation with missing-value indicator columns,
    then standard scaling. Categorical columns: missing values replaced by the
    constant "Not_Available", then one-hot encoding that ignores unseen
    categories. A branch is only included when its column list is non-empty;
    all other columns are dropped.
    """
    numeric_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    candidates = (
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    )
    active = [spec for spec in candidates if spec[2]]
    return ColumnTransformer(active, remainder="drop")

def build_early_fusion_clf(n_pos, n_neg):
    """Build the (unfitted) classifier for the fused audio + metadata features.

    Uses LightGBM when available, with ``scale_pos_weight`` set to
    ``n_neg / n_pos`` (``n_pos`` floored at 1); otherwise falls back to a
    class-balanced LogisticRegression.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)

    lgb_params = dict(
        n_estimators=400,
        learning_rate=0.02,
        num_leaves=31,
        max_depth=5,
        subsample=0.8,
        # Each tree is grown on a random 15% of the fused feature columns.
        colsample_bytree=0.15,
        min_child_samples=15,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def calibrate(clf, X_cal, y_cal):
    """Fit a sigmoid calibrator on top of an already-fitted classifier.

    ``cv="prefit"`` means ``clf`` is used as-is and only the calibration
    mapping is learned from ``(X_cal, y_cal)``. Returns the fitted calibrator.
    """
    calibrator = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
    return calibrator.fit(X_cal, y_cal)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity after thresholding at ``t``.

    A sample is predicted positive when its probability is >= ``t``. The ratio
    metrics add 1e-9 to the denominator so an empty class gives 0 rather than
    a division error.
    """
    eps = 1e-9
    y_pred = (np.array(y_prob) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()

    result = {"threshold": float(t)}
    result["accuracy"] = float(accuracy_score(y_true, y_pred))
    result["sensitivity"] = tp / (tp + fn + eps)
    result["specificity"] = tn / (tn + fp + eps)
    return result

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick a decision threshold whose sensitivity is at least ``target``.

    Candidate thresholds are the distinct probabilities rounded to 4 decimals,
    scanned from highest to lowest. A candidate replaces the current choice
    when it meets the sensitivity target and its specificity is not lower than
    the best seen so far. Returns 0.0 when no candidate qualifies.
    """
    # np.unique returns ascending values; reverse for a high-to-low scan.
    candidates = np.unique(np.round(y_prob, 4))[::-1]
    chosen, top_spec = 0.0, 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        if stats["sensitivity"] < target or stats["specificity"] < top_spec:
            continue
        top_spec, chosen = stats["specificity"], cand
    return float(chosen)

def full_eval(y_true, y_prob):
    """Summarise ranking quality and sensitivity-targeted operating points.

    Returns a dict with "roc_auc" (NaN when only one class is present) and
    "tuned_thresholds", which holds one metrics dict per entry of TARGET_SENS
    under the key ``sens_<percent>``.
    """
    labels, scores = np.array(y_true), np.array(y_prob)
    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    per_target = {}
    for sens_target in TARGET_SENS:
        thr = find_thresh_for_sens(labels, scores, sens_target)
        per_target[f"sens_{int(sens_target*100)}"] = {"threshold": thr, **metrics_at_thresh(labels, scores, thr)}
    return {"roc_auc": auc_value, "tuned_thresholds": per_target}

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve with its AUC in the legend and save it as a PNG.

    The figure is written to ``<path_prefix>_roc.png`` at 150 dpi and closed.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(5,4))
    ax.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={auc:.3f}")
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0,1], [0,1], "--", color="gray", lw=1)
    ax.set(title=f"{title_prefix} ROC")
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\nStarting V3 CV Pipeline (Early Fusion Cross-Attention Model)...")
# Out-of-fold probabilities, one slot per row of cough_df.
oof_fusion = np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Hold out roughly 20% of the training rows for calibration. The split is
    # participant-grouped and label-stratified: a plain positional 80/20 cut
    # can put recordings of one participant in both the fitting and the
    # calibration set, and can leave the calibration set class-skewed.
    inner_split = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, cal_pos = next(inner_split.split(
        df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]
    ))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[cal_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # Extract Embeddings (Audio)
    X_tr_emb  = get_mean_embeddings(df_tr)
    X_val_emb = get_mean_embeddings(df_val)
    X_te_emb  = get_mean_embeddings(df_te)

    # Preprocess Metadata (Tabular); the preprocessor is fitted on the fitting rows only.
    prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m  = prep.fit_transform(df_tr)
    X_val_m = prep.transform(df_val)
    X_te_m  = prep.transform(df_te)

    # --- EARLY FUSION: audio embedding columns followed by metadata columns ---
    X_tr_fusion  = np.hstack([X_tr_emb, X_tr_m])
    X_val_fusion = np.hstack([X_val_emb, X_val_m])
    X_te_fusion  = np.hstack([X_te_emb, X_te_m])

    # Train the fusion classifier, then calibrate it on the held-out rows.
    clf_fusion = build_early_fusion_clf(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_fusion, y_tr)
    cal_fusion = calibrate(clf_fusion, X_val_fusion, y_val)

    te_prob_fusion = cal_fusion.predict_proba(X_te_fusion)[:,1]
    oof_fusion[te_idx] = te_prob_fusion

    print(f" Fold {fold_i+1} Early Fusion ROC-AUC: {roc_auc_score(y_te, te_prob_fusion):.3f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold scores to the per-recording table.
cough_df["pred_fusion"] = oof_fusion

# Participant-level aggregation: first label, maximum score over recordings.
part_df = (
    cough_df.groupby("participant_id")
    .agg(**{
        "label": ("label", "first"),
        "prob_fusion": ("pred_fusion", "max"),
    })
    .reset_index()
)

# Recording-level, then participant-level metrics.
m_fusion = full_eval(cough_df["label"], oof_fusion)
p_fusion = full_eval(part_df["label"], part_df["prob_fusion"])

# ROC plot from the recording-level scores.
plot_curves(
    cough_df["label"],
    oof_fusion,
    f"{FUSION_OUT}/plots/early_fusion",
    "Early Fusion",
)

def make_row(name, cough_m, part_m):
    """Build one summary-table row from recording- and participant-level metrics.

    Missing entries fall back to 0; every number is formatted to 3 decimals.
    The sensitivity/specificity columns come from the recording-level
    "sens_90" operating point.
    """
    sens90 = cough_m.get("tuned_thresholds", {}).get("sens_90", {})
    row = {"Model": name}
    row["ROC-AUC (cough)"] = format(cough_m.get("roc_auc", 0), ".3f")
    row["ROC-AUC (participant)"] = format(part_m.get("roc_auc", 0), ".3f")
    row["Sens@90%"] = format(sens90.get("sensitivity", 0), ".3f")
    row["Spec@90%"] = format(sens90.get("specificity", 0), ".3f")
    return row

# Single-row summary table for the fusion model.
summary_df = pd.DataFrame(
    [make_row("Early Fusion (V3 - DeepGB-TB Inspired)", m_fusion, p_fusion)]
)

print("\n" + 80 * "=")
print("REPORT-READY SUMMARY (VERSION 3 - EARLY FUSION)")
print(80 * "=")
print(summary_df.to_string(index=False))

# Archive every file under OUT_ROOT, stored relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v3.zip"
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root, fn)
            zf.write(fp, arcname=os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V3 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 4 - THE ACOUSTIC BOTTLENECK)
# Early Fusion + PCA Dimensionality Reduction + LightGBM
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)
try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset layout: the metadata CSVs and the solicited-cough audio live under BASE.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000  # sample rate (Hz) passed to librosa.load
WIN_SECS = 2.0  # length in seconds of each audio chunk
EMBED_DIM = 512  # width of the per-recording embedding arrays
N_SPLITS = 5          # number of StratifiedGroupKFold folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets for threshold tuning
PCA_COMPONENTS = 32  # THE BOTTLENECK: number of PCA components kept by the audio preprocessor

# Output Directories (V4)
OUT_ROOT = "/kaggle/working/outputs_v4"
FUSION_OUT = os.path.join(OUT_ROOT, "pca_fusion_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create the output tree up front; existing directories are left untouched.
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# Version tag mixed into the embedding cache keys (see get_mean_embeddings).
HEAR_VERSION = "google/hear-v1"
# Parquet file holding the cached mean embeddings.
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_mean_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
def harmonise_cols(df):
    """Rename known column aliases to canonical names (case-insensitive).

    For each canonical name ("participant_id", "filename", "label_raw") the
    first alias found among the lower-cased column names is renamed; other
    columns are untouched. Returns a renamed copy of ``df``.
    """
    by_lower = {col.lower(): col for col in df.columns}
    aliases = {
        "participant_id": ("participant_id", "participant", "subject_id"),
        "filename": ("filename", "file_name", "audio_file", "wav_file", "cough_file"),
        "label_raw": ("tb_status", "tb", "label", "target", "tb_result"),
    }
    mapping = {}
    for canonical, hints in aliases.items():
        # Only the first matching alias (in priority order) is renamed.
        match = next((h for h in hints if h in by_lower), None)
        if match is not None:
            mapping[by_lower[match]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw TB labels to 1 (positive), 0 (negative) or NaN (unknown).

    Values are compared case-insensitively after stripping whitespace against
    known positive/negative spellings. Anything else is accepted only if it
    parses as the number 1 or 0 (e.g. "1.0"); other numeric codes and
    unparseable text become NaN, so the result never contains a third class.

    Args:
        series: pandas Series of raw label values.

    Returns:
        Series of the same length holding 1, 0 or NaN.
    """
    positive = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negative = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive:
            return 1
        if s in negative:
            return 0
        try:
            number = float(s)
        except ValueError:
            return np.nan
        # Truncating with int() would turn "2" into class 2 and "0.7" into 0;
        # only exact 0/1 codes are treated as labels.
        if number == 1:
            return 1
        if number == 0:
            return 0
        return np.nan

    return series.apply(_b)

print("Loading data...")
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# Borrow the raw label from the clinical table when the audio table has none.
if "label_raw" in df_clinical.columns and "label_raw" not in df_audio.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], how="left", on="participant_id")

# Binarise the label and drop recordings whose label could not be resolved.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose name contains any of these keywords are excluded
# from the feature set.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

for c in df_clinical.columns:
    if c in skip_cols or any(kw in c.lower() for kw in POST_DIAG_KW):
        continue
    # float/int columns are treated as numeric, everything else as categorical.
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32):
        num_cols.append(c)
    else:
        cat_cols.append(c)

cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], how="left", on="participant_id")

# Index every audio file under AUDIO_BASE by file name and by extension-less stem.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if not fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            continue
        lookup[fn] = os.path.join(dirpath, fn)
        lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each recording to a path (exact name first, then stem) and drop
# recordings without a matching file.
cough_df["audio_path"] = cough_df["filename"].map(
    lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan))
)
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
print("\nBuilding Custom Stratified Group K-Folds...")
# Label-stratified folds grouped by participant; materialised as a list of
# (train positions, test positions) pairs.
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(X=cough_df, y=cough_df["label"], groups=cough_df["participant_id"]))

# ── 4. AUDIO FEATURE EXTRACTION ─────────────────────────────────────────────
# Try to load the HeAR model from the Hugging Face Hub, authenticating with the
# "HF_TOKEN" Kaggle secret. Any failure (missing package, missing secret,
# download error) is reported and leaves HEAR_SERVING as None; _infer_batch
# then returns all-zero embeddings instead of raising.
print("\nLoading HeAR Model...")
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    # Callable used for inference; invoked with keyword argument x=<batch>.
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("✓ HeAR loaded")
except Exception as e:
    print(f"⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR signature.

    Returns a float32 array with one row per segment. If the model is not
    available (HEAR_SERVING is None), a zero matrix of width EMBED_DIM is
    returned instead.
    """
    if HEAR_SERVING is not None:
        batch = tf.constant(np.stack(segments), dtype=tf.float32)
        outputs = HEAR_SERVING(x=batch)
        # The signature returns a dict; its first value is taken as the embedding.
        first_output = list(outputs.values())[0]
        return first_output.numpy().astype(np.float32)
    return np.zeros((len(segments), EMBED_DIM), np.float32)

def load_and_chunk(path):
    """Load an audio file and cut it into fixed-length windows.

    The file is decoded with librosa as mono at SR Hz, peak-normalised, and
    split into consecutive non-overlapping windows of WIN_SECS seconds; the
    last window is zero-padded to full length.

    Args:
        path: Location of the audio file (converted with ``str``).

    Returns:
        A list of float32 arrays, each ``int(SR * WIN_SECS)`` samples long.
        A recording with no samples yields a single all-zero window. A file
        that cannot be processed yields an empty list after a warning is
        printed (best effort: one bad file must not abort the extraction).
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        win_samples = int(SR * WIN_SECS)
        n = len(audio)
        if n == 0:
            return [np.zeros(win_samples, np.float32)]

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # Zero-pad up to a whole number of windows, then view as rows.
        n_windows = -(-n // win_samples)  # ceiling division
        padded = np.pad(audio, (0, n_windows * win_samples - n)).astype(np.float32)
        return list(padded.reshape(n_windows, win_samples))
    except Exception as exc:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, and the message names the skipped file.
        print(f"⚠ Skipping unreadable audio {path}: {exc}")
        return []

def get_mean_embeddings(df_rows):
    """Return one mean-pooled HeAR embedding per row of ``df_rows``.

    Embeddings are cached in the parquet file EMBED_CACHE, keyed by the MD5 of
    ``"{HEAR_VERSION}::{audio_path}"``. Rows missing from the cache are chunked
    with load_and_chunk, embedded with _infer_batch, averaged over chunks and
    appended to the cache file.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        float32 array of shape ``(len(df_rows), EMBED_DIM)``. Rows whose audio
        could not be loaded, or that cannot be embedded because the HeAR model
        is unavailable, are left as zeros.
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as exc:
            # An unreadable cache is rebuilt from scratch rather than aborting.
            print(f"⚠ Ignoring unreadable embedding cache {EMBED_CACHE}: {exc}")

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    embeddings = np.zeros((len(paths), EMBED_DIM), np.float32)
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    new_entries = []
    # Without a loaded model _infer_batch only returns placeholder zeros.
    # Writing those to the cache would make a later run with a working model
    # silently reuse them, so extraction is skipped and the rows stay zero.
    if HEAR_SERVING is not None:
        # A dict keeps one entry per key, so a repeated path is embedded once.
        need = {k: p for k, p in zip(keys, paths) if k not in cached_keys}
        for key, path in tqdm(need.items(), desc="Extracting Audio", leave=False):
            chunks = load_and_chunk(path)
            if chunks:
                embs = _infer_batch(chunks)
                new_entries.append({"key": key, "embedding": np.mean(embs, axis=0).tolist()})

    if new_entries:
        cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        cache["key"] = cache["key"].astype(str)  # force a string column for pyarrow
        cache.to_parquet(EMBED_CACHE, index=False)

    cache_dict = dict(zip(cache["key"], cache["embedding"]))
    for i, key in enumerate(keys):
        val = cache_dict.get(key)
        if val is not None:
            embeddings[i] = np.asarray(val, dtype=np.float32)
    return embeddings

# ── 5. PREPROCESSING & BOTTLENECK BUILDERS ──────────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Build the (unfitted) ColumnTransformer for the clinical metadata.

    Numeric columns: median imputation with missing-value indicator columns,
    then standard scaling. Categorical columns: missing values replaced by the
    constant "Not_Available", then one-hot encoding that ignores unseen
    categories. A branch is only included when its column list is non-empty;
    all other columns are dropped.
    """
    numeric_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    candidates = (
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    )
    active = [spec for spec in candidates if spec[2]]
    return ColumnTransformer(active, remainder="drop")

# Feature Compression Pipeline
def build_audio_pca_preprocessor():
    """Build the (unfitted) audio compression pipeline.

    Standard scaling (step "sc") followed by PCA (step "pca") that keeps
    PCA_COMPONENTS components, seeded with SEED.
    """
    scaler = StandardScaler()
    reducer = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
    return Pipeline(steps=[("sc", scaler), ("pca", reducer)])

def build_early_fusion_clf(n_pos, n_neg):
    """Build the (unfitted) classifier for the fused audio + metadata features.

    Uses LightGBM when available, with ``scale_pos_weight`` set to
    ``n_neg / n_pos`` (``n_pos`` floored at 1); otherwise falls back to a
    class-balanced LogisticRegression.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)

    lgb_params = dict(
        n_estimators=500,
        learning_rate=0.015,
        num_leaves=31,
        max_depth=5,
        subsample=0.8,
        # Each tree is grown on a random 60% of the fused feature columns.
        colsample_bytree=0.6,
        min_child_samples=15,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def calibrate(clf, X_cal, y_cal):
    """Fit a sigmoid calibrator on (X_cal, y_cal) around an already-fitted clf.

    Returns the fitted CalibratedClassifierCV wrapper.
    """
    calibrated = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
    calibrated.fit(X_cal, y_cal)
    return calibrated

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity at decision threshold t.

    A sample is predicted positive when its score is >= t. The ratios carry a
    1e-9 term in the denominator to avoid division by zero, so they sit very
    slightly below the exact fraction.
    """
    hard_pred = (np.array(y_prob) >= t).astype(int)
    counts = confusion_matrix(y_true, hard_pred, labels=[0, 1])
    tn, fp, fn, tp = counts.ravel()
    eps = 1e-9
    return {
        "threshold": float(t),
        "accuracy": float(accuracy_score(y_true, hard_pred)),
        "sensitivity": tp / (tp + fn + eps),
        "specificity": tn / (tn + fp + eps),
    }

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick the decision threshold that reaches a target sensitivity.

    Candidate thresholds are the distinct scores rounded to 4 decimals,
    scanned from high to low. Among candidates whose sensitivity is >= target
    the one with the highest specificity wins; on a specificity tie the lowest
    such threshold is kept.

    Sensitivity and specificity are computed here as exact ratios. Comparing
    an epsilon-biased ratio such as tp / (tp + fn + 1e-9) against the target
    rejects exact hits (9 of 10 positives is not ">= 0.90") and pushes the
    result down to a much less specific threshold.

    Args:
        y_true: binary labels (1 = positive, 0 = negative); other values are
            ignored when counting.
        y_prob: scores, same length as y_true.
        target: required sensitivity, e.g. 0.90.

    Returns:
        The chosen threshold as a float, or 0.0 when no candidate reaches the
        target (including empty input or no positive samples).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    is_pos = y_true == 1
    is_neg = y_true == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())

    best_t, best_spec = 0.0, 0.0
    for t in np.sort(np.unique(np.round(y_prob, 4)))[::-1]:
        flagged = y_prob >= t
        sens = (flagged & is_pos).sum() / n_pos if n_pos else 0.0
        spec = (~flagged & is_neg).sum() / n_neg if n_neg else 0.0
        # ">=" on specificity keeps the original tie-break: lowest threshold wins
        if sens >= target and spec >= best_spec:
            best_spec, best_t = spec, t
    return float(best_t)

def full_eval(y_true, y_prob):
    """Summarise scores: ROC-AUC plus metrics at each sensitivity-tuned threshold.

    Returns a dict with "roc_auc" (NaN when only one class is present) and
    "tuned_thresholds", keyed "sens_<pct>" for every entry of TARGET_SENS.
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    report = {"roc_auc": auc_value, "tuned_thresholds": {}}
    for ts in TARGET_SENS:
        thr = find_thresh_for_sens(labels, scores, ts)
        entry = {"threshold": thr}
        entry.update(metrics_at_thresh(labels, scores, thr))
        report["tuned_thresholds"][f"sens_{int(ts*100)}"] = entry
    return report

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve and save it to "<path_prefix>_roc.png" at 150 dpi."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={auc:.3f}")
    ax.plot([0, 1], [0, 1], "--", color="gray", lw=1)   # chance diagonal
    ax.set(title=f"{title_prefix} ROC")
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\nStarting V4 CV Pipeline (PCA Bottleneck + Early Fusion)...")
oof_fusion = np.zeros(len(cough_df))

# Imported here so this section stays self-contained within the notebook cell.
from sklearn.model_selection import GroupShuffleSplit

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Inner train / calibration split (~80/20 of participants).
    # A positional slice is neither shuffled nor participant-disjoint, so coughs
    # of one participant could land in both the fitting and calibration sets.
    # Splitting by participant_id keeps the calibration set independent.
    inner_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    inner_tr_idx, inner_val_idx = next(
        inner_split.split(df_tr_full, groups=df_tr_full["participant_id"])
    )
    df_tr, df_val = df_tr_full.iloc[inner_tr_idx], df_tr_full.iloc[inner_val_idx]

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # 1. Extract raw (per-file mean) embeddings
    X_tr_emb_raw  = get_mean_embeddings(df_tr)
    X_val_emb_raw = get_mean_embeddings(df_val)
    X_te_emb_raw  = get_mean_embeddings(df_te)

    # 2. PCA bottleneck, fitted on the inner-train rows only
    pca_prep = build_audio_pca_preprocessor()
    X_tr_emb  = pca_prep.fit_transform(X_tr_emb_raw)
    X_val_emb = pca_prep.transform(X_val_emb_raw)
    X_te_emb  = pca_prep.transform(X_te_emb_raw)

    # 3. Preprocess metadata (tabular), fitted on the inner-train rows only
    meta_prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m  = meta_prep.fit_transform(df_tr)
    X_val_m = meta_prep.transform(df_val)
    X_te_m  = meta_prep.transform(df_te)

    # 4. EARLY FUSION: concatenate compressed audio and metadata features
    X_tr_fusion  = np.hstack([X_tr_emb, X_tr_m])
    X_val_fusion = np.hstack([X_val_emb, X_val_m])
    X_te_fusion  = np.hstack([X_te_emb, X_te_m])

    # 5. Train the fused classifier, then calibrate it on the held-out inner split
    clf_fusion = build_early_fusion_clf(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_fusion, y_tr)
    cal_fusion = calibrate(clf_fusion, X_val_fusion, y_val)

    # Out-of-fold predictions are written back at the outer-test positions
    te_prob_fusion = cal_fusion.predict_proba(X_te_fusion)[:,1]
    oof_fusion[te_idx] = te_prob_fusion

    print(f" Fold {fold_i+1} PCA-Fusion ROC-AUC: {roc_auc_score(y_te, te_prob_fusion):.3f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold cough-level scores to the table
cough_df["pred_fusion"] = oof_fusion

# Participant level: label from the participant's first row, score = highest cough score
part_df = (
    cough_df.groupby("participant_id")
    .agg(label=("label", "first"), prob_fusion=("pred_fusion", "max"))
    .reset_index()
)

m_fusion = full_eval(cough_df["label"], oof_fusion)                # cough-level metrics
p_fusion = full_eval(part_df["label"], part_df["prob_fusion"])     # participant-level metrics

plot_curves(
    cough_df["label"],
    oof_fusion,
    f"{FUSION_OUT}/plots/pca_fusion",
    "PCA Early Fusion",
)

def make_row(name, cough_m, part_m):
    """Format one summary-table row from cough- and participant-level metrics.

    Missing values fall back to 0; every number is rendered with 3 decimals.
    Sensitivity and specificity come from the cough-level "sens_90" entry.
    """
    at_sens_90 = cough_m.get('tuned_thresholds', {}).get('sens_90', {})

    def _fmt(value):
        return f"{value:.3f}"

    row = {"Model": name}
    row["ROC-AUC (cough)"] = _fmt(cough_m.get('roc_auc', 0))
    row["ROC-AUC (participant)"] = _fmt(part_m.get('roc_auc', 0))
    row["Sens@90%"] = _fmt(at_sens_90.get('sensitivity', 0))
    row["Spec@90%"] = _fmt(at_sens_90.get('specificity', 0))
    return row

# One-row summary table for the V4 model
summary_rows = [make_row("PCA-Bottlenecked Fusion (V4)", m_fusion, p_fusion)]
summary_df = pd.DataFrame(summary_rows)

print("\n" + "="*80)
print("REPORT-READY SUMMARY (VERSION 4 - PCA BOTTLENECK)")
print("="*80)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT, storing paths relative to /kaggle/working
zip_path = "/kaggle/working/outputs_v4.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root, fn)
            arcname = os.path.relpath(fp, "/kaggle/working")
            zf.write(fp, arcname)
print(f"\n✅ All V4 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 5 - STATE OF THE ART)
# Asymmetric Blending (Stacking) + SMOTE + Clinical Attention Mechanism
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

try:
    from imblearn.over_sampling import SMOTE
    HAS_SMOTE = True
    print("✓ imbalanced-learn (SMOTE) found.")
except ImportError:
    HAS_SMOTE = False
    print("⚠ imbalanced-learn not found. SMOTE disabled.")

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations (Kaggle input mount)
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"   # walked recursively to locate audio files

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000                       # sample rate audio is resampled to on load
WIN_SECS = 2.0                    # window length in seconds used when chunking audio
EMBED_DIM = 512                   # width of each embedding row
N_SPLITS = 5                      # outer cross-validation folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets for threshold tuning

# Output Directories (V5)
OUT_ROOT = "/kaggle/working/outputs_v5"
FUSION_OUT = os.path.join(OUT_ROOT, "sota_stacking_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create the output tree up front; existing directories are left untouched
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is mixed into the embedding-cache keys.
# NOTE(review): the model itself is loaded from "google/hear"; confirm the two stay in sync.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_mean_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
def harmonise_cols(df):
    """Rename known column aliases to canonical names; returns a new DataFrame.

    Matching is case-insensitive. For each canonical name the aliases are
    tried in order and only the first one present is renamed:
    "participant_id", "filename" and "label_raw".
    """
    by_lower = {col.lower(): col for col in df.columns}
    alias_table = (
        ("participant_id", ["participant_id", "participant", "subject_id"]),
        ("filename", ["filename", "file_name", "audio_file", "wav_file", "cough_file"]),
        ("label_raw", ["tb_status", "tb", "label", "target", "tb_result"]),
    )

    mapping = {}
    for canonical, aliases in alias_table:
        match = next((alias for alias in aliases if alias in by_lower), None)
        if match is not None:
            mapping[by_lower[match]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw label values to 1 / 0 / NaN, element-wise.

    Recognised positive and negative tokens are matched after stripping and
    lower-casing. Anything else is parsed as a number and truncated to int,
    so a numeric value other than 0 or 1 passes through as that integer.
    Missing or unparseable values become NaN.

    Args:
        series: pandas Series of raw labels.

    Returns:
        A Series of the same length with the converted values.
    """
    positive_tokens = ("1", "yes", "positive", "tb+", "tb_positive", "true", "pos")
    negative_tokens = ("0", "no", "negative", "tb-", "tb_negative", "false", "neg")

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive_tokens:
            return 1
        if s in negative_tokens:
            return 0
        try:
            return int(float(s))
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: infinity.
            # Narrower than a bare except, which would also swallow KeyboardInterrupt.
            return np.nan

    return series.apply(_b)

print("Loading data...")
# Load both metadata tables and normalise their key column names
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the per-recording table carries no label, pull it from the clinical table by participant
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Convert labels, drop rows whose label could not be parsed, then cast to int
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose lower-cased name contains any of these keywords are excluded from the features
# (presumably diagnostic-test results that would leak the label; confirm against the data dictionary)
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

# Split the remaining clinical columns by dtype: float/int 32/64 -> numeric, everything else -> categorical
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# One row per recording, with the participant's clinical features attached
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Index every audio file under AUDIO_BASE by both its full name and its extension-less stem.
# A later file with the same name overwrites an earlier entry.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename to a path (exact name first, then stem); unmatched rows are dropped
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
print("\nBuilding Custom Stratified Group K-Folds...")
# Folds are stratified on the label and grouped by participant, so a participant never spans folds
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# ── 4. AUDIO FEATURE EXTRACTION ─────────────────────────────────────────────
print("\nLoading HeAR Model...")
# Best-effort model load: any failure (missing package, secret, or download) leaves
# HEAR_SERVING as None, in which case _infer_batch returns all-zero embeddings.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    # Authenticate to the Hugging Face Hub with the token stored as the HF_TOKEN secret
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("✓ HeAR loaded")
except Exception as e:
    print(f"⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR serving signature.

    Returns a float32 array with one row per segment. When the model is not
    loaded, an all-zero (len(segments), EMBED_DIM) array is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)

    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    first_output = list(outputs.values())[0]   # the signature's first output tensor
    return first_output.numpy().astype(np.float32)

def load_and_chunk(path):
    """Load an audio file and cut it into fixed-length float32 windows.

    The audio is loaded as mono at SR, peak-normalised, and split into
    consecutive non-overlapping windows of SR * WIN_SECS samples. A short
    final window (or a short file) is zero-padded to full length.

    Args:
        path: location of the audio file.

    Returns:
        A list of 1-D float32 arrays. An empty file yields a single all-zero
        window. Any loading or processing error yields an empty list, which
        callers treat as "no embedding for this file".
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        win_samples = int(SR * WIN_SECS)
        n = len(audio)
        if n == 0:
            return [np.zeros(win_samples, np.float32)]

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        if n <= win_samples:
            return [np.pad(audio, (0, win_samples - n)).astype(np.float32)]

        chunks = []
        for start in range(0, n, win_samples):
            seg = audio[start : start + win_samples]
            if len(seg) < win_samples:
                seg = np.pad(seg, (0, win_samples - len(seg)))
            chunks.append(seg.astype(np.float32))
        return chunks
    except Exception:
        # Deliberately best-effort: one unreadable file must not abort extraction.
        # Exception, not a bare except, so Ctrl-C / SystemExit still propagate.
        return []

def get_mean_embeddings(df_rows):
    """Return one mean-pooled HeAR embedding per row of df_rows, using a disk cache.

    Each file is keyed by md5("<HEAR_VERSION>::<audio_path>"). Files missing
    from the parquet cache at EMBED_CACHE are chunked, embedded, averaged over
    their chunks and appended to the cache.

    Placeholder embeddings produced while the model is unavailable
    (HEAR_SERVING is None) are returned but never written to disk. Persisting
    them would make a later run with a working model silently reuse all-zero
    vectors under the same keys.

    Args:
        df_rows: DataFrame with an "audio_path" column.

    Returns:
        float32 array of shape (len(df_rows), EMBED_DIM). Rows whose file
        could not be loaded stay all-zero.
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception:
            # Unreadable cache file: start from an empty cache instead of aborting
            cache = pd.DataFrame(columns=["key", "embedding"])

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    cache_dict = dict(zip(cache["key"], cache["embedding"]))

    # Unique uncached files, in first-seen order, so a repeated path is embedded once
    pending = {}
    for key, path in zip(keys, paths):
        if key not in cache_dict:
            pending.setdefault(key, path)

    new_entries = []
    for key, path in tqdm(list(pending.items()), desc="Extracting Audio", leave=False):
        chunks = load_and_chunk(path)
        if chunks:
            embs = _infer_batch(chunks)
            new_entries.append({"key": key, "embedding": np.mean(embs, axis=0).tolist()})

    if new_entries:
        for entry in new_entries:
            cache_dict[entry["key"]] = entry["embedding"]
        if HEAR_SERVING is not None:
            cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
            cache["key"] = cache["key"].astype(str)
            cache.to_parquet(EMBED_CACHE, index=False)

    embeddings = np.zeros((len(keys), EMBED_DIM), np.float32)
    for i, key in enumerate(keys):
        val = cache_dict.get(key)
        if val is not None:
            embeddings[i] = np.asarray(val, dtype=np.float32)
    return embeddings

# ── 5. PREPROCESSING & STACKING BUILDERS ────────────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Assemble the unfitted ColumnTransformer for the clinical metadata.

    Numeric columns are median-imputed (with missing-value indicator columns)
    and then standardised. Categorical columns have missing values replaced by
    the literal "Not_Available" and are one-hot encoded as a dense array,
    ignoring categories unseen at fit time. A branch is omitted when its
    column list is empty; every other column is dropped.
    """
    branches = []
    if num_cols:
        numeric_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median", add_indicator=True)),
            ("sc", StandardScaler()),
        ])
        branches.append(("num", numeric_pipe, num_cols))
    if cat_cols:
        categorical_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_pipe, cat_cols))
    return ColumnTransformer(branches, remainder="drop")

def build_audio_expert():
    """Return the unfitted audio model: standardisation + class-balanced logistic regression (C=0.1)."""
    scaler = StandardScaler()
    logreg = LogisticRegression(class_weight="balanced", max_iter=2000, C=0.1, random_state=SEED)
    return Pipeline([("sc", scaler), ("clf", logreg)])

def build_clinical_expert(n_pos, n_neg):
    """Build the unfitted metadata-only classifier.

    Returns an LGBMClassifier with the positive class up-weighted by
    n_neg / max(n_pos, 1) when LightGBM is available, otherwise a
    class-balanced LogisticRegression.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced")

    lgb_params = dict(
        n_estimators=300,
        learning_rate=0.03,
        num_leaves=15,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_samples=15,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def build_supervisor_clf():
    """Build the unfitted level-2 model trained on expert outputs plus metadata.

    Kept shallow (few leaves, low depth) to limit overfitting to the experts'
    predictions. Falls back to a class-balanced LogisticRegression when
    LightGBM is unavailable.
    """
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", C=0.1)

    lgb_params = dict(
        n_estimators=100,
        learning_rate=0.05,
        num_leaves=7,
        max_depth=3,
        min_child_samples=10,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity at decision threshold t.

    A sample is predicted positive when its score is >= t. The ratios carry a
    1e-9 term in the denominator to avoid division by zero, so they sit very
    slightly below the exact fraction.
    """
    hard_pred = (np.array(y_prob) >= t).astype(int)
    counts = confusion_matrix(y_true, hard_pred, labels=[0, 1])
    tn, fp, fn, tp = counts.ravel()
    eps = 1e-9
    return {
        "threshold": float(t),
        "accuracy": float(accuracy_score(y_true, hard_pred)),
        "sensitivity": tp / (tp + fn + eps),
        "specificity": tn / (tn + fp + eps),
    }

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick the decision threshold that reaches a target sensitivity.

    Candidate thresholds are the distinct scores rounded to 4 decimals,
    scanned from high to low. Among candidates whose sensitivity is >= target
    the one with the highest specificity wins; on a specificity tie the lowest
    such threshold is kept.

    Sensitivity and specificity are computed here as exact ratios. Comparing
    an epsilon-biased ratio such as tp / (tp + fn + 1e-9) against the target
    rejects exact hits (9 of 10 positives is not ">= 0.90") and pushes the
    result down to a much less specific threshold.

    Args:
        y_true: binary labels (1 = positive, 0 = negative); other values are
            ignored when counting.
        y_prob: scores, same length as y_true.
        target: required sensitivity, e.g. 0.90.

    Returns:
        The chosen threshold as a float, or 0.0 when no candidate reaches the
        target (including empty input or no positive samples).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    is_pos = y_true == 1
    is_neg = y_true == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())

    best_t, best_spec = 0.0, 0.0
    for t in np.sort(np.unique(np.round(y_prob, 4)))[::-1]:
        flagged = y_prob >= t
        sens = (flagged & is_pos).sum() / n_pos if n_pos else 0.0
        spec = (~flagged & is_neg).sum() / n_neg if n_neg else 0.0
        # ">=" on specificity keeps the original tie-break: lowest threshold wins
        if sens >= target and spec >= best_spec:
            best_spec, best_t = spec, t
    return float(best_t)

def full_eval(y_true, y_prob):
    """Summarise scores: ROC-AUC plus metrics at each sensitivity-tuned threshold.

    Returns a dict with "roc_auc" (NaN when only one class is present) and
    "tuned_thresholds", keyed "sens_<pct>" for every entry of TARGET_SENS.
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    report = {"roc_auc": auc_value, "tuned_thresholds": {}}
    for ts in TARGET_SENS:
        thr = find_thresh_for_sens(labels, scores, ts)
        entry = {"threshold": thr}
        entry.update(metrics_at_thresh(labels, scores, thr))
        report["tuned_thresholds"][f"sens_{int(ts*100)}"] = entry
    return report

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve and save it to "<path_prefix>_roc.png" at 150 dpi."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={auc:.3f}")
    ax.plot([0, 1], [0, 1], "--", color="gray", lw=1)   # chance diagonal
    ax.set(title=f"{title_prefix} ROC")
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\nStarting V5 CV Pipeline (Asymmetric Stacking + SMOTE)...")
oof_stack = np.zeros(len(cough_df))

# Imported here so this section stays self-contained within the notebook cell.
from sklearn.model_selection import GroupShuffleSplit

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # SPLIT: ~70% of participants for the Level-1 experts, ~30% for the Level-2 supervisor.
    # The split is by participant_id. With a positional slice, coughs of one participant
    # could train the experts AND appear in the supervisor's training set, where the
    # experts' over-confident scores on already-seen participants would bias the supervisor.
    l1_l2_split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=SEED)
    l1_idx, l2_idx = next(
        l1_l2_split.split(df_tr_full, groups=df_tr_full["participant_id"])
    )
    df_l1 = df_tr_full.iloc[l1_idx]
    df_l2 = df_tr_full.iloc[l2_idx]

    y_l1, y_l2, y_te = df_l1["label"].values, df_l2["label"].values, df_te["label"].values

    # ── LEVEL 1: AUDIO EXPERT ──
    X_l1_emb = get_mean_embeddings(df_l1)
    X_l2_emb = get_mean_embeddings(df_l2)
    X_te_emb = get_mean_embeddings(df_te)

    clf_aud = build_audio_expert().fit(X_l1_emb, y_l1)

    prob_l2_aud = clf_aud.predict_proba(X_l2_emb)[:,1]
    prob_te_aud = clf_aud.predict_proba(X_te_emb)[:,1]

    # ── LEVEL 1: CLINICAL EXPERT ──
    prep = build_meta_preprocessor(num_cols, cat_cols)
    X_l1_m = prep.fit_transform(df_l1)
    X_l2_m = prep.transform(df_l2)
    X_te_m = prep.transform(df_te)

    clf_meta = build_clinical_expert(int(y_l1.sum()), int((y_l1==0).sum())).fit(X_l1_m, y_l1)

    prob_l2_meta = clf_meta.predict_proba(X_l2_m)[:,1]
    prob_te_meta = clf_meta.predict_proba(X_te_m)[:,1]

    # ── LEVEL 2: THE SUPERVISOR ──
    # Supervisor features = both expert probabilities + the preprocessed clinical features
    X_l2_stack = np.column_stack([prob_l2_aud, prob_l2_meta, X_l2_m])
    X_te_stack = np.column_stack([prob_te_aud, prob_te_meta, X_te_m])

    # Oversample the supervisor's training data with SMOTE when available.
    # NOTE(review): calibration below is fitted on the resampled data, so the output
    # probabilities may not reflect the true class prevalence. Confirm this is intended.
    if HAS_SMOTE:
        smote = SMOTE(random_state=SEED)
        X_l2_resampled, y_l2_resampled = smote.fit_resample(X_l2_stack, y_l2)
    else:
        X_l2_resampled, y_l2_resampled = X_l2_stack, y_l2

    # Train the supervisor with internal 3-fold sigmoid calibration
    base_supervisor = build_supervisor_clf()
    cal_supervisor = CalibratedClassifierCV(base_supervisor, cv=3, method="sigmoid")
    cal_supervisor.fit(X_l2_resampled, y_l2_resampled)

    # Out-of-fold predictions are written back at the outer-test positions
    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:,1]
    oof_stack[te_idx] = te_prob_stack

    print(f" Fold {fold_i+1} Supervisor ROC-AUC: {roc_auc_score(y_te, te_prob_stack):.3f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold cough-level scores to the table
cough_df["pred_stack"] = oof_stack

# Participant level: label from the participant's first row, score = highest cough score
part_df = (
    cough_df.groupby("participant_id")
    .agg(label=("label", "first"), prob_stack=("pred_stack", "max"))
    .reset_index()
)

m_stack = full_eval(cough_df["label"], oof_stack)               # cough-level metrics
p_stack = full_eval(part_df["label"], part_df["prob_stack"])    # participant-level metrics

plot_curves(
    cough_df["label"],
    oof_stack,
    f"{FUSION_OUT}/plots/sota_stacking",
    "Supervisor Stacking",
)

def make_row(name, cough_m, part_m):
    """Format one summary-table row from cough- and participant-level metrics.

    Missing values fall back to 0; every number is rendered with 3 decimals.
    Sensitivity and specificity come from the cough-level "sens_90" entry.
    """
    at_sens_90 = cough_m.get('tuned_thresholds', {}).get('sens_90', {})

    def _fmt(value):
        return f"{value:.3f}"

    row = {"Model": name}
    row["ROC-AUC (cough)"] = _fmt(cough_m.get('roc_auc', 0))
    row["ROC-AUC (participant)"] = _fmt(part_m.get('roc_auc', 0))
    row["Sens@90%"] = _fmt(at_sens_90.get('sensitivity', 0))
    row["Spec@90%"] = _fmt(at_sens_90.get('specificity', 0))
    return row

# One-row summary table for the V5 model
summary_rows = [make_row("Asymmetric Supervisor + SMOTE (V5)", m_stack, p_stack)]
summary_df = pd.DataFrame(summary_rows)

print("\n" + "="*90)
print("REPORT-READY SUMMARY (VERSION 5 - SOTA STACKING)")
print("="*90)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT, storing paths relative to /kaggle/working
zip_path = "/kaggle/working/outputs_v5.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root, fn)
            arcname = os.path.relpath(fp, "/kaggle/working")
            zf.write(fp, arcname)
print(f"\n✅ All V5 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 6 - EXACT 2-SECOND SOTA)
# PCA Bottleneck + Early Fusion + TRBL + Heavy Debugging
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations (Kaggle input mount)
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"   # walked recursively to locate audio files

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000                       # sample rate audio is resampled to on load
WIN_SAMPLES = 32_000  # exactly 2 seconds at SR = 16 kHz; every clip is padded or truncated to this
EMBED_DIM = 512                   # width of each embedding row
N_SPLITS = 5                      # outer cross-validation folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets for threshold tuning
PCA_COMPONENTS = 32               # number of PCA components kept from the audio embeddings

# Output Directories (V6)
OUT_ROOT = "/kaggle/working/outputs_v6"
FUSION_OUT = os.path.join(OUT_ROOT, "debug_fusion_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
# Create the output tree up front; existing directories are left untouched
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is mixed into the embedding-cache keys.
# NOTE(review): the model itself is loaded from "google/hear"; confirm the two stay in sync.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_exact2s_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename known column aliases to canonical names; returns a new DataFrame.

    Matching is case-insensitive. For each canonical name the aliases are
    tried in order and only the first one present is renamed:
    "participant_id", "filename" and "label_raw".
    """
    by_lower = {col.lower(): col for col in df.columns}
    alias_table = (
        ("participant_id", ["participant_id", "participant", "subject_id"]),
        ("filename", ["filename", "file_name", "audio_file", "wav_file", "cough_file"]),
        ("label_raw", ["tb_status", "tb", "label", "target", "tb_result"]),
    )

    mapping = {}
    for canonical, aliases in alias_table:
        match = next((alias for alias in aliases if alias in by_lower), None)
        if match is not None:
            mapping[by_lower[match]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw label values to 1 / 0 / NaN, element-wise.

    Recognised positive and negative tokens are matched after stripping and
    lower-casing. Anything else is parsed as a number and truncated to int,
    so a numeric value other than 0 or 1 passes through as that integer.
    Missing or unparseable values become NaN.

    Args:
        series: pandas Series of raw labels.

    Returns:
        A Series of the same length with the converted values.
    """
    positive_tokens = ("1", "yes", "positive", "tb+", "tb_positive", "true", "pos")
    negative_tokens = ("0", "no", "negative", "tb-", "tb_negative", "false", "neg")

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive_tokens:
            return 1
        if s in negative_tokens:
            return 0
        try:
            return int(float(s))
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: infinity.
            # Narrower than a bare except, which would also swallow KeyboardInterrupt.
            return np.nan

    return series.apply(_b)

# Load both metadata tables and normalise their key column names
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the per-recording table carries no label, pull it from the clinical table by participant
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Convert labels, drop rows whose label could not be parsed, then cast to int
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose lower-cased name contains any of these keywords are excluded from the features
# (presumably diagnostic-test results that would leak the label; confirm against the data dictionary)
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

# Split the remaining clinical columns by dtype: float/int 32/64 -> numeric, everything else -> categorical
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

print(f"[*] Identified Numeric Clinical Features: {num_cols}")
print(f"[*] Identified Categorical Clinical Features: {cat_cols}")

# One row per recording, with the participant's clinical features attached
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Index every audio file under AUDIO_BASE by both its full name and its extension-less stem.
# A later file with the same name overwrites an earlier entry.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename to a path (exact name first, then stem); unmatched rows are dropped
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"[*] Total valid audio files mapped: {len(cough_df)}")
print(f"[*] Total unique participants: {cough_df['participant_id'].nunique()}")

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
print("\n[*] Building Custom Stratified Group K-Folds...")
# Folds are stratified on the label and grouped by participant, so a participant never spans folds
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# ── 4. AUDIO FEATURE EXTRACTION (EXACTLY 2 SECONDS) ─────────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL")
print("="*60)
# Best-effort model load: any failure (missing package, secret, or download) leaves
# HEAR_SERVING as None, in which case _infer_batch returns all-zero embeddings.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    # Authenticate to the Hugging Face Hub with the token stored as the HF_TOKEN secret
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("[*] ✓ HeAR loaded successfully")
except Exception as e:
    print(f"[*] ⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR serving signature.

    Returns a float32 array with one row per segment. When the model is not
    loaded, an all-zero (len(segments), EMBED_DIM) array is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)

    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    first_output = list(outputs.values())[0]   # the signature's first output tensor
    return first_output.numpy().astype(np.float32)

def load_exact_audio(path):
    """Load a file as mono audio at SR and force it to exactly WIN_SAMPLES samples.

    Shorter clips are zero-padded at the end; longer clips keep only their
    first WIN_SAMPLES samples.

    Args:
        path: location of the audio file.

    Returns:
        A 1-D array of length WIN_SAMPLES. If the file cannot be loaded, an
        all-zero float32 array is returned so extraction can continue.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        if len(audio) < WIN_SAMPLES:
            audio = np.pad(audio, (0, WIN_SAMPLES - len(audio)))
        else:
            audio = audio[:WIN_SAMPLES]
        return audio
    except Exception:
        # Deliberately best-effort: one unreadable file must not abort extraction.
        # Exception, not a bare except, so Ctrl-C / SystemExit still propagate.
        return np.zeros(WIN_SAMPLES, np.float32)

def get_exact_embeddings(df_rows):
    """Return one HeAR embedding per row of df_rows from its fixed-length clip, using a disk cache.

    Each file is keyed by md5("<HEAR_VERSION>::<audio_path>"). Files missing
    from the parquet cache at EMBED_CACHE are loaded with load_exact_audio,
    embedded in batches of 64 and appended to the cache.

    Placeholder embeddings produced while the model is unavailable
    (HEAR_SERVING is None) are returned but never written to disk. Persisting
    them would make a later run with a working model silently reuse all-zero
    vectors under the same keys.

    Args:
        df_rows: DataFrame with an "audio_path" column.

    Returns:
        float32 array of shape (len(df_rows), EMBED_DIM).
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception:
            # Unreadable cache file: start from an empty cache instead of aborting
            cache = pd.DataFrame(columns=["key", "embedding"])

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    cache_dict = dict(zip(cache["key"], cache["embedding"]))

    # Unique uncached files, in first-seen order, so a repeated path is embedded once
    pending = {}
    for key, path in zip(keys, paths):
        if key not in cache_dict:
            pending.setdefault(key, path)

    batch_size = 64
    buf_segs, buf_keys = [], []
    new_entries = []

    def _flush():
        """Embed the buffered clips and move the results into new_entries."""
        if not buf_segs:
            return
        embs = _infer_batch(buf_segs)
        new_entries.extend({"key": k, "embedding": e.tolist()} for k, e in zip(buf_keys, embs))
        buf_segs.clear()
        buf_keys.clear()

    for key, path in tqdm(list(pending.items()), desc="Extracting Audio (2-Sec Strict)", leave=False):
        buf_segs.append(load_exact_audio(path))
        buf_keys.append(key)
        if len(buf_segs) >= batch_size:
            _flush()
    _flush()   # remaining partial batch

    if new_entries:
        for entry in new_entries:
            cache_dict[entry["key"]] = entry["embedding"]
        if HEAR_SERVING is not None:
            cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
            cache["key"] = cache["key"].astype(str)
            cache.to_parquet(EMBED_CACHE, index=False)

    embeddings = np.zeros((len(keys), EMBED_DIM), np.float32)
    for i, key in enumerate(keys):
        val = cache_dict.get(key)
        if val is not None:
            embeddings[i] = np.asarray(val, dtype=np.float32)

    return embeddings

# ── 5. PREPROCESSING & RISK-BALANCED LIGHTGBM BUILDERS ──────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Assemble the unfitted ColumnTransformer for the clinical metadata.

    Numeric columns are median-imputed (with missing-value indicator columns)
    and then standardised. Categorical columns have missing values replaced by
    the literal "Not_Available" and are one-hot encoded as a dense array,
    ignoring categories unseen at fit time. A branch is omitted when its
    column list is empty; every other column is dropped.
    """
    branches = []
    if num_cols:
        numeric_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median", add_indicator=True)),
            ("sc", StandardScaler()),
        ])
        branches.append(("num", numeric_pipe, num_cols))
    if cat_cols:
        categorical_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_pipe, cat_cols))
    return ColumnTransformer(branches, remainder="drop")

def build_audio_pca_preprocessor():
    """Return an unfitted standardise -> PCA pipeline for the audio embeddings.

    The PCA keeps PCA_COMPONENTS components and is seeded with SEED.
    """
    scaler = StandardScaler()
    reducer = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
    return Pipeline([("sc", scaler), ("pca", reducer)])

def build_risk_balanced_clf(n_pos, n_neg):
    """Build the unfitted fused-feature classifier with extra positive-class weight.

    The LightGBM positive-class weight is the class ratio n_neg / max(n_pos, 1)
    multiplied by 1.5. Without LightGBM a class-balanced LogisticRegression is
    returned instead (the 1.5 factor does not apply there).
    """
    trbl_scale = (n_neg / max(n_pos, 1)) * 1.5
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)

    lgb_params = dict(
        n_estimators=400,
        learning_rate=0.02,
        num_leaves=31,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.6,
        min_child_samples=15,
        scale_pos_weight=trbl_scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def calibrate(clf, X_cal, y_cal):
    """Fit a sigmoid calibrator on (X_cal, y_cal) around an already-fitted clf.

    Returns the fitted CalibratedClassifierCV wrapper.
    """
    calibrated = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
    calibrated.fit(X_cal, y_cal)
    return calibrated

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity at decision threshold t.

    A sample is predicted positive when its score is >= t. The ratios carry a
    1e-9 term in the denominator to avoid division by zero, so they sit very
    slightly below the exact fraction.
    """
    hard_pred = (np.array(y_prob) >= t).astype(int)
    counts = confusion_matrix(y_true, hard_pred, labels=[0, 1])
    tn, fp, fn, tp = counts.ravel()
    eps = 1e-9
    return {
        "threshold": float(t),
        "accuracy": float(accuracy_score(y_true, hard_pred)),
        "sensitivity": tp / (tp + fn + eps),
        "specificity": tn / (tn + fp + eps),
    }

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick the decision threshold that reaches a target sensitivity.

    Candidate thresholds are the distinct scores rounded to 4 decimals,
    scanned from high to low. Among candidates whose sensitivity is >= target
    the one with the highest specificity wins; on a specificity tie the lowest
    such threshold is kept.

    Sensitivity and specificity are computed here as exact ratios. Comparing
    an epsilon-biased ratio such as tp / (tp + fn + 1e-9) against the target
    rejects exact hits (9 of 10 positives is not ">= 0.90") and pushes the
    result down to a much less specific threshold.

    Args:
        y_true: binary labels (1 = positive, 0 = negative); other values are
            ignored when counting.
        y_prob: scores, same length as y_true.
        target: required sensitivity, e.g. 0.90.

    Returns:
        The chosen threshold as a float, or 0.0 when no candidate reaches the
        target (including empty input or no positive samples).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    is_pos = y_true == 1
    is_neg = y_true == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())

    best_t, best_spec = 0.0, 0.0
    for t in np.sort(np.unique(np.round(y_prob, 4)))[::-1]:
        flagged = y_prob >= t
        sens = (flagged & is_pos).sum() / n_pos if n_pos else 0.0
        spec = (~flagged & is_neg).sum() / n_neg if n_neg else 0.0
        # ">=" on specificity keeps the original tie-break: lowest threshold wins
        if sens >= target and spec >= best_spec:
            best_spec, best_t = spec, t
    return float(best_t)

def full_eval(y_true, y_prob):
    """Summarise ranking quality plus operating points for each target sensitivity.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Predicted scores.

    Returns:
        Dict with ``roc_auc`` (NaN when only one class is present) and
        ``tuned_thresholds``, which maps ``sens_<pct>`` to the metrics at the
        threshold chosen for each entry of ``TARGET_SENS``.
    """
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)

    both_classes_present = len(np.unique(y_true)) > 1
    auc = float(roc_auc_score(y_true, y_prob)) if both_classes_present else np.nan

    tuned = {}
    for ts in TARGET_SENS:
        t = find_thresh_for_sens(y_true, y_prob, ts)
        entry = {"threshold": t}
        entry.update(metrics_at_thresh(y_true, y_prob, t))
        tuned[f"sens_{int(ts*100)}"] = entry

    return {"roc_auc": auc, "tuned_thresholds": tuned}

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
# Outer cross-validation: for each fold, fit on the training part, calibrate on
# a held-out validation part, and store calibrated test probabilities as
# out-of-fold predictions.
print("\n" + "="*60)
print("3. STARTING V6 TRAINING (PCA FUSION + TRBL)")
print("="*60)
oof_fusion = np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Group-aware, stratified ~80/20 train/calibration split. The previous
    # positional cut (first 80% of rows) could place recordings of the same
    # participant on both sides and gave no guarantee on class balance, which
    # contaminates the calibration set.
    inner_split = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, val_pos = next(inner_split.split(df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[val_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    print(f"[*] Fold Balances:")
    print(f"    Train: {len(y_tr)} (TB+: {y_tr.sum()}) | Val: {len(y_val)} (TB+: {y_val.sum()}) | Test: {len(y_te)} (TB+: {y_te.sum()})")

    # Extract Strict 2-sec Embeddings
    X_tr_emb_raw  = get_exact_embeddings(df_tr)
    X_val_emb_raw = get_exact_embeddings(df_val)
    X_te_emb_raw  = get_exact_embeddings(df_te)

    # PCA Bottleneck (fitted on the training part only, then applied to val/test)
    pca_prep = build_audio_pca_preprocessor()
    X_tr_emb  = pca_prep.fit_transform(X_tr_emb_raw)
    X_val_emb = pca_prep.transform(X_val_emb_raw)
    X_te_emb  = pca_prep.transform(X_te_emb_raw)

    print(f"[*] Audio Features reduced from {X_tr_emb_raw.shape[1]} to {X_tr_emb.shape[1]} via PCA")

    # Preprocess Metadata (fitted on the training part only)
    meta_prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m  = meta_prep.fit_transform(df_tr)
    X_val_m = meta_prep.transform(df_val)
    X_te_m  = meta_prep.transform(df_te)

    print(f"[*] Clinical Metadata Features generated (MNAR Aware): {X_tr_m.shape[1]}")

    # EARLY FUSION: concatenate audio and metadata features column-wise
    X_tr_fusion  = np.hstack([X_tr_emb, X_tr_m])
    X_val_fusion = np.hstack([X_val_emb, X_val_m])
    X_te_fusion  = np.hstack([X_te_emb, X_te_m])

    print(f"[*] Final Fused Training Matrix Shape: {X_tr_fusion.shape}")

    # Train the risk-balanced classifier, then calibrate it on the validation part
    clf_fusion = build_risk_balanced_clf(int(y_tr.sum()), int((y_tr==0).sum()))
    clf_fusion.fit(X_tr_fusion, y_tr)
    cal_fusion = calibrate(clf_fusion, X_val_fusion, y_val)

    te_prob_fusion = cal_fusion.predict_proba(X_te_fusion)[:,1]
    oof_fusion[te_idx] = te_prob_fusion

    fold_auc = roc_auc_score(y_te, te_prob_fusion)
    print(f"[*] Fold {fold_i+1} ROC-AUC: {fold_auc:.4f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold recording-level probabilities to the cough table.
cough_df["pred_fusion"] = oof_fusion

# Collapse to one row per participant: the label is taken from the first
# recording and the participant score is the MAX probability over recordings.
part_df = cough_df.groupby("participant_id").agg(
    label=("label", "first"), prob_fusion=("pred_fusion", "max")
).reset_index()

# Evaluate at recording level (m_*) and at participant level (p_*).
m_fusion = full_eval(cough_df['label'], oof_fusion)
p_fusion = full_eval(part_df['label'], part_df['prob_fusion'])

def make_row(name, cough_m, part_m):
    """Build one summary-table row of 4-decimal strings for a model.

    Args:
        name: Display name of the model.
        cough_m: Recording-level metrics dict (as returned by ``full_eval``).
        part_m: Participant-level metrics dict.

    Returns:
        Dict of column name -> formatted value. Missing metrics default to 0.
        The Sens/Spec columns come from ``cough_m``'s ``sens_90`` entry.
    """
    def _fmt(value):
        return f"{value:.4f}"

    at_sens_90 = cough_m.get('tuned_thresholds', {}).get('sens_90', {})

    row = {"Model": name}
    row["ROC-AUC (recording)"] = _fmt(cough_m.get('roc_auc', 0))
    row["ROC-AUC (participant)"] = _fmt(part_m.get('roc_auc', 0))
    row["Sens@90%"] = _fmt(at_sens_90.get('sensitivity', 0))
    row["Spec@90%"] = _fmt(at_sens_90.get('specificity', 0))
    return row

# One-row summary table for this model version.
summary_df = pd.DataFrame([make_row("V6 (Exact 2s + PCA + TRBL)", m_fusion, p_fusion)])

print("\n" + "="*85)
print("REPORT-READY SUMMARY (VERSION 6)")
print("="*85)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT; archive member names are stored relative
# to /kaggle/working.
zip_path = "/kaggle/working/outputs_v6.zip"
with zipfile.ZipFile(zip_path,"w",zipfile.ZIP_DEFLATED) as zf:
    for root,_,files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root,fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V6 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 7 - THE SOTA LIMIT BREAKER)
# OOF CVPEM Stacking + Unshackled 512-dim HeAR + TRBL Meta-Learner
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations on Kaggle.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

# Metadata tables: clinical info and the solicited-cough recording index.
CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

# Audio / model constants.
SR = 16_000
WIN_SAMPLES = 32_000  # EXACTLY 2 SECONDS (Based on original notebook constraints)
EMBED_DIM = 512
N_SPLITS = 5          
# Sensitivity levels at which operating thresholds are tuned in full_eval.
TARGET_SENS = [0.85, 0.90, 0.95]

# Output Directories (V7)
OUT_ROOT = "/kaggle/working/outputs_v7"
FUSION_OUT = os.path.join(OUT_ROOT, "oof_stacking_model")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is only mixed into the embedding-cache key; the model itself is
# loaded below by a separate identifier.
# NOTE(review): the two identifiers differ ("google/hear-v1" vs "google/hear") — confirm intended.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_exact2s_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename known column aliases to the canonical names used by the pipeline.

    Matching is case-insensitive. For each canonical name the aliases are
    tried in order and only the first one present is renamed.

    Args:
        df: Input DataFrame (not modified).

    Returns:
        A renamed copy with ``participant_id``, ``filename`` and
        ``label_raw`` columns where a matching alias was found.
    """
    alias_table = (
        ("participant_id", ["participant_id","participant","subject_id"]),
        ("filename", ["filename","file_name","audio_file","wav_file","cough_file"]),
        ("label_raw", ["tb_status","tb","label","target","tb_result"]),
    )
    by_lower = {c.lower(): c for c in df.columns}

    rename = {}
    for canonical, aliases in alias_table:
        hit = next((alias for alias in aliases if alias in by_lower), None)
        if hit is not None:
            rename[by_lower[hit]] = canonical
    return df.rename(columns=rename)

def binarise_label(series):
    """Map heterogeneous TB label encodings to 1, 0 or NaN.

    Recognised positive/negative strings are matched case-insensitively after
    stripping whitespace. Any other value is accepted only if it parses as the
    number 0 or 1; everything else (including other numbers such as "2" or
    "0.7", which previously leaked through as non-binary or truncated labels)
    becomes NaN so the caller can drop it.

    Args:
        series: pandas Series of raw labels.

    Returns:
        Series of 1 / 0 / NaN, aligned with ``series``.
    """
    positive = {"1","yes","positive","tb+","tb_positive","true","pos"}
    negative = {"0","no","negative","tb-","tb_negative","false","neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive:
            return 1
        if s in negative:
            return 0
        try:
            num = float(s)
        except ValueError:
            return np.nan
        # Only exact 0/1 are valid binary labels (covers e.g. "1.0").
        return int(num) if num in (0.0, 1.0) else np.nan

    return series.apply(_b)

# Load the recording index and the clinical table with canonical column names.
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the recording index carries no label, pull it from the clinical table.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise labels and drop recordings whose label could not be interpreted.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose name contains any of these keywords are excluded from
# the feature set (diagnostic results / label columns), as is participant_id.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

# Split the remaining clinical columns into numeric vs categorical by dtype.
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# One row per recording, with the participant's clinical features attached.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Index audio files on disk by file name, with and without extension.
# NOTE(review): files with the same name in different subfolders overwrite
# each other in this lookup — confirm names are unique across AUDIO_BASE.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each recording to a path; recordings without a file on disk are dropped.
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"[*] Total valid audio files mapped: {len(cough_df)}")
print(f"[*] Total unique participants: {cough_df['participant_id'].nunique()}")

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
# Folds are stratified on the label and grouped by participant, so all
# recordings of a participant fall in the same fold.
print("\n[*] Building Custom Stratified Group K-Folds...")
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# ── 4. AUDIO FEATURE EXTRACTION (EXACTLY 2 SECONDS) ─────────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL")
print("="*60)
# Best-effort model load: on any failure HEAR_SERVING is set to None, in which
# case _infer_batch returns all-zero embeddings instead of raising.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("[*] ✓ HeAR loaded successfully")
except Exception as e:
    print(f"[*] ⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR serving signature.

    Args:
        segments: Sequence of 1-D arrays of identical length.

    Returns:
        float32 array with one row per segment. When the model is not loaded
        (``HEAR_SERVING is None``) an all-zero ``(len(segments), EMBED_DIM)``
        array is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)

    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    # The signature returns a dict; the first output is used as the embedding.
    first_output = list(outputs.values())[0]
    return first_output.numpy().astype(np.float32)

def load_exact_audio(path):
    """Load an audio file as mono at ``SR`` and force it to exactly ``WIN_SAMPLES`` samples.

    Shorter clips are zero-padded at the end; longer clips are truncated to
    their first ``WIN_SAMPLES`` samples.

    Args:
        path: Path to the audio file.

    Returns:
        1-D array of length ``WIN_SAMPLES``. If the file cannot be loaded, a
        warning is printed and an all-zero float32 array is returned
        (best-effort: one bad file must not abort the run).
    """
    try:
        audio, sr = librosa.load(str(path), sr=SR, mono=True)
        if len(audio) < WIN_SAMPLES:
            audio = np.pad(audio, (0, WIN_SAMPLES - len(audio)))
        else:
            audio = audio[:WIN_SAMPLES]
        return audio
    except Exception as e:
        # Was a bare `except:` that also swallowed KeyboardInterrupt/SystemExit
        # and hid which files were being replaced by silence.
        print(f"[*] ⚠ Audio load failed for {path}: {e}")
        return np.zeros(WIN_SAMPLES, np.float32)

def get_exact_embeddings(df_rows):
    """Return HeAR embeddings for the recordings in ``df_rows``, using an on-disk cache.

    Each recording is keyed by an MD5 of ``HEAR_VERSION`` and its
    ``audio_path``. Keys missing from the parquet cache at ``EMBED_CACHE`` are
    embedded in batches of 64 and appended to it.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        float32 array of shape ``(len(df_rows), EMBED_DIM)`` in row order.
        Rows without an embedding stay all-zero.
    """
    cache = pd.DataFrame(columns=["key", "embedding"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as e:
            print(f"[*] ⚠ Embedding cache unreadable, starting empty: {e}")

    # Read the path column directly instead of iterating rows twice.
    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    # De-duplicate so a path appearing in several rows is embedded once.
    pending = {}
    for k, p in zip(keys, paths):
        if k not in cached_keys:
            pending.setdefault(k, p)

    new_entries = []
    buf_segs, buf_keys = [], []

    def _flush():
        embs = _infer_batch(buf_segs)
        new_entries.extend({"key": k, "embedding": e.tolist()} for k, e in zip(buf_keys, embs))
        buf_segs.clear()
        buf_keys.clear()

    for k, p in tqdm(pending.items(), desc="Extracting Audio (2-Sec Strict)", leave=False):
        buf_segs.append(load_exact_audio(p))
        buf_keys.append(k)
        if len(buf_segs) >= 64:
            _flush()
    if buf_segs:
        _flush()

    if new_entries:
        cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        cache["key"] = cache["key"].astype(str)
        # Persist only genuine model outputs. When HeAR failed to load,
        # _infer_batch yields all-zero placeholders; writing those to disk
        # would poison every later run that does have the model.
        if HEAR_SERVING is not None:
            cache.to_parquet(EMBED_CACHE, index=False)
        else:
            print("[*] ⚠ HeAR unavailable — placeholder embeddings not written to cache")

    cache_dict = dict(zip(cache["key"], cache["embedding"]))
    embeddings = np.zeros((len(keys), EMBED_DIM), np.float32)
    for i, k in enumerate(keys):
        val = cache_dict.get(k)
        if val is not None:
            embeddings[i] = np.asarray(val, np.float32)

    return embeddings

# ── 5. PREPROCESSING & OOF STACKING BUILDERS ────────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Return an unfitted ColumnTransformer for the clinical metadata.

    Numeric columns: median imputation with missing-value indicator columns,
    then standard scaling. Categorical columns: constant imputation with
    "Not_Available", then dense one-hot encoding that ignores unseen
    categories. Columns not listed are dropped.

    Args:
        num_cols: Names of numeric columns (may be empty).
        cat_cols: Names of categorical columns (may be empty).
    """
    numeric_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ])
    categorical_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    candidates = [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
    # Only register a branch when it actually has columns to work on.
    transformers = [spec for spec in candidates if spec[2]]
    return ColumnTransformer(transformers, remainder="drop")

# LEVEL 1 EXPERTS (No PCA, Audio gets full 512 dimensions!)
def build_audio_expert(n_pos, n_neg):
    """Return the unfitted level-1 classifier for audio embeddings.

    Args:
        n_pos: Number of positive training samples.
        n_neg: Number of negative training samples.

    Returns:
        An ``LGBMClassifier`` with ``scale_pos_weight`` equal to the
        negative/positive ratio when LightGBM is available, otherwise a
        class-balanced ``LogisticRegression``.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)
    return lgb.LGBMClassifier(
        n_estimators=300,
        learning_rate=0.03,
        num_leaves=31,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )

def build_clinical_expert(n_pos, n_neg):
    """Return the unfitted level-1 classifier for clinical metadata features.

    Args:
        n_pos: Number of positive training samples.
        n_neg: Number of negative training samples.

    Returns:
        A shallow ``LGBMClassifier`` with ``scale_pos_weight`` equal to the
        negative/positive ratio when LightGBM is available, otherwise a
        class-balanced ``LogisticRegression``.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced")
    return lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=15,
        max_depth=4,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )

# LEVEL 2 SUPERVISOR (With TRBL Loss)
def build_supervisor(n_pos, n_neg):
    """Return the unfitted level-2 meta-learner (weighted logistic regression).

    TRBL (Tuberculosis Risk-Balanced Loss): the positive class weight is the
    negative/positive ratio multiplied by 1.5, i.e. a 1.5x extra penalty on
    false negatives.

    Args:
        n_pos: Number of positive training samples.
        n_neg: Number of negative training samples.
    """
    neg_per_pos = n_neg / max(n_pos, 1)
    class_weights = {0: 1.0, 1: neg_per_pos * 1.5}
    return LogisticRegression(class_weight=class_weights, max_iter=2000, random_state=SEED)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity after thresholding at ``t``.

    A sample is predicted positive when ``y_prob >= t``.

    Args:
        y_true: Binary ground-truth labels (0/1).
        y_prob: Predicted scores, same length as ``y_true``.
        t: Decision threshold.

    Returns:
        Dict with keys ``threshold``, ``accuracy``, ``sensitivity`` and
        ``specificity`` (plain Python floats). A ratio whose denominator is
        zero (no positives / no negatives) is reported as 0.0.
    """
    y_true = np.asarray(y_true)
    y_pred = (np.asarray(y_prob) >= t).astype(int)

    is_pos = y_true == 1
    is_neg = y_true == 0
    tp = int(np.sum(is_pos & (y_pred == 1)))
    fn = int(np.sum(is_pos & (y_pred == 0)))
    tn = int(np.sum(is_neg & (y_pred == 0)))
    fp = int(np.sum(is_neg & (y_pred == 1)))

    # Exact ratios: the previous "+1e-9" in the denominator biased results
    # low, so e.g. 9/10 came out as 0.8999999999 and failed a ">= 0.90" check.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    accuracy = float(np.mean(y_true == y_pred)) if y_true.size else float("nan")

    return {
        "threshold": float(t),
        "accuracy": accuracy,
        "sensitivity": sensitivity,
        "specificity": specificity,
    }

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick the threshold with the best specificity whose sensitivity is >= ``target``.

    Candidate thresholds are the unique scores rounded to 4 decimals, scanned
    from highest to lowest. Among candidates that reach the target
    sensitivity, the one with the highest specificity wins; on ties the
    lowest such threshold is kept.

    Args:
        y_true: Binary ground-truth labels (0/1).
        y_prob: Predicted scores, same length as ``y_true``.
        target: Required sensitivity in [0, 1].

    Returns:
        The selected threshold as a float, or 0.0 when no candidate reaches
        the target (e.g. there are no positive samples).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)

    # Class masks and counts are loop-invariant, so compute them once.
    pos = y_true == 1
    neg = y_true == 0
    n_pos = int(pos.sum())
    n_neg = int(neg.sum())

    thresholds = np.unique(np.round(y_prob, 4))[::-1]
    best_t, best_spec = 0.0, 0.0
    for t in thresholds:
        pred = y_prob >= t
        # Exact ratios (no epsilon in the denominator) so that a threshold
        # hitting the target exactly, e.g. 9/10 for 0.90, is accepted.
        sens = int((pred & pos).sum()) / n_pos if n_pos else 0.0
        spec = int((~pred & neg).sum()) / n_neg if n_neg else 0.0
        if sens >= target and spec >= best_spec:
            best_spec, best_t = spec, t
    return float(best_t)

def full_eval(y_true, y_prob):
    """Summarise ranking quality plus operating points for each target sensitivity.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Predicted scores.

    Returns:
        Dict with ``roc_auc`` (NaN when only one class is present) and
        ``tuned_thresholds``, which maps ``sens_<pct>`` to the metrics at the
        threshold chosen for each entry of ``TARGET_SENS``.
    """
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)

    both_classes_present = len(np.unique(y_true)) > 1
    auc = float(roc_auc_score(y_true, y_prob)) if both_classes_present else np.nan

    tuned = {}
    for ts in TARGET_SENS:
        t = find_thresh_for_sens(y_true, y_prob, ts)
        entry = {"threshold": t}
        entry.update(metrics_at_thresh(y_true, y_prob, t))
        tuned[f"sens_{int(ts*100)}"] = entry

    return {"roc_auc": auc, "tuned_thresholds": tuned}

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Save a ROC curve plot to ``<path_prefix>_roc.png``.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Predicted scores.
        path_prefix: Output path without the ``_roc.png`` suffix.
        title_prefix: Text placed before " ROC" in the plot title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(5,4))
    ax.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={auc:.3f}")
    # Chance-level diagonal for reference.
    ax.plot([0,1],[0,1],"--",color="gray",lw=1)
    ax.set(title=f"{title_prefix} ROC")
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
# Outer cross-validation with two-level stacking: level-1 audio and clinical
# experts feed a level-2 supervisor, which is calibrated on a held-out
# validation part; calibrated test probabilities are stored out-of-fold.
print("\n" + "="*60)
print("3. STARTING V7 TRAINING (OOF STACKING + UNSHACKLED HeAR)")
print("="*60)
oof_stack = np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Group-aware, stratified ~80/20 train/calibration split. The previous
    # positional cut (first 80% of rows) could place recordings of the same
    # participant on both sides and gave no guarantee on class balance, which
    # contaminates the calibration set.
    outer_val_split = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, val_pos = next(outer_val_split.split(df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[val_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    print(f"[*] Fold Balances:")
    print(f"    Train: {len(y_tr)} (TB+: {y_tr.sum()}) | Val: {len(y_val)} (TB+: {y_val.sum()}) | Test: {len(y_te)} (TB+: {y_te.sum()})")

    # Extract Strict 2-sec Embeddings (All 512 dimensions)
    X_tr_emb = get_exact_embeddings(df_tr)
    X_val_emb = get_exact_embeddings(df_val)
    X_te_emb = get_exact_embeddings(df_te)

    # Preprocess Metadata (fitted on the training part only)
    meta_prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m = meta_prep.fit_transform(df_tr)
    X_val_m = meta_prep.transform(df_val)
    X_te_m = meta_prep.transform(df_te)

    # ── LEVEL 1: GENERATE OUT-OF-FOLD (OOF) PROBABILITIES FOR TRAIN ──
    # Every training row gets a level-1 score from a model that did not see
    # it (nor any recording of the same participant).
    print("[*] Generating Out-Of-Fold Probabilities for Level-1 Experts (100% Data Retained)")
    cv_inner = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=SEED)
    inner_folds = list(cv_inner.split(df_tr, y_tr, df_tr["participant_id"]))

    tr_oof_a = np.zeros(len(y_tr))
    tr_oof_m = np.zeros(len(y_tr))

    for i_tr, i_val in inner_folds:
        # Inner Audio Expert
        clf_a_inner = build_audio_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum()))
        clf_a_inner.fit(X_tr_emb[i_tr], y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_inner.predict_proba(X_tr_emb[i_val])[:, 1]

        # Inner Clinical Expert
        clf_m_inner = build_clinical_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum()))
        clf_m_inner.fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_inner.predict_proba(X_tr_m[i_val])[:, 1]

    # ── LEVEL 1: TRAIN EXPERTS ON FULL TRAIN SET & PREDICT VAL/TEST ──
    print("[*] Training Level-1 Experts on Full Train Set")
    clf_a = build_audio_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_emb, y_tr)
    val_prob_a = clf_a.predict_proba(X_val_emb)[:,1]
    te_prob_a = clf_a.predict_proba(X_te_emb)[:,1]

    clf_m = build_clinical_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_m, y_tr)
    val_prob_m = clf_m.predict_proba(X_val_m)[:,1]
    te_prob_m = clf_m.predict_proba(X_te_m)[:,1]

    # ── LEVEL 2: THE SUPERVISOR (TRBL META-LEARNER) ──
    # Supervisor inputs: both expert probabilities plus the metadata features.
    print("[*] Training Level-2 Supervisor on OOF Features")
    X_tr_stack = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_val_stack = np.column_stack([val_prob_a, val_prob_m, X_val_m])
    X_te_stack = np.column_stack([te_prob_a, te_prob_m, X_te_m])

    supervisor = build_supervisor(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_stack, y_tr)
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid")
    cal_supervisor.fit(X_val_stack, y_val)

    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:,1]
    oof_stack[te_idx] = te_prob_stack

    fold_auc = roc_auc_score(y_te, te_prob_stack)
    print(f"[*] Fold {fold_i+1} Supervisor ROC-AUC: {fold_auc:.4f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold recording-level probabilities to the cough table.
cough_df["pred_stack"] = oof_stack

# Collapse to one row per participant: the label is taken from the first
# recording and the participant score is the MAX probability over recordings.
part_df = cough_df.groupby("participant_id").agg(
    label=("label", "first"), prob_stack=("pred_stack", "max")
).reset_index()

# Evaluate at recording level (m_*) and at participant level (p_*).
m_stack = full_eval(cough_df['label'], oof_stack)
p_stack = full_eval(part_df['label'], part_df['prob_stack'])

# Recording-level ROC curve, saved under the plots directory.
plot_curves(cough_df['label'], oof_stack, f"{FUSION_OUT}/plots/sota_stacking", "V7 Stacking")

def make_row(name, cough_m, part_m):
    """Build one summary-table row of 4-decimal strings for a model.

    Args:
        name: Display name of the model.
        cough_m: Recording-level metrics dict (as returned by ``full_eval``).
        part_m: Participant-level metrics dict.

    Returns:
        Dict of column name -> formatted value. Missing metrics default to 0.
        The Sens/Spec columns come from ``cough_m``'s ``sens_90`` entry.
    """
    def _fmt(value):
        return f"{value:.4f}"

    at_sens_90 = cough_m.get('tuned_thresholds', {}).get('sens_90', {})

    row = {"Model": name}
    row["ROC-AUC (recording)"] = _fmt(cough_m.get('roc_auc', 0))
    row["ROC-AUC (participant)"] = _fmt(part_m.get('roc_auc', 0))
    row["Sens@90%"] = _fmt(at_sens_90.get('sensitivity', 0))
    row["Spec@90%"] = _fmt(at_sens_90.get('specificity', 0))
    return row

# One-row summary table for this model version.
summary_df = pd.DataFrame([make_row("V7 (OOF CVPEM Stacking + TRBL)", m_stack, p_stack)])

print("\n" + "="*85)
print("REPORT-READY SUMMARY (VERSION 7 - SOTA LIMIT BREAKER)")
print("="*85)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT (this includes the embedding cache
# directory); archive member names are stored relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v7.zip"
with zipfile.ZipFile(zip_path,"w",zipfile.ZIP_DEFLATED) as zf:
    for root,_,files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root,fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V7 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
import os
import librosa
import pandas as pd
from tqdm.auto import tqdm

# Root directory from your screenshot
# Root directory of the raw audio inside the Kaggle dataset.
AUDIO_ROOT = "/kaggle/input/tb-audio/Tuberculosis/raw_data"

folder_stats = []

# Iterate through subfolders (longitudinal_data, solicited_data); folders that
# do not exist are skipped.
for folder_name in ['longitudinal_data', 'solicited_data']:
    folder_path = os.path.join(AUDIO_ROOT, folder_name)
    if not os.path.exists(folder_path):
        continue

    print(f"Processing folder: {folder_name}...")

    # Find all unique audio files in this specific folder
    files_in_folder = []
    for root, _, filenames in os.walk(folder_path):
        for fn in filenames:
            if fn.lower().endswith((".wav", ".ogg", ".flac", ".mp3")):
                files_in_folder.append(os.path.join(root, fn))

    unique_files = list(set(files_in_folder))

    durations = []
    sizes = []

    for path in tqdm(unique_files, desc=f"Analyzing {folder_name}", leave=False):
        try:
            # Fast duration check without loading the full waveform
            duration = librosa.get_duration(path=path)
            size_mb = os.path.getsize(path) / (1024 * 1024) # MB
        except Exception:
            # Best-effort: unreadable files are left out of the statistics.
            continue
        # Append both values only after both succeeded, so the two lists can
        # never get out of sync (previously a failing getsize left an extra
        # duration behind).
        durations.append(duration)
        sizes.append(size_mb)

    if durations:
        folder_stats.append({
            "Folder": folder_name,
            "Unique Files": len(durations),
            "Total Minutes": sum(durations) / 60,
            "Avg Sec": sum(durations) / len(durations),
            "Total Size (MB)": sum(sizes)
        })

# Display the final breakdown
stats_df = pd.DataFrame(folder_stats)
print("\n" + "="*60)
print("AUDIO DATASET BREAKDOWN BY FOLDER")
print("="*60)
print(stats_df.to_string(index=False))

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 8.1 - SOTA LIMIT BREAKER)
# Acoustic Tiling (Zero-Silence) + OOF Stacking + Non-Linear Supervisor
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations on Kaggle.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

# Metadata tables: clinical info and the solicited-cough recording index.
CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

# Audio / model constants.
SR = 16_000
WIN_SAMPLES = 32_000  # Google HeAR HARDCODED constraint (2.0s)
EMBED_DIM = 512
N_SPLITS = 5          
# Sensitivity levels at which operating thresholds are tuned.
TARGET_SENS = [0.85, 0.90, 0.95]

# Output Directories (V8.1)
OUT_ROOT = "/kaggle/working/outputs_v8_1"
FUSION_OUT = os.path.join(OUT_ROOT, "tiled_oof_stacking")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is only mixed into the embedding-cache key; the model itself is
# loaded below by a separate identifier.
# NOTE(review): the two identifiers differ ("google/hear-v1" vs "google/hear") — confirm intended.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_tiled2s_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename known column aliases to the canonical names used by the pipeline.

    Matching is case-insensitive. For each canonical name the aliases are
    tried in order and only the first one present is renamed.

    Args:
        df: Input DataFrame (not modified).

    Returns:
        A renamed copy with ``participant_id``, ``filename`` and
        ``label_raw`` columns where a matching alias was found.
    """
    alias_table = (
        ("participant_id", ["participant_id","participant","subject_id"]),
        ("filename", ["filename","file_name","audio_file","wav_file","cough_file"]),
        ("label_raw", ["tb_status","tb","label","target","tb_result"]),
    )
    by_lower = {c.lower(): c for c in df.columns}

    rename = {}
    for canonical, aliases in alias_table:
        hit = next((alias for alias in aliases if alias in by_lower), None)
        if hit is not None:
            rename[by_lower[hit]] = canonical
    return df.rename(columns=rename)

def binarise_label(series):
    """Map heterogeneous TB label encodings to 1, 0 or NaN.

    Recognised positive/negative strings are matched case-insensitively after
    stripping whitespace. Any other value is accepted only if it parses as the
    number 0 or 1; everything else (including other numbers such as "2" or
    "0.7", which previously leaked through as non-binary or truncated labels)
    becomes NaN so the caller can drop it.

    Args:
        series: pandas Series of raw labels.

    Returns:
        Series of 1 / 0 / NaN, aligned with ``series``.
    """
    positive = {"1","yes","positive","tb+","tb_positive","true","pos"}
    negative = {"0","no","negative","tb-","tb_negative","false","neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive:
            return 1
        if s in negative:
            return 0
        try:
            num = float(s)
        except ValueError:
            return np.nan
        # Only exact 0/1 are valid binary labels (covers e.g. "1.0").
        return int(num) if num in (0.0, 1.0) else np.nan

    return series.apply(_b)

# Load the recording index and the clinical table with canonical column names.
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the recording index carries no label, pull it from the clinical table.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise labels and drop recordings whose label could not be interpreted.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical columns whose name contains any of these keywords are excluded from
# the feature set (diagnostic results / label columns), as is participant_id.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

# Split the remaining clinical columns into numeric vs categorical by dtype.
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# One row per recording, with the participant's clinical features attached.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Index audio files on disk by file name, with and without extension.
# NOTE(review): files with the same name in different subfolders overwrite
# each other in this lookup — confirm names are unique across AUDIO_BASE.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each recording to a path; recordings without a file on disk are dropped.
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"[*] Total valid audio files mapped: {len(cough_df)}")
print(f"[*] Total unique participants: {cough_df['participant_id'].nunique()}")

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
# Folds are stratified on the label and grouped by participant, so all
# recordings of a participant fall in the same fold.
print("\n[*] Building Custom Stratified Group K-Folds...")
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# ── 4. ACOUSTIC TILING (ZERO-SILENCE) ───────────────────────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL & EXTRACTING AUDIO")
print("="*60)
# Best-effort model load: on any failure HEAR_SERVING is set to None, in which
# case _infer_batch returns all-zero embeddings instead of raising.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("[*] ✓ HeAR loaded successfully")
except Exception as e:
    print(f"[*] ⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio segments with the HeAR serving signature.

    Args:
        segments: Sequence of 1-D arrays of identical length.

    Returns:
        float32 array with one row per segment. When the model is not loaded
        (``HEAR_SERVING is None``) an all-zero ``(len(segments), EMBED_DIM)``
        array is returned instead.
    """
    if HEAR_SERVING is None:
        return np.zeros((len(segments), EMBED_DIM), np.float32)

    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    # The signature returns a dict; the first output is used as the embedding.
    first_output = list(outputs.values())[0]
    return first_output.numpy().astype(np.float32)

def load_tiled_audio(path):
    """Load an audio file and return a ``WIN_SAMPLES``-long window plus the original duration.

    Clips shorter than the window are tiled (repeated) to fill it instead of
    being padded with silence. For longer clips the window is centred on the
    peak of the smoothed RMS energy and clamped to the clip boundaries.

    Args:
        path: Path to the audio file.

    Returns:
        Tuple ``(segment, duration_seconds)``. For an empty or unreadable file
        the segment is all zeros and the duration is 0.0; load failures are
        reported on stdout (best-effort: one bad file must not abort the run).
    """
    try:
        audio, sr = librosa.load(str(path), sr=SR, mono=True)
        dur = len(audio) / sr

        if len(audio) == 0:
            return np.zeros(WIN_SAMPLES, np.float32), 0.0

        if len(audio) < WIN_SAMPLES:
            # TILING THE SIGNAL: e.g. a 0.5s clip is repeated 4x to fill 2.0s
            repeats = int(np.ceil(WIN_SAMPLES / len(audio)))
            audio = np.tile(audio, repeats)[:WIN_SAMPLES]
        else:
            # Energy peak detection: frame-wise RMS, smoothed over ~0.2s
            frame_len = 400; hop = 160
            frames = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop)
            rms = np.sqrt(np.mean(frames**2, axis=0))
            smooth_n = max(1, int(0.2 * sr / hop))
            rms_smooth = np.convolve(rms, np.ones(smooth_n)/smooth_n, mode="same")
            peak_fr = int(np.argmax(rms_smooth))
            center = peak_fr * hop + frame_len // 2

            # Centre the window on the peak, shifting it back inside the clip
            # when it would run past either end.
            start = max(0, center - WIN_SAMPLES // 2)
            end = start + WIN_SAMPLES
            if end > len(audio):
                end = len(audio); start = max(0, len(audio) - WIN_SAMPLES)
            audio = audio[start:end]

        return audio, dur
    except Exception as e:
        # Was a bare `except:` that also swallowed KeyboardInterrupt/SystemExit
        # and hid which files were being replaced by silence.
        print(f"[*] ⚠ Audio load failed for {path}: {e}")
        return np.zeros(WIN_SAMPLES, np.float32), 0.0

def get_tiled_embeddings(df_rows):
    """Return HeAR embeddings and clip durations for ``df_rows``, using an on-disk cache.

    Each recording is keyed by an MD5 of ``HEAR_VERSION`` and its
    ``audio_path``. Keys missing from the parquet cache at ``EMBED_CACHE`` are
    loaded with ``load_tiled_audio``, embedded in batches of 64 and appended
    to the cache together with their duration.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        Tuple ``(embeddings, durations)``: a float32 array of shape
        ``(len(df_rows), EMBED_DIM)`` and a float32 array of durations in
        seconds, both in row order. Rows without a cache entry stay zero.
    """
    cache = pd.DataFrame(columns=["key", "embedding", "duration"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as e:
            print(f"[*] ⚠ Embedding cache unreadable, starting empty: {e}")

    # Read the path column directly instead of iterating rows twice.
    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    # De-duplicate so a path appearing in several rows is embedded once.
    pending = {}
    for k, p in zip(keys, paths):
        if k not in cached_keys:
            pending.setdefault(k, p)

    new_entries = []
    buf_segs, buf_keys, buf_durs = [], [], []

    def _flush():
        embs = _infer_batch(buf_segs)
        new_entries.extend(
            {"key": k, "embedding": e.tolist(), "duration": d}
            for k, e, d in zip(buf_keys, embs, buf_durs)
        )
        buf_segs.clear()
        buf_keys.clear()
        buf_durs.clear()

    for k, p in tqdm(pending.items(), desc="Extracting Audio (Acoustic Tiling)", leave=False):
        seg, dur = load_tiled_audio(p)
        buf_segs.append(seg)
        buf_keys.append(k)
        buf_durs.append(dur)
        if len(buf_segs) >= 64:
            _flush()
    if buf_segs:
        _flush()

    if new_entries:
        cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        cache["key"] = cache["key"].astype(str)
        # Persist only genuine model outputs. When HeAR failed to load,
        # _infer_batch yields all-zero placeholders; writing those to disk
        # would poison every later run that does have the model.
        if HEAR_SERVING is not None:
            cache.to_parquet(EMBED_CACHE, index=False)
        else:
            print("[*] ⚠ HeAR unavailable — placeholder embeddings not written to cache")

    cache_dict = dict(zip(cache["key"], zip(cache["embedding"], cache["duration"])))
    embeddings = np.zeros((len(keys), EMBED_DIM), np.float32)
    durations = np.zeros(len(keys), np.float32)
    for i, k in enumerate(keys):
        if k in cache_dict:
            emb_val, dur_val = cache_dict[k]
            embeddings[i] = np.asarray(emb_val, np.float32)
            durations[i] = float(dur_val)

    return embeddings, durations

# ── 5. PREPROCESSING & OOF STACKING BUILDERS ────────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Assemble the (unfitted) ColumnTransformer used for the clinical metadata.

    Numeric columns: median imputation (with missing-value indicator columns)
    followed by standard scaling. Categorical columns: missing values replaced
    by the literal "Not_Available", then dense one-hot encoding that ignores
    categories unseen at fit time. Columns in neither list are dropped.

    Args:
        num_cols: names of numeric columns (may be empty).
        cat_cols: names of categorical columns (may be empty).

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    branches = []

    if num_cols:
        numeric_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median", add_indicator=True)),
            ("sc", StandardScaler()),
        ])
        branches.append(("num", numeric_pipe, num_cols))

    if cat_cols:
        categorical_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_pipe, cat_cols))

    return ColumnTransformer(branches, remainder="drop")

# LEVEL 1 EXPERTS
def build_audio_expert(n_pos, n_neg):
    """Create the unfitted level-1 classifier for the audio embeddings.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A LightGBM classifier whose positive class is re-weighted by
        ``n_neg / max(n_pos, 1)`` when LightGBM is available, otherwise a
        class-balanced logistic regression.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)

    booster_params = dict(
        n_estimators=300,
        learning_rate=0.03,
        num_leaves=31,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**booster_params)

def build_clinical_expert(n_pos, n_neg):
    """Create the unfitted level-1 classifier for the clinical metadata features.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A shallow LightGBM classifier (positive class re-weighted by
        ``n_neg / max(n_pos, 1)``) when LightGBM is available, otherwise a
        class-balanced logistic regression.
    """
    scale = n_neg / max(n_pos, 1)
    if HAS_LGB:
        return lgb.LGBMClassifier(n_estimators=200, learning_rate=0.03, num_leaves=15, max_depth=4, scale_pos_weight=scale, random_state=SEED, verbose=-1, n_jobs=-1)
    # Same iteration budget as the other LogisticRegression fallbacks in this file:
    # the default (100) can stop before convergence, and convergence warnings are
    # silenced globally, so an under-fitted model would go unnoticed.
    return LogisticRegression(class_weight="balanced", max_iter=2000)

# LEVEL 2 SUPERVISOR (Non-Linear LightGBM with TRBL)
def build_supervisor(n_pos, n_neg):
    """Create the unfitted level-2 (stacking) classifier.

    The positive class weight is the class ratio ``n_neg / max(n_pos, 1)``
    multiplied by 1.5.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A very shallow LightGBM classifier when LightGBM is available, otherwise
        a logistic regression using the same positive-class weight.
    """
    trbl_scale = (n_neg / max(n_pos, 1)) * 1.5
    if not HAS_LGB:
        fallback_weights = {0: 1.0, 1: trbl_scale}
        return LogisticRegression(class_weight=fallback_weights, max_iter=2000, random_state=SEED)

    # Deliberately tiny trees: the stacked feature matrix is easy to overfit.
    tree_params = dict(
        n_estimators=100,
        learning_rate=0.03,
        num_leaves=7,
        max_depth=3,
        min_child_samples=10,
        scale_pos_weight=trbl_scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**tree_params)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity at decision threshold ``t``.

    A sample is predicted positive when its score is ``>= t``.

    Args:
        y_true: array-like of binary labels (0 = negative, 1 = positive).
        y_prob: array-like of scores, same length as ``y_true``.
        t: decision threshold.

    Returns:
        dict with float entries ``threshold``, ``accuracy``, ``sensitivity`` and
        ``specificity``. A rate whose denominator is zero (no positives, or no
        negatives) is reported as 0.0.
    """
    truth = np.asarray(y_true)
    pred = (np.asarray(y_prob) >= t).astype(int)

    tp = int(np.sum((truth == 1) & (pred == 1)))
    fn = int(np.sum((truth == 1) & (pred == 0)))
    tn = int(np.sum((truth == 0) & (pred == 0)))
    fp = int(np.sum((truth == 0) & (pred == 1)))

    # Exact division with an explicit zero guard. The previous "+1e-9" in the
    # denominator pushed exact rates just below their true value (9/10 became
    # 0.8999999999), so a threshold that met a sensitivity target exactly was
    # rejected by callers comparing with ">= target".
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    accuracy = float(np.mean(truth == pred)) if truth.size else float("nan")

    return {"threshold": float(t), "accuracy": accuracy, "sensitivity": sensitivity, "specificity": specificity}

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick a decision threshold whose sensitivity is at least ``target``.

    Candidate thresholds are the distinct scores rounded to 4 decimals, scanned
    from highest to lowest. A candidate replaces the current choice when it
    reaches the target sensitivity and its specificity is at least as high as
    the best seen so far.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.
        target: minimum required sensitivity.

    Returns:
        The chosen threshold as a float; 0.0 when no candidate qualifies.
    """
    # np.unique already sorts ascending; reverse for a high-to-low scan.
    candidates = np.sort(np.unique(np.round(y_prob, 4)))[::-1]

    chosen = 0.0
    top_spec = 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        reaches_target = stats["sensitivity"] >= target
        not_worse = stats["specificity"] >= top_spec
        if reaches_target and not_worse:
            top_spec = stats["specificity"]
            chosen = cand
    return float(chosen)

def full_eval(y_true, y_prob):
    """Evaluate scores: ROC-AUC plus metrics at each sensitivity-tuned threshold.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.

    Returns:
        dict with ``roc_auc`` (NaN when only one class is present) and
        ``tuned_thresholds``, which maps ``"sens_<pct>"`` for every entry of
        ``TARGET_SENS`` to the metrics at the threshold chosen for that target.
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    tuned = {}
    for ts in TARGET_SENS:
        cut = find_thresh_for_sens(labels, scores, ts)
        tuned[f"sens_{int(ts*100)}"] = {"threshold": cut, **metrics_at_thresh(labels, scores, cut)}

    return {"roc_auc": auc_value, "tuned_thresholds": tuned}

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve for the given scores and save it as a PNG.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.
        path_prefix: output path without suffix; the file written is
            ``{path_prefix}_roc.png``.
        title_prefix: text placed before " ROC" in the plot title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    area = roc_auc_score(y_true, y_prob)

    figure, axes = plt.subplots(figsize=(5,4))
    axes.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={area:.3f}")
    # Chance diagonal for reference.
    axes.plot([0,1],[0,1],"--",color="gray",lw=1)
    axes.set(title=f"{title_prefix} ROC")
    axes.legend()

    figure.tight_layout()
    figure.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(figure)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\n" + "="*60)
print("3. STARTING V8.1 TRAINING (ACOUSTIC TILING + STACKING)")
print("="*60)

# Embed every recording exactly once. Row i of all_embs / all_durs belongs to
# row i of cough_df, so the folds below can slice these arrays directly.
print("[*] Pre-fetching audio to audit durations & tile...")
all_embs, all_durs = get_tiled_embeddings(cough_df)
print(f"    -> Audio Durations (seconds) | Min: {np.min(all_durs):.2f}s | Max: {np.max(all_durs):.2f}s | Mean: {np.mean(all_durs):.2f}s")

# Out-of-fold supervisor probability for every recording (aligned with cough_df).
oof_stack = np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Calibration hold-out (~20%). The split is stratified AND grouped by
    # participant: the previous sequential iloc slice could put recordings of one
    # participant on both sides (leaking into calibration) and did not preserve
    # the class balance.
    cal_cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, cal_pos = next(cal_cv.split(df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[cal_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # Slice the pre-fetched embeddings instead of re-reading the cache three
    # times per fold; tr_idx maps df_tr_full rows back to cough_df rows.
    X_tr_emb  = all_embs[tr_idx[fit_pos]]
    X_val_emb = all_embs[tr_idx[cal_pos]]
    X_te_emb  = all_embs[te_idx]

    # Preprocess Metadata (fitted on the fit split only)
    meta_prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m = meta_prep.fit_transform(df_tr)
    X_val_m = meta_prep.transform(df_val)
    X_te_m = meta_prep.transform(df_te)

    # ── LEVEL 1: GENERATE OUT-OF-FOLD (OOF) PROBABILITIES FOR TRAIN ──
    # The supervisor must be trained on expert predictions made for samples the
    # expert did not see, hence the participant-grouped inner CV.
    print("[*] Generating Out-Of-Fold Probabilities...")
    cv_inner = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=SEED)
    inner_folds = list(cv_inner.split(df_tr, y_tr, df_tr["participant_id"]))

    tr_oof_a = np.zeros(len(y_tr))
    tr_oof_m = np.zeros(len(y_tr))

    for i_tr, i_val in inner_folds:
        clf_a_inner = build_audio_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())).fit(X_tr_emb[i_tr], y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_inner.predict_proba(X_tr_emb[i_val])[:, 1]

        clf_m_inner = build_clinical_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())).fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_inner.predict_proba(X_tr_m[i_val])[:, 1]

    # ── LEVEL 1: TRAIN EXPERTS ON FULL TRAIN SET ──
    clf_a = build_audio_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_emb, y_tr)
    val_prob_a = clf_a.predict_proba(X_val_emb)[:,1]
    te_prob_a = clf_a.predict_proba(X_te_emb)[:,1]

    clf_m = build_clinical_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_m, y_tr)
    val_prob_m = clf_m.predict_proba(X_val_m)[:,1]
    te_prob_m = clf_m.predict_proba(X_te_m)[:,1]

    # ── LEVEL 2: THE NON-LINEAR SUPERVISOR ──
    # Stacked features: [audio expert prob, clinical expert prob, metadata features].
    X_tr_stack = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_val_stack = np.column_stack([val_prob_a, val_prob_m, X_val_m])
    X_te_stack = np.column_stack([te_prob_a, te_prob_m, X_te_m])

    supervisor = build_supervisor(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_stack, y_tr)
    # Sigmoid calibration of the already-fitted supervisor on the hold-out.
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid")
    cal_supervisor.fit(X_val_stack, y_val)

    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:,1]
    oof_stack[te_idx] = te_prob_stack

    print(f"[*] Fold {fold_i+1} Supervisor ROC-AUC: {roc_auc_score(y_te, te_prob_stack):.4f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold supervisor probability to every recording.
cough_df["pred_stack"] = oof_stack

# Participant-level score: the MAX recording probability per participant; the
# participant label is taken from the participant's first row.
part_df = cough_df.groupby("participant_id").agg(
    label=("label", "first"), prob_stack=("pred_stack", "max")
).reset_index()

# m_stack: recording-level metrics; p_stack: participant-level metrics.
m_stack = full_eval(cough_df['label'], oof_stack)
p_stack = full_eval(part_df['label'], part_df['prob_stack'])

# ROC plot of the recording-level scores, written under FUSION_OUT/plots.
plot_curves(cough_df['label'], oof_stack, f"{FUSION_OUT}/plots/sota_stacking", "V8.1 Stacking")

def make_row(name, cough_m, part_m):
    """Build one summary-table row from recording- and participant-level metrics.

    Args:
        name: model label shown in the "Model" column.
        cough_m: recording-level metrics dict (as returned by ``full_eval``).
        part_m: participant-level metrics dict.

    Returns:
        dict of column name -> value formatted to 4 decimals. Missing metrics
        are shown as 0.0000. The Sens/Spec columns come from ``cough_m``.
    """
    at_sens_90 = cough_m.get('tuned_thresholds',{}).get('sens_90',{})

    row = {"Model": name}
    row["ROC-AUC (recording)"] = format(cough_m.get('roc_auc', 0), ".4f")
    row["ROC-AUC (participant)"] = format(part_m.get('roc_auc', 0), ".4f")
    row["Sens@90%"] = format(at_sens_90.get('sensitivity',0), ".4f")
    row["Spec@90%"] = format(at_sens_90.get('specificity',0), ".4f")
    return row

# One-row summary table; the Sens@90%/Spec@90% columns are recording-level
# (make_row reads them from its second argument, m_stack).
summary_df = pd.DataFrame([make_row("V8.1 (Acoustic Tiling + LGBM Meta)", m_stack, p_stack)])

print("\n" + "="*85)
print("REPORT-READY SUMMARY (VERSION 8.1 - SOTA LIMIT BREAKER)")
print("="*85)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT; archive member names are stored relative
# to /kaggle/working.
zip_path = "/kaggle/working/outputs_v8_1.zip"
with zipfile.ZipFile(zip_path,"w",zipfile.ZIP_DEFLATED) as zf:
    for root,_,files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root,fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V8.1 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 9 - THE DEPLOYMENT CANDIDATE)
# Mirrored Tiling (No Artifacts) + OOF Stacking + Mean Probability Voting
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Silences ALL warnings for the rest of the cell (this includes scikit-learn
# convergence and deprecation warnings).
warnings.filterwarnings("ignore")

# Single seed shared by Python's and NumPy's global RNGs and by every model below.
SEED = 42
random.seed(SEED); np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, brier_score_loss, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations (Kaggle input mount).
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

# Per-participant clinical table and per-recording table.
CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR = 16_000           # sample rate (Hz) every recording is loaded at
WIN_SAMPLES = 32_000  # Google HeAR HARDCODED constraint (2.0s)
EMBED_DIM = 512       # width of one embedding vector
N_SPLITS = 5          # number of outer cross-validation folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets used for threshold tuning

# Output Directories (V9)
OUT_ROOT = "/kaggle/working/outputs_v9"
FUSION_OUT = os.path.join(OUT_ROOT, "mirrored_oof_stacking")
CACHE_DIR = os.path.join(OUT_ROOT, "cache")
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# HEAR_VERSION is only used to salt the embedding-cache keys; the model itself
# is loaded further down from the "google/hear" repository.
HEAR_VERSION = "google/hear-v1"
EMBED_CACHE  = os.path.join(CACHE_DIR, "hear_mirrored_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename id / filename / label columns to the canonical names used downstream.

    Column names are matched case-insensitively against a prioritised list of
    aliases; for each canonical name only the first alias present is renamed.

    Args:
        df: input DataFrame (not modified).

    Returns:
        A renamed copy with (where found) ``participant_id``, ``filename`` and
        ``label_raw`` columns.
    """
    by_lowercase = {c.lower(): c for c in df.columns}
    alias_table = (
        ("participant_id", ["participant_id","participant","subject_id"]),
        ("filename", ["filename","file_name","audio_file","wav_file","cough_file"]),
        ("label_raw", ["tb_status","tb","label","target","tb_result"]),
    )

    mapping = {}
    for canonical, aliases in alias_table:
        first_hit = next((alias for alias in aliases if alias in by_lowercase), None)
        if first_hit is not None:
            mapping[by_lowercase[first_hit]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw TB status values to 1 (positive), 0 (negative) or NaN (unknown).

    Matching is case-insensitive and ignores surrounding whitespace. Besides the
    textual aliases, numeric strings are accepted only when they equal exactly
    0 or 1 (e.g. "1.0"). Anything else becomes NaN so the caller can drop the
    row; previously ``int(float(s))`` turned "0.7" into 0 and let values such as
    "2" through as a third class.

    Args:
        series: pandas Series of raw label values.

    Returns:
        Series of the same length holding 1, 0 or NaN.
    """
    positive = {"1","yes","positive","tb+","tb_positive","true","pos"}
    negative = {"0","no","negative","tb-","tb_negative","false","neg"}

    def _b(v):
        if pd.isna(v): return np.nan
        s = str(v).strip().lower()
        if s in positive: return 1
        if s in negative: return 0
        try:
            num = float(s)
        except ValueError:
            # Not a number and not a known alias.
            return np.nan
        if num == 1.0: return 1
        if num == 0.0: return 0
        return np.nan

    return series.apply(_b)

# Load both metadata tables with canonical column names.
df_audio = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the recording table carries no label, pull it from the clinical table.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise the label and drop recordings whose label could not be interpreted.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Any clinical column whose name contains one of these keywords is excluded from
# the feature set (diagnostic results / label columns would leak the target).
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular","confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []

# Remaining columns: 32/64-bit int and float dtypes are numeric features,
# every other dtype is treated as categorical.
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# One row per recording, with the selected clinical features attached.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols], on="participant_id", how="left")

# Index every audio file under AUDIO_BASE both by full file name and by stem.
# If the same name occurs in several directories, the last one visited wins.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename (exact match first, then by stem); recordings
# without a file on disk are dropped.
cough_df["audio_path"] = cough_df["filename"].apply(lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"[*] Total valid audio files mapped: {len(cough_df)}")
print(f"[*] Total unique participants: {cough_df['participant_id'].nunique()}")

# ── 3. STRATIFIED GROUP K-FOLD ──────────────────────────────────────────────
print("\n[*] Building Custom Stratified Group K-Folds...")
# Outer CV: stratified on the label and grouped by participant, so all recordings
# of one participant land in the same fold.
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# Materialised as a list of (train_positions, test_positions) into cough_df.
folds = list(sgkf.split(cough_df, cough_df["label"], cough_df["participant_id"]))

# ── 4. ACOUSTIC REFLECTION (ZERO-ARTIFACT TILING) ───────────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL & EXTRACTING AUDIO")
print("="*60)
# Best-effort model load: authenticate to Hugging Face with the Kaggle secret
# "HF_TOKEN", download "google/hear" and keep its "serving_default" signature.
# On ANY failure HEAR_SERVING is set to None and only a message is printed;
# _infer_batch then returns all-zero embeddings, so check this output before
# trusting downstream metrics.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("[*] ✓ HeAR loaded successfully")
except Exception as e:
    print(f"[*] ⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments):
    """Embed a batch of equal-length audio windows with the HeAR serving signature.

    Args:
        segments: sequence of 1-D arrays, all of the same length.

    Returns:
        float32 array with one row per segment. When the model is not loaded
        (``HEAR_SERVING is None``) an all-zero ``(len(segments), EMBED_DIM)``
        array is returned instead.
    """
    if HEAR_SERVING is not None:
        batch = tf.constant(np.stack(segments), dtype=tf.float32)
        outputs = HEAR_SERVING(x=batch)
        # The signature returns a dict; its first value is used as the embedding.
        first_output = list(outputs.values())[0]
        return first_output.numpy().astype(np.float32)
    return np.zeros((len(segments), EMBED_DIM), np.float32)

def load_reflected_audio(path):
    """Load one recording and return a window of exactly ``WIN_SAMPLES`` samples.

    * Shorter than the window: the signal is extended by mirrored tiling
      (signal followed by its reverse, repeated) and cut to ``WIN_SAMPLES``.
    * Otherwise: a ``WIN_SAMPLES`` window centred on the peak of the smoothed
      RMS energy is cut out, shifted as needed to stay inside the recording.

    Args:
        path: path of the audio file.

    Returns:
        ``(window, duration_seconds)``. If the file is empty or cannot be
        loaded/processed, a zero window and a duration of 0.0 are returned; load
        failures are reported on stdout instead of being swallowed silently.
    """
    try:
        audio, sr = librosa.load(str(path), sr=SR, mono=True)
        dur = len(audio) / sr

        if len(audio) == 0:
            return np.zeros(WIN_SAMPLES, np.float32), 0.0

        if len(audio) < WIN_SAMPLES:
            # MIRRORED TILING (Acoustic Reflection):
            # Normal tiling [1, 2, 3] -> [1, 2, 3 | 1, 2, 3] creates a harsh "click" at 3->1.
            # Mirrored tiling [1, 2, 3] -> [1, 2, 3, 3, 2, 1 | 1, 2, 3] keeps the amplitude
            # continuous at every seam.
            audio_mirrored = np.concatenate((audio, audio[::-1]))
            repeats = int(np.ceil(WIN_SAMPLES / len(audio_mirrored)))
            audio = np.tile(audio_mirrored, repeats)[:WIN_SAMPLES]
        else:
            # Energy peak detection for recordings at least one window long:
            # frame-wise RMS, smoothed with a moving average of about 0.2 s.
            frame_len = 400; hop = 160
            frames = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop)
            rms = np.sqrt(np.mean(frames**2, axis=0))
            smooth_n = max(1, int(0.2 * sr / hop))
            rms_smooth = np.convolve(rms, np.ones(smooth_n)/smooth_n, mode="same")
            peak_fr = int(np.argmax(rms_smooth))
            center = peak_fr * hop + frame_len // 2

            # Centre the window on the peak, clamped to the recording bounds.
            start = max(0, center - WIN_SAMPLES // 2)
            end = start + WIN_SAMPLES
            if end > len(audio):
                end = len(audio); start = max(0, len(audio) - WIN_SAMPLES)
            audio = audio[start:end]

        return audio, dur
    except Exception as exc:
        # Still best-effort (one bad file must not abort extraction), but no longer
        # a bare "except:": Ctrl-C propagates and the failing file is named.
        print(f"[!] Could not load audio '{path}': {exc} (using a silent window)")
        return np.zeros(WIN_SAMPLES, np.float32), 0.0

def get_reflected_embeddings(df_rows):
    """Return HeAR embeddings and durations for every row of ``df_rows``.

    Results are memoised in the parquet file ``EMBED_CACHE``, keyed by the MD5 of
    ``"{HEAR_VERSION}::{audio_path}"``. Only paths missing from the cache are
    loaded with ``load_reflected_audio`` and embedded, in batches of 64.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        ``(embeddings, durations)``: float32 arrays of shape
        ``(len(df_rows), EMBED_DIM)`` and ``(len(df_rows),)``, aligned
        positionally with ``df_rows``.
    """
    cache = pd.DataFrame(columns=["key", "embedding", "duration"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as exc:
            # An unreadable cache is not fatal: start from an empty one.
            print(f"[!] Could not read embedding cache '{EMBED_CACHE}': {exc} (rebuilding)")

    # Read the column once instead of walking the frame twice with iterrows().
    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(f"{HEAR_VERSION}::{p}".encode()).hexdigest() for p in paths]

    N = len(paths)
    embeddings = np.zeros((N, EMBED_DIM), np.float32)
    durations = np.zeros(N, np.float32)

    # key -> (embedding, duration) for everything already cached.
    known = dict(zip(cache["key"], zip(cache["embedding"], cache["duration"])))

    # Unique uncached recordings in first-seen order, so a path that occurs
    # several times is embedded (and cached) only once.
    pending = {}
    for k, p in zip(keys, paths):
        if k not in known:
            pending.setdefault(k, p)
    todo = list(pending.items())

    buf_segs, buf_keys, buf_durs = [], [], []
    new_entries = []

    for n_done, (k, p) in enumerate(tqdm(todo, desc="Extracting Audio (Mirrored)", leave=False), start=1):
        seg, dur = load_reflected_audio(p)
        buf_segs.append(seg)
        buf_keys.append(k)
        buf_durs.append(dur)

        # Flush when the batch is full or after the very last recording.
        if len(buf_segs) >= 64 or n_done == len(todo):
            embs = _infer_batch(buf_segs)
            new_entries.extend([{"key": bk, "embedding": e.tolist(), "duration": d} for bk, e, d in zip(buf_keys, embs, buf_durs)])
            buf_segs, buf_keys, buf_durs = [], [], []

    if new_entries:
        for entry in new_entries:
            known[entry["key"]] = (entry["embedding"], entry["duration"])

        if HEAR_SERVING is not None:
            cache = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
            cache["key"] = cache["key"].astype(str)
            cache.to_parquet(EMBED_CACHE, index=False)
        else:
            # Without the model _infer_batch returns zeros. Persisting them would
            # poison the cache: a later run WITH the model would keep serving the
            # zero vectors because their keys are already present.
            print("[!] HeAR model unavailable: placeholder embeddings were NOT written to the cache")

    for i, k in enumerate(keys):
        hit = known.get(k)
        if hit is not None:
            embeddings[i] = np.asarray(hit[0], dtype=np.float32)
            durations[i] = float(hit[1])

    return embeddings, durations

# ── 5. PREPROCESSING & OOF STACKING BUILDERS ────────────────────────────────
def build_meta_preprocessor(num_cols, cat_cols):
    """Assemble the (unfitted) ColumnTransformer used for the clinical metadata.

    Numeric columns: median imputation (with missing-value indicator columns)
    followed by standard scaling. Categorical columns: missing values replaced
    by the literal "Not_Available", then dense one-hot encoding that ignores
    categories unseen at fit time. Columns in neither list are dropped.

    Args:
        num_cols: names of numeric columns (may be empty).
        cat_cols: names of categorical columns (may be empty).

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    branches = []

    if num_cols:
        numeric_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median", add_indicator=True)),
            ("sc", StandardScaler()),
        ])
        branches.append(("num", numeric_pipe, num_cols))

    if cat_cols:
        categorical_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_pipe, cat_cols))

    return ColumnTransformer(branches, remainder="drop")

# LEVEL 1 EXPERTS
def build_audio_expert(n_pos, n_neg):
    """Create the unfitted level-1 classifier for the audio embeddings.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A LightGBM classifier whose positive class is re-weighted by
        ``n_neg / max(n_pos, 1)`` when LightGBM is available, otherwise a
        class-balanced logistic regression.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=2000)

    booster_params = dict(
        n_estimators=300,
        learning_rate=0.03,
        num_leaves=31,
        # Each tree only sees 30% of the embedding dimensions, which pushes the
        # model to spread its splits across the feature space.
        colsample_bytree=0.3,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**booster_params)

def build_clinical_expert(n_pos, n_neg):
    """Create the unfitted level-1 classifier for the clinical metadata features.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A shallow LightGBM classifier (positive class re-weighted by
        ``n_neg / max(n_pos, 1)``) when LightGBM is available, otherwise a
        class-balanced logistic regression.
    """
    scale = n_neg / max(n_pos, 1)
    if HAS_LGB:
        return lgb.LGBMClassifier(n_estimators=200, learning_rate=0.03, num_leaves=15, max_depth=4, scale_pos_weight=scale, random_state=SEED, verbose=-1, n_jobs=-1)
    # Same iteration budget as the other LogisticRegression fallbacks in this file:
    # the default (100) can stop before convergence, and convergence warnings are
    # silenced globally, so an under-fitted model would go unnoticed.
    return LogisticRegression(class_weight="balanced", max_iter=2000)

# LEVEL 2 SUPERVISOR (Non-Linear LightGBM with TRBL)
def build_supervisor(n_pos, n_neg):
    """Create the unfitted level-2 (stacking) classifier.

    The positive class weight is the class ratio ``n_neg / max(n_pos, 1)``
    multiplied by 1.5.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.

    Returns:
        A very shallow LightGBM classifier when LightGBM is available, otherwise
        a logistic regression using the same positive-class weight.
    """
    trbl_scale = (n_neg / max(n_pos, 1)) * 1.5
    if not HAS_LGB:
        fallback_weights = {0: 1.0, 1: trbl_scale}
        return LogisticRegression(class_weight=fallback_weights, max_iter=2000, random_state=SEED)

    tree_params = dict(
        n_estimators=100,
        learning_rate=0.03,
        num_leaves=7,
        max_depth=3,
        min_child_samples=10,
        scale_pos_weight=trbl_scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**tree_params)

# ── EVALUATION HELPERS ──────────────────────────────────────────────────────
def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute accuracy, sensitivity and specificity at decision threshold ``t``.

    A sample is predicted positive when its score is ``>= t``.

    Args:
        y_true: array-like of binary labels (0 = negative, 1 = positive).
        y_prob: array-like of scores, same length as ``y_true``.
        t: decision threshold.

    Returns:
        dict with float entries ``threshold``, ``accuracy``, ``sensitivity`` and
        ``specificity``. A rate whose denominator is zero (no positives, or no
        negatives) is reported as 0.0.
    """
    truth = np.asarray(y_true)
    pred = (np.asarray(y_prob) >= t).astype(int)

    tp = int(np.sum((truth == 1) & (pred == 1)))
    fn = int(np.sum((truth == 1) & (pred == 0)))
    tn = int(np.sum((truth == 0) & (pred == 0)))
    fp = int(np.sum((truth == 0) & (pred == 1)))

    # Exact division with an explicit zero guard. The previous "+1e-9" in the
    # denominator pushed exact rates just below their true value (9/10 became
    # 0.8999999999), so a threshold that met a sensitivity target exactly was
    # rejected by callers comparing with ">= target".
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    accuracy = float(np.mean(truth == pred)) if truth.size else float("nan")

    return {"threshold": float(t), "accuracy": accuracy, "sensitivity": sensitivity, "specificity": specificity}

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick a decision threshold whose sensitivity is at least ``target``.

    Candidate thresholds are the distinct scores rounded to 4 decimals, scanned
    from highest to lowest. A candidate replaces the current choice when it
    reaches the target sensitivity and its specificity is at least as high as
    the best seen so far.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.
        target: minimum required sensitivity.

    Returns:
        The chosen threshold as a float; 0.0 when no candidate qualifies.
    """
    # np.unique already sorts ascending; reverse for a high-to-low scan.
    candidates = np.sort(np.unique(np.round(y_prob, 4)))[::-1]

    chosen = 0.0
    top_spec = 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        reaches_target = stats["sensitivity"] >= target
        not_worse = stats["specificity"] >= top_spec
        if reaches_target and not_worse:
            top_spec = stats["specificity"]
            chosen = cand
    return float(chosen)

def full_eval(y_true, y_prob):
    """Evaluate scores: ROC-AUC plus metrics at each sensitivity-tuned threshold.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.

    Returns:
        dict with ``roc_auc`` (NaN when only one class is present) and
        ``tuned_thresholds``, which maps ``"sens_<pct>"`` for every entry of
        ``TARGET_SENS`` to the metrics at the threshold chosen for that target.
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    tuned = {}
    for ts in TARGET_SENS:
        cut = find_thresh_for_sens(labels, scores, ts)
        tuned[f"sens_{int(ts*100)}"] = {"threshold": cut, **metrics_at_thresh(labels, scores, cut)}

    return {"roc_auc": auc_value, "tuned_thresholds": tuned}

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Draw the ROC curve for the given scores and save it as a PNG.

    Args:
        y_true: array-like of binary labels.
        y_prob: array-like of scores.
        path_prefix: output path without suffix; the file written is
            ``{path_prefix}_roc.png``.
        title_prefix: text placed before " ROC" in the plot title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    area = roc_auc_score(y_true, y_prob)

    figure, axes = plt.subplots(figsize=(5,4))
    axes.plot(false_pos_rate, true_pos_rate, color="#e63946", lw=2, label=f"AUC={area:.3f}")
    # Chance diagonal for reference.
    axes.plot([0,1],[0,1],"--",color="gray",lw=1)
    axes.set(title=f"{title_prefix} ROC")
    axes.legend()

    figure.tight_layout()
    figure.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(figure)

# ── 6. TRAINING & EVALUATION LOOP ───────────────────────────────────────────
print("\n" + "="*60)
print("3. STARTING V9 TRAINING (MIRRORED AUDIO + MEAN POOLING)")
print("="*60)

# Embed every recording exactly once. Row i of all_embs belongs to row i of
# cough_df, so the folds below can slice this array directly.
print("[*] Pre-fetching audio to audit durations & apply acoustic reflection...")
all_embs, all_durs = get_reflected_embeddings(cough_df)

# Out-of-fold supervisor probability for every recording (aligned with cough_df).
oof_stack = np.zeros(len(cough_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full = cough_df.iloc[tr_idx].reset_index(drop=True)
    df_te      = cough_df.iloc[te_idx].reset_index(drop=True)

    # Calibration hold-out (~20%). The split is stratified AND grouped by
    # participant: the previous sequential iloc slice could put recordings of one
    # participant on both sides (leaking into calibration) and did not preserve
    # the class balance.
    cal_cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    fit_pos, cal_pos = next(cal_cv.split(df_tr_full, df_tr_full["label"], df_tr_full["participant_id"]))
    df_tr  = df_tr_full.iloc[fit_pos].reset_index(drop=True)
    df_val = df_tr_full.iloc[cal_pos].reset_index(drop=True)

    y_tr, y_val, y_te = df_tr["label"].values, df_val["label"].values, df_te["label"].values

    # Slice the pre-fetched embeddings instead of re-reading the cache three
    # times per fold; tr_idx maps df_tr_full rows back to cough_df rows.
    X_tr_emb  = all_embs[tr_idx[fit_pos]]
    X_val_emb = all_embs[tr_idx[cal_pos]]
    X_te_emb  = all_embs[te_idx]

    # Preprocess Metadata (fitted on the fit split only)
    meta_prep = build_meta_preprocessor(num_cols, cat_cols)
    X_tr_m = meta_prep.fit_transform(df_tr)
    X_val_m = meta_prep.transform(df_val)
    X_te_m = meta_prep.transform(df_te)

    # ── LEVEL 1: GENERATE OUT-OF-FOLD (OOF) PROBABILITIES FOR TRAIN ──
    # The supervisor must be trained on expert predictions made for samples the
    # expert did not see, hence the participant-grouped inner CV.
    print("[*] Generating Out-Of-Fold Probabilities...")
    cv_inner = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=SEED)
    inner_folds = list(cv_inner.split(df_tr, y_tr, df_tr["participant_id"]))

    tr_oof_a = np.zeros(len(y_tr))
    tr_oof_m = np.zeros(len(y_tr))

    for i_tr, i_val in inner_folds:
        clf_a_inner = build_audio_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())).fit(X_tr_emb[i_tr], y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_inner.predict_proba(X_tr_emb[i_val])[:, 1]

        clf_m_inner = build_clinical_expert(int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())).fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_inner.predict_proba(X_tr_m[i_val])[:, 1]

    # ── LEVEL 1: TRAIN EXPERTS ON FULL TRAIN SET ──
    clf_a = build_audio_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_emb, y_tr)
    val_prob_a = clf_a.predict_proba(X_val_emb)[:,1]
    te_prob_a = clf_a.predict_proba(X_te_emb)[:,1]

    clf_m = build_clinical_expert(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_m, y_tr)
    val_prob_m = clf_m.predict_proba(X_val_m)[:,1]
    te_prob_m = clf_m.predict_proba(X_te_m)[:,1]

    # ── LEVEL 2: THE NON-LINEAR SUPERVISOR ──
    # Stacked features: [audio expert prob, clinical expert prob, metadata features].
    X_tr_stack = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_val_stack = np.column_stack([val_prob_a, val_prob_m, X_val_m])
    X_te_stack = np.column_stack([te_prob_a, te_prob_m, X_te_m])

    supervisor = build_supervisor(int(y_tr.sum()), int((y_tr==0).sum())).fit(X_tr_stack, y_tr)
    # Sigmoid calibration of the already-fitted supervisor on the hold-out.
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid")
    cal_supervisor.fit(X_val_stack, y_val)

    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:,1]
    oof_stack[te_idx] = te_prob_stack

    print(f"[*] Fold {fold_i+1} Supervisor ROC-AUC: {roc_auc_score(y_te, te_prob_stack):.4f}")

# ── 7. FINAL SCORES & REPORTING ─────────────────────────────────────────────
# Attach the out-of-fold supervisor probability to every recording.
cough_df["pred_stack"] = oof_stack

# ----------------------------------------------------------------------------
# THE SPECIFICITY FIX: aggregate recordings per participant with 'mean' instead
# of 'max', so that a single high-scoring recording cannot flag the participant
# on its own. The participant label is taken from the participant's first row.
# ----------------------------------------------------------------------------
part_df = cough_df.groupby("participant_id").agg(
    label=("label", "first"), 
    prob_stack=("pred_stack", "mean")  
).reset_index()

# m_stack: recording-level metrics; p_stack: participant-level metrics.
m_stack = full_eval(cough_df['label'], oof_stack)
p_stack = full_eval(part_df['label'], part_df['prob_stack'])

# ROC plot of the recording-level scores, written under FUSION_OUT/plots.
plot_curves(cough_df['label'], oof_stack, f"{FUSION_OUT}/plots/sota_stacking", "V9 Stacking")

def make_row(name, cough_m, part_m):
    """Build one summary-table row from recording- and participant-level metrics.

    Args:
        name: model label shown in the "Model" column.
        cough_m: recording-level metrics dict (as returned by ``full_eval``).
        part_m: participant-level metrics dict.

    Returns:
        dict of column name -> value formatted to 4 decimals. Missing metrics
        are shown as 0.0000. The Sens/Spec columns come from ``cough_m``.
    """
    at_sens_90 = cough_m.get('tuned_thresholds',{}).get('sens_90',{})

    row = {"Model": name}
    row["ROC-AUC (recording)"] = format(cough_m.get('roc_auc', 0), ".4f")
    row["ROC-AUC (participant)"] = format(part_m.get('roc_auc', 0), ".4f")
    row["Sens@90%"] = format(at_sens_90.get('sensitivity',0), ".4f")
    row["Spec@90%"] = format(at_sens_90.get('specificity',0), ".4f")
    return row

# One-row summary table; the Sens@90%/Spec@90% columns are recording-level
# (make_row reads them from its second argument, m_stack).
summary_df = pd.DataFrame([make_row("V9 (Mirrored Audio + Mean Voting)", m_stack, p_stack)])

print("\n" + "="*90)
print("REPORT-READY SUMMARY (VERSION 9 - DEPLOYMENT CANDIDATE)")
print("="*90)
print(summary_df.to_string(index=False))

# Archive everything under OUT_ROOT (plots and the embedding cache); archive
# member names are stored relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v9.zip"
with zipfile.ZipFile(zip_path,"w",zipfile.ZIP_DEFLATED) as zf:
    for root,_,files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root,fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))
print(f"\n✅ All V9 Results Zipped to: {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
# ============================================================================
# TB SCREENING RANKER — CODA-TB DATASET (VERSION 10 - RESEARCH CANDIDATE)
# ============================================================================
# KEY IMPROVEMENTS OVER V9:
#
#  1. MULTI-WINDOW EMBEDDING: Every cough recording is sliced into ALL possible
#     2-second windows (with 50% overlap), each embedded by HeAR, then
#     aggregated (mean + std + [25,50,75] percentiles = 5x512 = 2560-dim feature).
#     This replaces the single-window "energy peak" approach that discards signal.
#
#  2. PARTICIPANT-LEVEL CLASSIFICATION (no late-fusion): All cough embeddings
#     from a participant are aggregated BEFORE classification, not after.
#     The model sees a 2560-dim "acoustic fingerprint" per participant,
#     combined with clinical meta. This avoids noisy recording-level predictions.
#
#  3. PARTIAL AUC (pAUC) AS OBJECTIVE: The CODA challenge metric is specificity
#     at ≥90% sensitivity. We add a pAUC metric (TPR in [0.85, 1.0]) and tune
#     all thresholds against it.
#
#  4. COUNTRY AS EXPLICIT FEATURE: AUROC varies dramatically by country in this
#     dataset (0.63-0.81 in the challenge). Country is now a hard-coded feature.
#
#  5. EMBEDDING NOISE AUGMENTATION: Gaussian noise injection on HeAR embeddings
#     during training (σ=0.01) provides regularization without touching audio.
#
#  6. FIXED CALIBRATION SPLIT: Uses a proper StratifiedGroupKFold inner split
#     for calibration data, not a naive sequential iloc slice.
#
#  7. COUGH COUNT AS FEATURE: Number of valid cough segments per participant
#     is included as an explicit clinical feature.
#
#  8. CLEANER CACHE: Cache key now includes a content hash of the audio path
#     only (no version string that can silently go stale).
# ============================================================================

import os, sys, json, warnings, random, hashlib, zipfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt
from scipy import stats as sp_stats

# Silences ALL warnings for the rest of the cell (this includes scikit-learn
# convergence and deprecation warnings).
warnings.filterwarnings("ignore")

# Single seed shared by Python's and NumPy's global RNGs.
SEED = 42
random.seed(SEED); np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             f1_score, confusion_matrix, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION ────────────────────────────────────────────────────────
# Dataset locations (Kaggle input mount).
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

# Per-participant clinical table and per-recording table.
CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR          = 16_000   # sample rate (Hz)
WIN_SAMPLES = 32_000   # 2s @ 16kHz — HeAR hard constraint
HOP_SAMPLES = 16_000   # 50% overlap for multi-window extraction
EMBED_DIM   = 512      # width of one embedding vector
N_SPLITS    = 5        # number of cross-validation folds
TARGET_SENS = [0.85, 0.90, 0.95]  # sensitivity targets for threshold tuning
PAUC_LOW    = 0.85     # pAUC window: specificity-at-sensitivity >= PAUC_LOW

# Aggregation stats per participant (mean + std + 3 percentiles = 5 vectors)
AGG_FUNCS   = ["mean", "std", "p25", "p50", "p75"]
AGG_DIM     = EMBED_DIM * len(AGG_FUNCS)  # 2560

# Embedding augmentation noise (Gaussian, applied only during training)
EMB_NOISE_STD = 0.01

# Output Directories (created up front so later writes cannot fail on a
# missing folder)
OUT_ROOT   = "/kaggle/working/outputs_v10"
FUSION_OUT = os.path.join(OUT_ROOT, "multiwindow_participant")
CACHE_DIR  = os.path.join(OUT_ROOT, "cache")
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots"]:
    os.makedirs(d, exist_ok=True)

# Parquet file holding the cached embeddings for this version.
EMBED_CACHE = os.path.join(CACHE_DIR, "hear_multiwindow_embeddings.parquet")

# ── 2. DATA LOADING & MERGING ───────────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename id / filename / label columns to the canonical names used downstream.

    Column names are matched case-insensitively against a prioritised list of
    aliases; for each canonical name only the first alias present is renamed.

    Args:
        df: input DataFrame (not modified).

    Returns:
        A renamed copy with (where found) ``participant_id``, ``filename`` and
        ``label_raw`` columns.
    """
    by_lowercase = {c.lower(): c for c in df.columns}
    alias_table = (
        ("participant_id", ["participant_id","participant","subject_id"]),
        ("filename", ["filename","file_name","audio_file","wav_file","cough_file"]),
        ("label_raw", ["tb_status","tb","label","target","tb_result"]),
    )

    mapping = {}
    for canonical, aliases in alias_table:
        first_hit = next((alias for alias in aliases if alias in by_lowercase), None)
        if first_hit is not None:
            mapping[by_lowercase[first_hit]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """
    Map a raw TB-status column to {1, 0, NaN}.

    Recognised textual spellings (case/whitespace-insensitive) map to 1 or 0.
    Anything else is parsed as a number and accepted only if it equals exactly
    1 or 0. Every other value (missing, unparseable, or a number such as 2 or
    0.7) becomes NaN so the caller can drop the row, instead of being
    truncated to a wrong class by int().
    """
    positive = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negative = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _to_binary(value):
        if pd.isna(value):
            return np.nan
        token = str(value).strip().lower()
        if token in positive:
            return 1
        if token in negative:
            return 0
        try:
            number = float(token)
        except ValueError:
            return np.nan
        if number == 1:
            return 1
        if number == 0:
            return 0
        # Non-binary numeric code: not a valid label for a binary task.
        return np.nan

    return series.apply(_to_binary)

# Read the solicited-cough table and the clinical table and give both the
# canonical participant_id / filename / label_raw column names.
df_audio    = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# When the audio table carries no label column, pull label_raw from the
# clinical table via participant_id.
# NOTE(review): df_clinical is not de-duplicated first — if it can hold more
# than one row per participant this left-merge multiplies audio rows; confirm.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise the raw label, drop rows whose label could not be resolved, and
# only then cast to int (NaN cannot be stored in an int column).
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# ── Clinical feature selection (exclude post-diagnostic leakage) ─────────────
# A clinical column whose (lower-cased) name contains any of these keywords is
# never used as a feature; participant_id is excluded as the join key.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular",
                "confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# Ensure country column is treated as categorical (high-value feature).
# Only columns that already passed the leakage filter are eligible, so a
# filtered column (e.g. one whose name contains both "sputum" and "site") can
# never be re-introduced. A numerically coded match is MOVED from num_cols to
# cat_cols so it is never listed twice (a duplicate selection would create
# duplicate column names in cough_df).
country_col = None
eligible = [c for c in df_clinical.columns if c in num_cols or c in cat_cols]
for hint in ["country", "site", "country_id", "collection_country"]:
    matches = [c for c in eligible if hint in c.lower()]
    if matches:
        country_col = matches[0]
        if country_col in num_cols:
            num_cols.remove(country_col)
            # Store codes as strings (NaN preserved) so the categorical
            # imputer's string fill value is valid for this column.
            df_clinical[country_col] = df_clinical[country_col].map(
                lambda v: v if pd.isna(v) else str(v))
        if country_col not in cat_cols:
            cat_cols.append(country_col)
        break

if country_col:
    print(f"[*] Country feature found: '{country_col}'")
else:
    print("[!] WARNING: No country column found. Country is a top-3 feature for TB!")

# Attach the selected clinical features to every recording row.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols],
                          on="participant_id", how="left")

# ── Audio file mapping ────────────────────────────────────────────────────────
# Index every audio file under AUDIO_BASE twice: by its file name and by its
# name without extension, so metadata entries match with or without a suffix.
# NOTE(review): keys are base names only — two files with the same name in
# different sub-directories overwrite each other (last one walked wins).
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename: exact name first, then its stem; NaN if absent.
cough_df["audio_path"] = cough_df["filename"].apply(
    lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
# Recordings without a file on disk are dropped.
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

print(f"[*] Total valid audio files: {len(cough_df)}")
print(f"[*] Total unique participants: {cough_df['participant_id'].nunique()}")
print(f"[*] Num clinical features — numerical: {len(num_cols)}, categorical: {len(cat_cols)}")

# ── 3. HeAR MODEL LOADING ────────────────────────────────────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL")
print("="*60)
# Best-effort load: authenticate to Hugging Face with the HF_TOKEN Kaggle
# secret, fetch the "google/hear" Keras model and keep its serving signature.
# On ANY failure HEAR_SERVING is set to None and the notebook keeps running;
# _infer_batch then returns all-zero embeddings.
# NOTE(review): that fallback is silent downstream — models trained after a
# failed load see only zeros. Check the message printed below before trusting
# any audio result.
try:
    from kaggle_secrets import UserSecretsClient
    from huggingface_hub import login, from_pretrained_keras
    import tensorflow as tf
    _sec = UserSecretsClient()
    login(token=_sec.get_secret("HF_TOKEN"))
    HEAR_MODEL   = from_pretrained_keras("google/hear")
    HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]
    print("[*] ✓ HeAR loaded successfully")
except Exception as e:
    print(f"[*] ⚠ HeAR load failed: {e}")
    HEAR_SERVING = None

def _infer_batch(segments: list) -> np.ndarray:
    """
    Embed a batch of equal-length audio segments with HeAR.

    Returns a float32 array with one row per segment. When the model failed
    to load (HEAR_SERVING is None) an all-zero (N, EMBED_DIM) array is
    returned instead.
    """
    if HEAR_SERVING is not None:
        import tensorflow as tf
        batch = tf.constant(np.stack(segments), dtype=tf.float32)
        outputs = HEAR_SERVING(x=batch)
        # The signature returns a dict; its first value is taken as the embedding.
        first_output = list(outputs.values())[0]
        return first_output.numpy().astype(np.float32)
    return np.zeros((len(segments), EMBED_DIM), np.float32)

# ── 4. MULTI-WINDOW AUDIO LOADING ────────────────────────────────────────────
def load_audio(path: str):
    """
    Load an audio file as a mono waveform resampled to SR.

    Best-effort: if the file cannot be read or decoded, a warning naming the
    file is printed and WIN_SAMPLES samples of float32 silence are returned so
    the pipeline keeps running. Only ordinary exceptions are absorbed;
    KeyboardInterrupt / SystemExit propagate.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
    except Exception as e:
        print(f"[!] WARNING: could not load audio '{path}' ({e}); using silence")
        return np.zeros(WIN_SAMPLES, np.float32)
    return audio

def extract_windows(audio: np.ndarray, win_samples=None, hop_samples=None) -> list:
    """
    Slice a 1-D waveform into fixed-length float32 windows.

    Args:
        audio: 1-D waveform.
        win_samples: window length in samples; defaults to WIN_SAMPLES.
        hop_samples: stride between window starts; defaults to HOP_SAMPLES.

    Returns:
        A non-empty list of float32 arrays, each exactly ``win_samples`` long.
        * Empty input: one all-zero window.
        * Shorter than a window: the clip is mirrored (clip + reversed clip)
          and tiled up to the window length, yielding exactly one window.
        * Otherwise: sliding windows at the given stride, plus one
          end-aligned tail window ONLY when the last regular window does not
          already reach the end of the clip. (Previously a clip whose length
          was exactly win + k*hop got its last window emitted twice, which
          skewed the per-recording statistics and the window count.)

    Raises:
        ValueError: if the window length or hop is not positive.
    """
    win = WIN_SAMPLES if win_samples is None else int(win_samples)
    hop = HOP_SAMPLES if hop_samples is None else int(hop_samples)
    if win <= 0 or hop <= 0:
        raise ValueError("win_samples and hop_samples must be positive")

    n = len(audio)
    if n == 0:
        return [np.zeros(win, np.float32)]

    if n < win:
        # Mirrored tiling (no click artifact at boundaries)
        mirrored = np.concatenate((audio, audio[::-1]))
        repeats = int(np.ceil(win / len(mirrored)))
        return [np.tile(mirrored, repeats)[:win].astype(np.float32)]

    starts = list(range(0, n - win + 1, hop))
    # Tail window only if samples after the last regular window are uncovered.
    if starts[-1] + win < n:
        starts.append(n - win)
    return [audio[s:s + win].astype(np.float32) for s in starts]

def aggregate_embeddings(emb_matrix: np.ndarray) -> np.ndarray:
    """
    Collapse per-window embeddings (n_windows, dim) into one flat vector.

    With two or more windows the result is the float32 concatenation
    [mean, std, p25, p50, p75], each computed across windows. With exactly
    one window the embedding itself fills the first block and the remaining
    four blocks (EMBED_DIM values each) are zero-filled.
    """
    if emb_matrix.shape[0] != 1:
        quartiles = np.percentile(emb_matrix, [25, 50, 75], axis=0)
        blocks = [
            emb_matrix.mean(axis=0),
            emb_matrix.std(axis=0),
            quartiles[0],
            quartiles[1],
            quartiles[2],
        ]
        return np.concatenate(blocks).astype(np.float32)

    # Single window: no spread information, so std/percentile blocks are zeros.
    zero_blocks = np.zeros(EMBED_DIM * 4, np.float32)
    return np.concatenate([emb_matrix[0], zero_blocks])

# ── 5. EMBEDDING EXTRACTION WITH CACHING ─────────────────────────────────────
print("\n" + "="*60)
print("3. EXTRACTING MULTI-WINDOW HeAR EMBEDDINGS")
print("="*60)

def get_multiwindow_embeddings(df_rows: pd.DataFrame):
    """
    Compute (or fetch from the parquet cache) one aggregated embedding per row.

    Each row's ``audio_path`` is hashed (MD5 of the path string) to a cache
    key. Rows whose key is missing from EMBED_CACHE are loaded, windowed,
    embedded in batches of 64 windows (batches may span several files) and
    aggregated; new entries are appended to the cache file.

    Returns:
      agg_embeddings: (N_rows, AGG_DIM) float32 — per-recording aggregated embedding
      n_windows_arr:  (N_rows,) int32          — windows extracted per recording

    Notes:
      * The key depends on the path only, so entries are NOT invalidated when
        the windowing/aggregation code or the audio content changes — delete
        EMBED_CACHE after such changes.
      * When HeAR is unavailable (HEAR_SERVING is None) the all-zero
        placeholder embeddings are returned but not written to the cache, so
        a later run with a working model recomputes them.
      * A path that occurs on several rows is embedded once.
    """
    cache = pd.DataFrame(columns=["key", "agg_embedding", "n_windows"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as e:
            # Unreadable cache: recompute everything rather than abort.
            print(f"[!] WARNING: ignoring unreadable embedding cache '{EMBED_CACHE}': {e}")

    # Use path-only hash (see the invalidation note in the docstring)
    paths = [str(p) for p in df_rows["audio_path"]]
    keys  = [hashlib.md5(p.encode()).hexdigest() for p in paths]

    N   = len(paths)
    agg = np.zeros((N, AGG_DIM), np.float32)
    nw  = np.zeros(N, np.int32)

    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()
    pending     = [i for i, k in enumerate(keys) if k not in cached_keys]

    BATCH = 64
    buf_segs, buf_keys = [], []   # parallel lists: window, key of its source file
    new_entries = {}              # key -> {"n_windows": int, "embs": [per-window embedding]}

    def _flush():
        """Embed the buffered windows and route each result to its file."""
        for bk, be in zip(buf_keys, _infer_batch(buf_segs)):
            new_entries[bk]["embs"].append(be)
        buf_segs.clear()
        buf_keys.clear()

    for i in tqdm(pending, desc="Loading audio", leave=False):
        k = keys[i]
        if k in new_entries:
            # Same file listed again: re-processing would reset its entry
            # while its windows may still sit in the buffer.
            continue
        windows = extract_windows(load_audio(paths[i]))
        new_entries[k] = {"n_windows": len(windows), "embs": []}
        for seg in windows:
            buf_segs.append(seg)
            buf_keys.append(k)
            if len(buf_segs) >= BATCH:
                _flush()
    if buf_segs:
        _flush()

    # Aggregate new entries and persist them
    new_rows = [
        {"key": k,
         "agg_embedding": aggregate_embeddings(np.stack(v["embs"])).tolist(),
         "n_windows": v["n_windows"]}
        for k, v in new_entries.items()
    ]

    if new_rows:
        cache = pd.concat([cache, pd.DataFrame(new_rows)], ignore_index=True)
        if HEAR_SERVING is not None:
            cache.to_parquet(EMBED_CACHE, index=False)
        else:
            print("[!] WARNING: HeAR unavailable — zero embeddings were NOT written to the cache")

    # Later duplicates of a key overwrite earlier ones, as before.
    cache_lookup = {
        k: (emb, n)
        for k, emb, n in zip(cache["key"], cache["agg_embedding"], cache["n_windows"])
    }
    for i, k in enumerate(keys):
        hit = cache_lookup.get(k)
        if hit is None:
            continue
        emb, n = hit
        # Freshly built entries hold lists; a parquet round-trip may yield arrays.
        agg[i] = emb if isinstance(emb, np.ndarray) else np.array(emb, np.float32)
        nw[i]  = int(n)

    return agg, nw

# Extract all embeddings upfront
print("[*] Pre-fetching all audio embeddings (multi-window)...")
all_agg_embs, all_n_windows = get_multiwindow_embeddings(cough_df)
cough_df["n_cough_windows"] = all_n_windows

print(f"[*] Embedding shape per recording: ({AGG_DIM},)")
print(f"[*] Avg windows per recording: {all_n_windows.mean():.1f}  "
      f"(max={all_n_windows.max()}, min={all_n_windows.min()})")

# ── 6. PARTICIPANT-LEVEL AGGREGATION ─────────────────────────────────────────
# Pool all per-recording embeddings into one fingerprint per participant.
# This is the key architectural insight: the model should reason about
# a participant, not individual recordings.
print("\n[*] Aggregating to participant level...")

# Row-aligned with all_agg_embs, so a boolean mask over this array selects a
# participant's rows in both cough_df and the embedding matrix.
participant_ids  = cough_df["participant_id"].values
unique_pids      = cough_df["participant_id"].unique()

# Build a participant-level DataFrame
pid_records = []
for pid in unique_pids:
    mask  = participant_ids == pid
    p_embs = all_agg_embs[mask]   # (n_recordings, 2560)
    # Label is taken from the participant's first recording row.
    label  = cough_df.loc[mask, "label"].values[0]
    n_recs = mask.sum()

    # Average across recordings for this participant
    p_mean_emb = p_embs.mean(axis=0)  # (2560,)

    # Get clinical features from first row (they're participant-level)
    first_row = cough_df.loc[mask].iloc[0]
    rec = {"participant_id": pid, "label": label, "n_recordings": n_recs}
    for col in num_cols + cat_cols:
        rec[col] = first_row.get(col, np.nan)
    rec["n_cough_windows_total"] = int(cough_df.loc[mask, "n_cough_windows"].sum())
    pid_records.append((rec, p_mean_emb))

# participant_df row i corresponds to participant_embs row i.
participant_df  = pd.DataFrame([r for r, _ in pid_records]).reset_index(drop=True)
participant_embs = np.stack([e for _, e in pid_records])  # (N_participants, 2560)

print(f"[*] Participant-level dataset: {len(participant_df)} participants, "
      f"{participant_df['label'].sum()} TB+, {(participant_df['label']==0).sum()} TB-")

# Add n_recordings and n_cough_windows_total to numerical features
extra_num = ["n_recordings", "n_cough_windows_total"]
num_cols_p = num_cols + extra_num

# ── 7. CROSS-VALIDATION SETUP ────────────────────────────────────────────────
print("\n" + "="*60)
print("4. STARTING V10 TRAINING (PARTICIPANT-LEVEL + MULTI-WINDOW)")
print("="*60)

# Folds are at participant level (no group constraint needed — already deduplicated)
# participant_df has one row per participant, so a plain stratified split
# cannot place the same participant in both train and test.
sgkf  = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# Materialised as a list of (train_idx, test_idx) positional index arrays.
folds = list(sgkf.split(participant_df, participant_df["label"]))

# ── 8. PREPROCESSING & MODEL BUILDERS ────────────────────────────────────────
def build_meta_preprocessor(num_c, cat_c):
    """
    Build an unfitted ColumnTransformer for the clinical feature columns.

    Numeric columns (num_c): median imputation with missing-value indicator
    columns, then standard scaling. Categorical columns (cat_c): missing
    values replaced by "Not_Available", then dense one-hot encoding that
    ignores unseen categories. A branch is omitted when its column list is
    empty; all other columns are dropped.
    """
    branches = []

    if num_c:
        numeric_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median", add_indicator=True)),
            ("sc",  StandardScaler()),
        ])
        branches.append(("num", numeric_pipe, num_c))

    if cat_c:
        categorical_pipe = Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="Not_Available")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_pipe, cat_c))

    return ColumnTransformer(branches, remainder="drop")

def add_embedding_noise(X: np.ndarray, noise_std: float = EMB_NOISE_STD) -> np.ndarray:
    """
    Return X plus zero-mean Gaussian noise of standard deviation noise_std.

    The noise is drawn from NumPy's global random state (one draw per element
    of X) and cast to float32 before being added; X itself is not modified.
    """
    jitter = np.random.normal(0, noise_std, X.shape)
    return X + jitter.astype(np.float32)

def build_audio_expert(n_pos, n_neg):
    """
    Unfitted classifier for the aggregated embedding features.

    With LightGBM available: an LGBMClassifier whose positive-class weight is
    n_neg / n_pos (n_pos floored at 1). Otherwise: a class-balanced
    LogisticRegression.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=3000)

    params = dict(
        n_estimators=400,
        learning_rate=0.02,
        num_leaves=31,
        colsample_bytree=0.4,   # each tree sees 40% of the embedding columns
        subsample=0.8,
        min_child_samples=5,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**params)

def build_clinical_expert(n_pos, n_neg):
    """
    Unfitted classifier for the preprocessed clinical features.

    With LightGBM available: a depth-limited LGBMClassifier whose
    positive-class weight is n_neg / n_pos (n_pos floored at 1). Otherwise:
    a class-balanced LogisticRegression with default settings.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced")

    params = dict(
        n_estimators=200,
        learning_rate=0.02,
        num_leaves=15,
        max_depth=4,
        min_child_samples=5,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**params)

def build_supervisor(n_pos, n_neg):
    """
    Unfitted level-2 model over [audio_prob, clinical_prob, clinical_features].

    Kept small on purpose (few, shallow trees) to limit overfitting. The
    positive class is weighted by n_neg / n_pos (n_pos floored at 1), both in
    the LightGBM model and in the LogisticRegression fallback.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight={0: 1.0, 1: pos_weight},
                                  max_iter=2000, random_state=SEED)

    params = dict(
        n_estimators=100,
        learning_rate=0.02,
        num_leaves=7,
        max_depth=3,
        min_child_samples=8,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**params)

# ── 9. EVALUATION HELPERS ────────────────────────────────────────────────────
def partial_auc(y_true, y_prob, low_tpr=PAUC_LOW):
    """
    Normalised partial AUC over the high-sensitivity region (TPR >= low_tpr).

    The ROC curve is clipped at TPR = low_tpr (the crossing point is linearly
    interpolated between the two neighbouring ROC vertices) and the area
    between the curve and the horizontal line TPR = low_tpr is integrated
    over FPR. Dividing by (1 - low_tpr) — the area a perfect classifier
    obtains — yields a score in [0, 1]; a chance-level ROC scores
    (1 - low_tpr) / 2.

    The previous implementation integrated the full TPR (not the excess over
    low_tpr) from the first ROC vertex above the bound, so its "normalised"
    value could exceed 1.

    Args:
        y_true: binary ground-truth labels.
        y_prob: predicted scores (higher = more likely positive).
        low_tpr: lower sensitivity bound of the region of interest.

    Returns:
        float in [0, 1]; 0.0 when undefined (only one class present, or
        low_tpr >= 1).
    """
    max_area = 1.0 - low_tpr
    if max_area <= 0 or len(np.unique(y_true)) < 2:
        return 0.0

    fpr, tpr, _ = roc_curve(y_true, y_prob)

    # tpr is non-decreasing, so the first vertex at/above the bound is found
    # by binary search.
    first = int(np.searchsorted(tpr, low_tpr, side="left"))
    if first >= len(tpr):
        return 0.0
    if first == 0:
        sub_fpr, sub_tpr = fpr, tpr
    else:
        f0, f1 = fpr[first - 1], fpr[first]
        t0, t1 = tpr[first - 1], tpr[first]
        # t0 < low_tpr <= t1, so the denominator is strictly positive.
        fpr_cross = f0 + (low_tpr - t0) * (f1 - f0) / (t1 - t0)
        sub_fpr = np.concatenate(([fpr_cross], fpr[first:]))
        sub_tpr = np.concatenate(([low_tpr], tpr[first:]))

    # Trapezoidal rule on the excess over the bound (written out so it does
    # not depend on np.trapz, which newer NumPy releases drop).
    excess = sub_tpr - low_tpr
    area = float(np.sum(np.diff(sub_fpr) * (excess[1:] + excess[:-1]) / 2.0))
    return area / max_area

def metrics_at_thresh(y_true, y_prob, t=0.5):
    """
    Classification metrics when predicting positive for y_prob >= t.

    Returns a dict with keys threshold, accuracy, sensitivity, specificity
    and f1 (all Python floats). Sensitivity and specificity are exact ratios;
    they are 0.0 when the corresponding class is absent from y_true.

    The ratios were previously computed with a +1e-9 term in the denominator,
    which pushed exact boundary values just below their target (9/10 came out
    below 0.90) and made find_thresh_for_sens reject thresholds that met the
    sensitivity target exactly.
    """
    y_pred = (np.asarray(y_prob) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    n_pos = int(tp + fn)
    n_neg = int(tn + fp)
    return {
        "threshold":   float(t),
        "accuracy":    float(accuracy_score(y_true, y_pred)),
        "sensitivity": float(tp) / n_pos if n_pos else 0.0,
        "specificity": float(tn) / n_neg if n_neg else 0.0,
        "f1":          float(f1_score(y_true, y_pred, zero_division=0)),
    }

def find_thresh_for_sens(y_true, y_prob, target):
    """
    Pick a decision threshold that reaches at least *target* sensitivity.

    Candidates are the distinct predicted scores rounded to 4 decimals,
    visited from highest to lowest. A candidate replaces the current choice
    when it meets the sensitivity target and its specificity is at least the
    best seen so far (so ties go to the lower threshold). Returns 0.0 when no
    candidate qualifies.
    """
    # np.unique returns ascending values; reverse for a descending sweep.
    candidates = np.unique(np.round(y_prob, 4))[::-1]
    chosen = 0.0
    top_spec = 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        meets_target = stats["sensitivity"] >= target
        if meets_target and stats["specificity"] >= top_spec:
            top_spec = stats["specificity"]
            chosen = cand
    return float(chosen)

def full_eval(y_true, y_prob):
    """
    Evaluate a score vector against binary labels.

    Returns a dict with:
      roc_auc          — ROC-AUC, or NaN when y_true holds a single class
      pauc_85, pauc_90 — partial_auc at low_tpr 0.85 and 0.90
      tuned_thresholds — for each sensitivity in TARGET_SENS, keyed
                         "sens_<percent>": the tuned threshold and the
                         metrics_at_thresh results at that threshold
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    if len(np.unique(labels)) > 1:
        auc_value = float(roc_auc_score(labels, scores))
    else:
        auc_value = np.nan

    report = {
        "roc_auc": auc_value,
        "pauc_85": partial_auc(labels, scores, low_tpr=0.85),
        "pauc_90": partial_auc(labels, scores, low_tpr=0.90),
    }

    tuned = {}
    for sens in TARGET_SENS:
        cut = find_thresh_for_sens(labels, scores, sens)
        entry = {"threshold": cut}
        entry.update(metrics_at_thresh(labels, scores, cut))
        tuned[f"sens_{int(sens*100)}"] = entry
    report["tuned_thresholds"] = tuned
    return report

def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """
    Save a two-panel ROC figure to "<path_prefix>_roc.png".

    Left panel: full ROC curve with the AUC in the legend and the chance
    diagonal. Right panel: the same curve zoomed to TPR in [0.8, 1.0], with
    the region between the curve and TPR = 0.85 shaded and labelled with
    partial_auc(..., 0.85). The figure is closed after saving.
    """
    curve_color = "#e63946"
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    fig, (roc_ax, zoom_ax) = plt.subplots(1, 2, figsize=(11, 4))

    # Full ROC
    roc_ax.plot(fpr, tpr, color=curve_color, lw=2, label=f"AUC={auc:.3f}")
    roc_ax.plot([0,1],[0,1],"--",color="gray",lw=1)
    roc_ax.set(title=f"{title_prefix} — Full ROC", xlabel="FPR", ylabel="TPR")
    roc_ax.legend()

    # Partial AUC zoom (high sensitivity region)
    high_sens = tpr >= 0.85
    pauc_label = f"pAUC@85%={partial_auc(y_true,y_prob,0.85):.3f}"
    zoom_ax.fill_between(fpr[high_sens], tpr[high_sens], 0.85, alpha=0.25,
                         color="#457b9d", label=pauc_label)
    zoom_ax.plot(fpr, tpr, color=curve_color, lw=2)
    zoom_ax.set_xlim([0, 1])
    zoom_ax.set_ylim([0.8, 1.0])
    zoom_ax.set(title="Partial AUC (TPR≥85%)", xlabel="FPR", ylabel="TPR")
    zoom_ax.legend(fontsize=8)

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

# ── 10. MAIN TRAINING LOOP ───────────────────────────────────────────────────
# Outer CV over participants. Per fold: hold out a calibration slice of the
# training data, train the two level-1 experts (audio, clinical), train the
# level-2 supervisor on inner out-of-fold expert probabilities, calibrate the
# supervisor on the calibration slice and score the outer test fold.
# oof_stack collects the calibrated test-fold probability of every participant.
oof_stack = np.zeros(len(participant_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")

    df_tr_full   = participant_df.iloc[tr_idx].reset_index(drop=True)
    df_te        = participant_df.iloc[te_idx].reset_index(drop=True)
    emb_tr_full  = participant_embs[tr_idx]
    emb_te       = participant_embs[te_idx]

    y_tr_full    = df_tr_full["label"].values
    y_te         = df_te["label"].values

    # ── Calibration split: 20% of train, stratified ──────────────────────────
    # First split of a 5-fold partition = 80% fit / 20% calibration.
    cal_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    tr_sub_idx, cal_idx = next(cal_fold.split(df_tr_full, y_tr_full))

    df_tr     = df_tr_full.iloc[tr_sub_idx].reset_index(drop=True)
    df_cal    = df_tr_full.iloc[cal_idx].reset_index(drop=True)
    emb_tr    = emb_tr_full[tr_sub_idx]
    emb_cal   = emb_tr_full[cal_idx]
    y_tr      = df_tr["label"].values
    y_cal     = df_cal["label"].values

    print(f"[*] Train: {len(y_tr)} ({y_tr.sum()} TB+)  "
          f"| Cal: {len(y_cal)} ({y_cal.sum()} TB+)  "
          f"| Test: {len(y_te)} ({y_te.sum()} TB+)")

    # ── Preprocess clinical features ──────────────────────────────────────────
    # Fitted on the training slice only; calibration/test rows are transformed.
    meta_prep  = build_meta_preprocessor(num_cols_p, cat_cols)
    X_tr_m     = meta_prep.fit_transform(df_tr)
    X_cal_m    = meta_prep.transform(df_cal)
    X_te_m     = meta_prep.transform(df_te)

    # ── Embedding noise augmentation (training only) ──────────────────────────
    # X_tr_emb (noisy) is used solely to FIT the final audio expert.
    X_tr_emb   = add_embedding_noise(emb_tr)
    X_cal_emb  = emb_cal   # no augmentation at calibration/test time
    X_te_emb   = emb_te

    # ── LEVEL 1: Inner OOF for stacking meta-features ────────────────────────
    print("[*]  Generating inner OOF probabilities...")
    inner_cv   = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    tr_oof_a   = np.zeros(len(y_tr))
    tr_oof_m   = np.zeros(len(y_tr))

    for i_tr, i_val in inner_cv.split(X_tr_emb, y_tr):
        n_p  = int(y_tr[i_tr].sum()); n_n = int((y_tr[i_tr]==0).sum())

        # Audio expert (inner): fit on noise-augmented CLEAN embeddings and
        # predict the held-out rows on their clean embeddings. Previously
        # both came from the already-noised X_tr_emb, i.e. training rows were
        # noised twice and validation rows were scored with noise — unlike
        # the clean inputs the expert sees at calibration/test time.
        clf_a_i = build_audio_expert(n_p, n_n)
        X_aug   = add_embedding_noise(emb_tr[i_tr])
        clf_a_i.fit(X_aug, y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_i.predict_proba(emb_tr[i_val])[:, 1]

        # Clinical expert (inner)
        clf_m_i = build_clinical_expert(n_p, n_n)
        clf_m_i.fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_i.predict_proba(X_tr_m[i_val])[:, 1]

    # ── LEVEL 1: Full experts on all train data ───────────────────────────────
    n_pos = int(y_tr.sum()); n_neg = int((y_tr==0).sum())

    clf_a = build_audio_expert(n_pos, n_neg)
    clf_a.fit(X_tr_emb, y_tr)
    cal_prob_a = clf_a.predict_proba(X_cal_emb)[:, 1]
    te_prob_a  = clf_a.predict_proba(X_te_emb)[:, 1]

    clf_m = build_clinical_expert(n_pos, n_neg)
    clf_m.fit(X_tr_m, y_tr)
    cal_prob_m = clf_m.predict_proba(X_cal_m)[:, 1]
    te_prob_m  = clf_m.predict_proba(X_te_m)[:, 1]

    # ── LEVEL 2: Supervisor ───────────────────────────────────────────────────
    # Training rows use inner-OOF expert probabilities (never predictions a
    # model made on its own training rows); cal/test rows use the full experts.
    X_tr_stack  = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_cal_stack = np.column_stack([cal_prob_a, cal_prob_m, X_cal_m])
    X_te_stack  = np.column_stack([te_prob_a, te_prob_m, X_te_m])

    supervisor  = build_supervisor(n_pos, n_neg)
    supervisor.fit(X_tr_stack, y_tr)

    # Platt scaling calibration on the held-out calibration fold
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid")
    cal_supervisor.fit(X_cal_stack, y_cal)

    te_prob_stack       = cal_supervisor.predict_proba(X_te_stack)[:, 1]
    oof_stack[te_idx]   = te_prob_stack

    fold_auc  = roc_auc_score(y_te, te_prob_stack)
    fold_pauc = partial_auc(y_te, te_prob_stack, low_tpr=0.90)
    print(f"[*] Fold {fold_i+1} | AUC={fold_auc:.4f} | pAUC@90%={fold_pauc:.4f}")

# ── 11. FINAL REPORTING ──────────────────────────────────────────────────────
print("\n" + "="*90)
print("5. FINAL EVALUATION (V10 — PARTICIPANT-LEVEL)")
print("="*90)

# Attach the pooled out-of-fold probabilities (each participant was scored by
# the fold in which it was held out) and evaluate them as one vector.
participant_df["pred_stack"] = oof_stack

m_part = full_eval(participant_df["label"], participant_df["pred_stack"])

# Writes "<FUSION_OUT>/plots/v10_participant_roc.png"
plot_curves(participant_df["label"], participant_df["pred_stack"],
            f"{FUSION_OUT}/plots/v10_participant", "V10 Participant-Level")

def make_row(name, pm):
    """
    Format one summary-table row from a full_eval-style metrics dict.

    Every metric is rendered as a string with 4 decimals; a metric or tuned
    threshold entry that is absent from *pm* is rendered as "0.0000".
    """
    tuned = pm.get("tuned_thresholds", {})

    row = {"Model": name}
    for column, metric_key in (("ROC-AUC", "roc_auc"),
                               ("pAUC@85%", "pauc_85"),
                               ("pAUC@90%", "pauc_90")):
        row[column] = f"{pm.get(metric_key, 0):.4f}"

    for column, sens_key in (("Spec@Sens=85%", "sens_85"),
                             ("Spec@Sens=90%", "sens_90"),
                             ("Spec@Sens=95%", "sens_95")):
        spec = tuned.get(sens_key, {}).get("specificity", 0)
        row[column] = f"{spec:.4f}"
    return row

# One-row summary table of the pooled out-of-fold metrics.
summary_df = pd.DataFrame([make_row("V10 (Multi-Window + Participant-Level)", m_part)])

print(summary_df.to_string(index=False))
print()

# Threshold summary
for sk, sv in m_part.get("tuned_thresholds", {}).items():
    print(f"  [{sk}] threshold={sv['threshold']:.4f}  "
          f"sens={sv['sensitivity']:.3f}  spec={sv['specificity']:.3f}  "
          f"f1={sv['f1']:.3f}")

# Save results
summary_df.to_csv(f"{FUSION_OUT}/v10_summary.csv", index=False)
# Includes the clinical feature columns plus label and pred_stack per participant.
participant_df.to_csv(f"{FUSION_OUT}/v10_oof_predictions.csv", index=False)

# Zip everything under OUT_ROOT (this also picks up the embedding cache, which
# lives in OUT_ROOT/cache). Archive paths are relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v10.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files:
            fp = os.path.join(root, fn)
            zf.write(fp, os.path.relpath(fp, "/kaggle/working"))

print(f"\n✅ All V10 Results → {zip_path}")
print("PIPELINE COMPLETE")

In [None]:
"""
=====================================================================================
TB SCREENING RANKER: MASTER TRAINING & EXPORT PIPELINE (VERSION 10)
=====================================================================================

DESCRIPTION:
This script is the final, production-ready build of the CODA-TB screening pipeline. 
It trains decoupled models (Audio and Metadata) using an Out-of-Fold (OOF) Stacking 
architecture. It rigorously evaluates the models using Partial AUC and automatically 
exports all weights, plots, and evaluation metrics for cloud deployment.

KEY FEATURES IN V10:
1. Robust Multi-Window Audio: Dynamically slices audio of any length into overlapping 
   2-second windows, extracting a rich 2560-dim acoustic fingerprint via Google HeAR.
2. Decoupled Experts: Trains independent LightGBM models for Audio and Clinical Metadata.
3. Master Supervisor: Fuses the independent probabilities to mathematically rank patients.
4. Deployment Ready: Trains a final 100%-data master model and exports all `.pkl` 
   weights alongside a downloadable ZIP archive.
=====================================================================================
"""

import os, sys, json, warnings, random, hashlib, zipfile, shutil
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import FileLink, display

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, accuracy_score,
                             f1_score, confusion_matrix, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION & DIRECTORY SETUP ─────────────────────────────────────────
# Input locations (Kaggle dataset mount). Everything below is derived from BASE.
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

# Participant-level clinical table and the solicited-cough recording table.
CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR          = 16_000   # sample rate every clip is resampled to by load_audio
WIN_SAMPLES = 32_000   # 2s @ 16kHz — HeAR hard constraint
HOP_SAMPLES = 16_000   # 50% overlap for multi-window extraction
EMBED_DIM   = 512      # width of one per-window embedding
N_SPLITS    = 5
TARGET_SENS = [0.85, 0.90, 0.95]
PAUC_LOW    = 0.85     # lower TPR (sensitivity) bound for partial-AUC; its use is outside this section

# Per-recording aggregation statistics computed across that recording's windows.
# Only len(AGG_FUNCS) is used here, to size the aggregated vector;
# aggregate_embeddings hard-codes the statistics themselves.
AGG_FUNCS   = ["mean", "std", "p25", "p50", "p75"]
AGG_DIM     = EMBED_DIM * len(AGG_FUNCS)  # 2560
EMB_NOISE_STD = 0.01   # presumably the std of training-time embedding noise, as in the earlier cell — its use is outside this section

# Output Directories (created eagerly so later writes cannot fail on a missing dir)
OUT_ROOT   = "/kaggle/working/outputs_v10"
FUSION_OUT = os.path.join(OUT_ROOT, "multiwindow_participant")
CACHE_DIR  = os.path.join(OUT_ROOT, "cache")
MODEL_DIR  = os.path.join(OUT_ROOT, "models")
EVAL_DIR   = os.path.join(OUT_ROOT, "eval")

for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots", MODEL_DIR, EVAL_DIR]:
    os.makedirs(d, exist_ok=True)

# Parquet file in which get_multiwindow_embeddings persists aggregated embeddings
EMBED_CACHE = os.path.join(CACHE_DIR, "hear_multiwindow_embeddings.parquet")

# ── 2. DATA LOADING & HARMONISATION ─────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """
    Rename the ID, filename and label columns of *df* to canonical names.

    Column names are compared case-insensitively against a prioritised alias
    list per canonical name; the first alias present wins and only that one
    column is renamed. Columns that match no alias are left untouched.
    Returns a renamed copy (the input frame is not modified).
    """
    by_lower = {}
    for col in df.columns:
        by_lower[col.lower()] = col

    # (canonical name, aliases in priority order)
    alias_table = (
        ("participant_id", ("participant_id", "participant", "subject_id")),
        ("filename", ("filename", "file_name", "audio_file", "wav_file", "cough_file")),
        ("label_raw", ("tb_status", "tb", "label", "target", "tb_result")),
    )

    mapping = {}
    for canonical, aliases in alias_table:
        hit = next((alias for alias in aliases if alias in by_lower), None)
        if hit is not None:
            mapping[by_lower[hit]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """
    Map a raw TB-status column to {1, 0, NaN}.

    Recognised textual spellings (case/whitespace-insensitive) map to 1 or 0.
    Anything else is parsed as a number and accepted only if it equals exactly
    1 or 0. Every other value (missing, unparseable, or a number such as 2 or
    0.7) becomes NaN so the caller can drop the row, instead of being
    truncated to a wrong class by int().
    """
    positive = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negative = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _to_binary(value):
        if pd.isna(value):
            return np.nan
        token = str(value).strip().lower()
        if token in positive:
            return 1
        if token in negative:
            return 0
        try:
            number = float(token)
        except ValueError:
            return np.nan
        if number == 1:
            return 1
        if number == 0:
            return 0
        # Non-binary numeric code: not a valid label for a binary task.
        return np.nan

    return series.apply(_to_binary)

# Read the solicited-cough table and the clinical table and give both the
# canonical participant_id / filename / label_raw column names.
df_audio    = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# When the audio table carries no label column, pull label_raw from the
# clinical table via participant_id.
# NOTE(review): df_clinical is not de-duplicated first — if it can hold more
# than one row per participant this left-merge multiplies audio rows; confirm.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Binarise the raw label, drop rows whose label could not be resolved, and
# only then cast to int (NaN cannot be stored in an int column).
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical feature selection
# A clinical column whose (lower-cased) name contains any of these keywords is
# never used as a feature; participant_id is excluded as the join key.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular",
                "confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols = [], []
for c in df_clinical.columns:
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols: continue
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

# Make sure the country/site column is treated as categorical. Only columns
# that already passed the leakage filter are eligible, so a filtered column
# can never be re-introduced. A numerically coded match is MOVED from num_cols
# to cat_cols so it is never listed twice (a duplicate selection would create
# duplicate column names in cough_df).
country_col = None
eligible = [c for c in df_clinical.columns if c in num_cols or c in cat_cols]
for hint in ["country", "site", "country_id", "collection_country"]:
    matches = [c for c in eligible if hint in c.lower()]
    if matches:
        country_col = matches[0]
        if country_col in num_cols:
            num_cols.remove(country_col)
            # Store codes as strings (NaN preserved) so a string fill value
            # is valid when this column is imputed as a categorical.
            df_clinical[country_col] = df_clinical[country_col].map(
                lambda v: v if pd.isna(v) else str(v))
        if country_col not in cat_cols: cat_cols.append(country_col)
        break

# Attach the selected clinical features to every recording row.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols],
                          on="participant_id", how="left")

# Audio file mapping
# Index every audio file under AUDIO_BASE twice: by its file name and by its
# name without extension, so metadata entries match with or without a suffix.
# NOTE(review): keys are base names only — two files with the same name in
# different sub-directories overwrite each other (last one walked wins).
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename: exact name first, then its stem; NaN if absent.
cough_df["audio_path"] = cough_df["filename"].apply(
    lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
# Recordings without a file on disk are dropped.
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── 3. HeAR MODEL LOADING WITH KAGGLE AUTHENTICATION ───────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL (GATED REPO FIX)")
print("="*60)
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login, from_pretrained_keras
import tensorflow as tf

# Authenticate to Hugging Face with the HF_TOKEN Kaggle secret. Unlike a
# best-effort load, this cell fails fast: any error while reading the secret
# or logging in is re-raised as ValueError (original error chained).
try:
    _sec = UserSecretsClient()
    hf_token = _sec.get_secret("HF_TOKEN")
    login(token=hf_token)
except Exception as e:
    raise ValueError("Could not find HF_TOKEN in Kaggle Secrets.") from e

# Model download/load errors are not caught and abort the cell.
HEAR_MODEL   = from_pretrained_keras("google/hear")
# Callable serving signature used by _infer_batch.
HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]

def _infer_batch(segments: list) -> np.ndarray:
    """Embed a batch of equal-length audio segments with HeAR; returns float32, one row per segment."""
    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    # The signature returns a dict; its first value is taken as the embedding.
    first_output = list(outputs.values())[0]
    return first_output.numpy().astype(np.float32)

# ── 4. AUDIO PROCESSING & EMBEDDING ──────────────────────────────────────────
def load_audio(path: str):
    """
    Load an audio file as a mono waveform resampled to SR.

    Best-effort: if the file cannot be read or decoded, a warning naming the
    file is printed and WIN_SAMPLES samples of float32 silence are returned so
    the pipeline keeps running. Only ordinary exceptions are absorbed;
    KeyboardInterrupt / SystemExit propagate.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
    except Exception as e:
        print(f"[!] WARNING: could not load audio '{path}' ({e}); using silence")
        return np.zeros(WIN_SAMPLES, np.float32)
    return audio

def extract_windows(audio: np.ndarray, win_samples=None, hop_samples=None) -> list:
    """
    Slice a 1-D waveform into fixed-length float32 windows.

    Args:
        audio: 1-D waveform.
        win_samples: window length in samples; defaults to WIN_SAMPLES.
        hop_samples: stride between window starts; defaults to HOP_SAMPLES.

    Returns:
        A non-empty list of float32 arrays, each exactly ``win_samples`` long.
        * Empty input: one all-zero window.
        * Shorter than a window: the clip is mirrored (clip + reversed clip)
          and tiled up to the window length, yielding exactly one window.
        * Otherwise: sliding windows at the given stride, plus one
          end-aligned tail window ONLY when the last regular window does not
          already reach the end of the clip. (Previously a clip whose length
          was exactly win + k*hop got its last window emitted twice, which
          skewed the per-recording statistics and the window count.)

    Raises:
        ValueError: if the window length or hop is not positive.
    """
    win = WIN_SAMPLES if win_samples is None else int(win_samples)
    hop = HOP_SAMPLES if hop_samples is None else int(hop_samples)
    if win <= 0 or hop <= 0:
        raise ValueError("win_samples and hop_samples must be positive")

    n = len(audio)
    if n == 0:
        return [np.zeros(win, np.float32)]

    if n < win:
        # Mirrored tiling avoids a hard discontinuity at the repeat boundary.
        mirrored = np.concatenate((audio, audio[::-1]))
        repeats = int(np.ceil(win / len(mirrored)))
        return [np.tile(mirrored, repeats)[:win].astype(np.float32)]

    starts = list(range(0, n - win + 1, hop))
    # Tail window only if samples after the last regular window are uncovered.
    if starts[-1] + win < n:
        starts.append(n - win)
    return [audio[s:s + win].astype(np.float32) for s in starts]

def aggregate_embeddings(emb_matrix: np.ndarray) -> np.ndarray:
    """
    Collapse per-window embeddings (n_windows, dim) into one flat vector.

    With two or more windows the result is the float32 concatenation
    [mean, std, p25, p50, p75], each computed across windows. With exactly
    one window the embedding itself fills the first block and the remaining
    four blocks (EMBED_DIM values each) are zero-filled.
    """
    if emb_matrix.shape[0] != 1:
        blocks = [emb_matrix.mean(axis=0), emb_matrix.std(axis=0)]
        blocks.extend(np.percentile(emb_matrix, [25, 50, 75], axis=0))
        return np.concatenate(blocks).astype(np.float32)

    # Single window: no spread information, so std/percentile blocks are zeros.
    zero_blocks = np.zeros(EMBED_DIM * 4, np.float32)
    return np.concatenate([emb_matrix[0], zero_blocks])

print("\n[*] Pre-fetching multi-window HeAR embeddings...")
def get_multiwindow_embeddings(df_rows):
    """Return aggregated HeAR embeddings and window counts for every row.

    Results are cached in the parquet file ``EMBED_CACHE`` keyed by the MD5 of
    the ``audio_path`` string, so only files missing from the cache are decoded
    and embedded. The key is the path alone, so existing entries are not
    invalidated when windowing or aggregation settings change.

    Each distinct uncached path is processed exactly once. Previously a path
    that appeared on several rows was queued once per row, and re-initialising
    its entry while earlier windows were still buffered corrupted the list of
    embeddings for that file.

    Args:
        df_rows: DataFrame with an ``audio_path`` column, one recording per row.

    Returns:
        ``(agg, nw)``: ``agg`` is a float32 array of shape
        ``(len(df_rows), AGG_DIM)`` and ``nw`` an int32 array with the number of
        windows per recording, both in the row order of ``df_rows``.
    """
    cache = None
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception as exc:
            # An unreadable cache is rebuilt instead of aborting the run.
            print(f"[!] Ignoring unreadable embedding cache ({type(exc).__name__}: {exc})")
    if cache is None:
        cache = pd.DataFrame(columns=["key", "agg_embedding", "n_windows"])

    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(str(p).encode()).hexdigest() for p in paths]
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    # First occurrence of every distinct path that still needs embedding.
    pending = {}
    for key, path in zip(keys, paths):
        if key not in cached_keys:
            pending.setdefault(key, path)

    BATCH = 64
    buf_segs, buf_keys, new_entries = [], [], {}

    def _flush():
        """Embed the buffered windows and route each result to its recording."""
        for bk, be in zip(buf_keys, _infer_batch(buf_segs)):
            new_entries[bk]["embs"].append(be)
        buf_segs.clear()
        buf_keys.clear()

    for key, path in tqdm(pending.items(), desc="Extracting embeddings", leave=False):
        windows = extract_windows(load_audio(path))
        new_entries[key] = {"n_windows": len(windows), "embs": []}
        for seg in windows:
            buf_segs.append(seg)
            buf_keys.append(key)
            # Windows from several files share one batch to keep it full.
            if len(buf_segs) >= BATCH:
                _flush()
    if buf_segs:
        _flush()

    new_rows = [
        {"key": k,
         "agg_embedding": aggregate_embeddings(np.stack(v["embs"])).tolist(),
         "n_windows": v["n_windows"]}
        for k, v in new_entries.items()
    ]
    if new_rows:
        cache = pd.concat([cache, pd.DataFrame(new_rows)], ignore_index=True)
        cache.to_parquet(EMBED_CACHE, index=False)

    by_key = {k: (emb, n) for k, emb, n in
              zip(cache["key"], cache["agg_embedding"], cache["n_windows"])}
    N = len(keys)
    agg = np.zeros((N, AGG_DIM), np.float32)
    nw = np.zeros(N, np.int32)
    for i, key in enumerate(keys):
        hit = by_key.get(key)
        if hit is not None:
            agg[i] = np.asarray(hit[0], dtype=np.float32)
            nw[i] = int(hit[1])
    return agg, nw

# Embed every recording; the per-recording window counts are stored as a column
# so they can be summed per participant below.
all_agg_embs, cough_df["n_cough_windows"] = get_multiwindow_embeddings(cough_df)

# Participant Aggregation
# Collapse the recording-level rows to one row per participant: label and
# metadata come from the participant's first recording, and the embedding is
# the mean of that participant's per-recording aggregated embeddings.
print("\n[*] Aggregating to participant level...")
unique_pids = cough_df["participant_id"].unique()
pid_records = []
for pid in unique_pids:
    mask = cough_df["participant_id"] == pid
    first_row = cough_df.loc[mask].iloc[0]
    rec = {"participant_id": pid, "label": cough_df.loc[mask, "label"].values[0], 
           "n_recordings": mask.sum(), "n_cough_windows_total": int(cough_df.loc[mask, "n_cough_windows"].sum())}
    for col in num_cols + cat_cols: rec[col] = first_row.get(col, np.nan)
    # all_agg_embs is in cough_df row order, so the same boolean mask selects
    # this participant's embeddings.
    pid_records.append((rec, all_agg_embs[mask].mean(axis=0)))

participant_df = pd.DataFrame([r for r, _ in pid_records]).reset_index(drop=True)
participant_embs = np.stack([e for _, e in pid_records])
# The two per-participant counts join the numeric metadata features.
num_cols_p = num_cols + ["n_recordings", "n_cough_windows_total"]

# ── 5. MODEL BUILDERS & HELPERS ──────────────────────────────────────────────
def build_meta_preprocessor(num_c, cat_c):
    """Create the (unfitted) metadata ColumnTransformer.

    Numeric columns ``num_c``: median imputation with missing-indicator
    columns, then standard scaling. Categorical columns ``cat_c``: missing
    values replaced by the literal "Missing", then dense one-hot encoding that
    ignores categories unseen at fit time. All other columns are dropped.
    """
    numeric_steps = [
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ]
    categorical_steps = [
        ("imp", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), num_c),
        ("cat", Pipeline(categorical_steps), cat_c),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def add_embedding_noise(X):
    """Return ``X`` plus zero-mean Gaussian noise with std ``EMB_NOISE_STD``.

    The noise is drawn from NumPy's global RNG and cast to float32 before the
    addition; ``X`` itself is not modified.
    """
    jitter = np.random.normal(0, EMB_NOISE_STD, X.shape)
    return X + jitter.astype(np.float32)

def build_audio_expert(n_pos, n_neg):
    """Build the unfitted classifier used on audio embeddings.

    With LightGBM available this is an ``LGBMClassifier`` whose
    ``scale_pos_weight`` is ``n_neg / max(n_pos, 1)``; otherwise a
    class-balanced ``LogisticRegression`` is returned.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=3000)
    lgb_params = dict(
        n_estimators=400,
        learning_rate=0.02,
        num_leaves=31,
        colsample_bytree=0.4,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def build_clinical_expert(n_pos, n_neg):
    """Build the unfitted classifier used on preprocessed clinical metadata.

    With LightGBM available this is a shallow ``LGBMClassifier``
    (``max_depth=4``) weighted by ``n_neg / max(n_pos, 1)``; otherwise a
    class-balanced ``LogisticRegression`` with default settings.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced")
    lgb_params = dict(
        n_estimators=200,
        learning_rate=0.02,
        num_leaves=15,
        max_depth=4,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

def build_supervisor(n_pos, n_neg):
    """Build the unfitted stacking model that fuses the two expert scores.

    With LightGBM available this is a small ``LGBMClassifier``
    (``max_depth=3``, 7 leaves); otherwise a ``LogisticRegression`` whose
    positive class is weighted by ``n_neg / max(n_pos, 1)``.
    """
    pos_weight = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight={0: 1.0, 1: pos_weight}, max_iter=2000, random_state=SEED)
    lgb_params = dict(
        n_estimators=100,
        learning_rate=0.02,
        num_leaves=7,
        max_depth=3,
        scale_pos_weight=pos_weight,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )
    return lgb.LGBMClassifier(**lgb_params)

# Fixed Partial AUC Formulation
def partial_auc(y_true, y_prob, low_tpr=PAUC_LOW):
    """Normalised ROC area restricted to the region where TPR >= ``low_tpr``.

    The ROC points whose TPR is at least ``low_tpr`` are integrated over FPR
    with the trapezoidal rule and divided by the FPR width they span, i.e. the
    result is the mean TPR over that FPR range.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Scores for the positive class.
        low_tpr: TPR floor of the region of interest.

    Returns:
        The normalised area as a float; 0.0 when fewer than two ROC points meet
        the floor (and effectively 0.0 when the spanned FPR width is zero).
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    mask = tpr >= low_tpr
    if mask.sum() < 2:
        return 0.0
    sub_fpr, sub_tpr = fpr[mask], tpr[mask]
    # NumPy 2.0 renamed `trapz` to `trapezoid` and deprecated the old name;
    # prefer the new one and fall back on older NumPy releases.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    area = float(trapezoid(sub_tpr, sub_fpr))
    # Dividing by the FPR width bounds the result to [0, 1]; the 1e-9 floor
    # avoids a division by zero for a degenerate (zero-width) region.
    max_area = max(1e-9, 1.0 * (sub_fpr[-1] - sub_fpr[0]))
    return area / max_area

def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute threshold-dependent metrics for binary scores.

    A sample is predicted positive when its score is ``>= t``.

    Sensitivity and specificity are exact ratios. They were previously computed
    with ``+ 1e-9`` in the denominator, which pushed an exact hit such as 17/20
    slightly *below* 0.85 and made ``>= target`` comparisons on these values
    fail at the boundary. An empty class still yields 0.0.

    Returns:
        Dict with ``threshold``, ``accuracy``, ``sensitivity``, ``specificity``
        and ``f1`` as Python floats.
    """
    y_pred = (np.asarray(y_prob) >= t).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    n_pos, n_neg = int(tp + fn), int(tn + fp)
    sensitivity = float(tp) / n_pos if n_pos else 0.0
    specificity = float(tn) / n_neg if n_neg else 0.0
    return {"threshold": float(t), "accuracy": float(accuracy_score(y_true, y_pred)),
            "sensitivity": sensitivity, "specificity": specificity,
            "f1": float(f1_score(y_true, y_pred, zero_division=0))}

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick a decision threshold whose sensitivity is at least ``target``.

    Candidate thresholds are the distinct scores rounded to 4 decimals, visited
    from highest to lowest. A candidate replaces the current choice when it
    reaches the target sensitivity and its specificity is not lower than the
    best seen so far. Returns 0.0 if no candidate qualifies.
    """
    candidates = np.unique(np.round(y_prob, 4))[::-1]  # np.unique sorts ascending
    chosen_t, chosen_spec = 0.0, 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        qualifies = stats["sensitivity"] >= target and stats["specificity"] >= chosen_spec
        if not qualifies:
            continue
        chosen_spec = stats["specificity"]
        chosen_t = cand
    return float(chosen_t)

def full_eval(y_true, y_prob):
    """Assemble the evaluation report for one set of scores.

    Returns a dict with ``roc_auc``, ``pauc_85``, ``pauc_90`` and
    ``tuned_thresholds``; the latter maps ``sens_<pct>`` (one per entry of
    ``TARGET_SENS``) to the chosen threshold and the metrics at it.
    """
    auc = float(roc_auc_score(y_true, y_prob))
    pauc_85 = partial_auc(y_true, y_prob, 0.85)
    pauc_90 = partial_auc(y_true, y_prob, 0.90)

    tuned = {}
    for ts in TARGET_SENS:
        t = find_thresh_for_sens(y_true, y_prob, ts)
        tuned[f"sens_{int(ts*100)}"] = {"threshold": t, **metrics_at_thresh(y_true, y_prob, t)}

    return {"roc_auc": auc, "pauc_85": pauc_85, "pauc_90": pauc_90, "tuned_thresholds": tuned}

# Plotting Helpers 
def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Save a two-panel ROC figure to ``<path_prefix>_roc.png``.

    Left panel: full ROC curve with the AUC in the legend. Right panel: the
    same curve zoomed to TPR in [0.8, 1.0], with the region above TPR = 0.85
    shaded and labelled with the partial AUC.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    fig, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(11, 4))

    # Full ROC with the chance diagonal.
    auc_label = f"AUC={roc_auc_score(y_true, y_prob):.3f}"
    ax_full.plot(fpr, tpr, color="#e63946", lw=2, label=auc_label)
    ax_full.plot([0,1],[0,1],"--",color="gray",lw=1)
    ax_full.set(title=f"{title_prefix} — Full ROC", xlabel="FPR", ylabel="TPR")
    ax_full.legend()

    # High-sensitivity zoom.
    high_tpr = tpr >= 0.85
    pauc_label = f"pAUC@85%={partial_auc(y_true,y_prob,0.85):.3f}"
    ax_zoom.fill_between(fpr[high_tpr], tpr[high_tpr], 0.85, alpha=0.25, color="#457b9d", label=pauc_label)
    ax_zoom.plot(fpr, tpr, color="#e63946", lw=2)
    ax_zoom.set(xlim=[0, 1], ylim=[0.8, 1.0], title="Partial AUC (TPR≥85%)", xlabel="FPR", ylabel="TPR")
    ax_zoom.legend(fontsize=8)

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

def plot_confusion_matrix_sns(y_true, y_prob, path, threshold=0.5, title="Confusion Matrix"):
    """Save a confusion-matrix heatmap for scores binarised at ``threshold``.

    Args:
        y_true: Binary ground-truth labels (0 = TB negative, 1 = TB positive).
        y_prob: Scores for the positive class; ``>= threshold`` is positive.
        path: Output image path.
        threshold: Decision threshold.
        title: Figure title.
    """
    y_pred = (np.asarray(y_prob) >= threshold).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 so it always matches the two tick
    # labels below, even when only one class occurs (same convention as
    # metrics_at_thresh).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax,
                xticklabels=['TB Negative', 'TB Positive'], yticklabels=['TB Negative', 'TB Positive'])
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

# ── 6. MAIN TRAINING & FOLD EXPORT LOOP ──────────────────────────────────────
print("\n" + "="*60)
print("4. STARTING V10 TRAINING & WEIGHT EXPORT")
print("="*60)

# Outer CV over participants (participant_df holds one row per participant_id).
# oof_stack collects each participant's calibrated score from the fold in which
# that participant was held out.
sgkf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(participant_df, participant_df["label"]))
oof_stack = np.zeros(len(participant_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")
    df_tr_full, df_te = participant_df.iloc[tr_idx].reset_index(drop=True), participant_df.iloc[te_idx].reset_index(drop=True)
    emb_tr_full, emb_te = participant_embs[tr_idx], participant_embs[te_idx]
    y_tr_full, y_te = df_tr_full["label"].values, df_te["label"].values

    # Hold out one stratified fifth of the training part purely for calibrating
    # the supervisor; the rest trains the experts and the supervisor.
    cal_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    tr_sub_idx, cal_idx = next(cal_fold.split(df_tr_full, y_tr_full))

    df_tr, df_cal = df_tr_full.iloc[tr_sub_idx].reset_index(drop=True), df_tr_full.iloc[cal_idx].reset_index(drop=True)
    emb_tr, emb_cal = emb_tr_full[tr_sub_idx], emb_tr_full[cal_idx]
    y_tr, y_cal = df_tr["label"].values, df_cal["label"].values

    # The metadata preprocessor is fit on the training split only.
    meta_prep = build_meta_preprocessor(num_cols_p, cat_cols)
    X_tr_m = meta_prep.fit_transform(df_tr)
    X_cal_m, X_te_m = meta_prep.transform(df_cal), meta_prep.transform(df_te)
    # Noise augmentation applies to training embeddings only.
    X_tr_emb, X_cal_emb, X_te_emb = add_embedding_noise(emb_tr), emb_cal, emb_te

    # Inner CV: out-of-fold expert scores on the training split, used as the
    # supervisor's training features.
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    tr_oof_a, tr_oof_m = np.zeros(len(y_tr)), np.zeros(len(y_tr))

    for i_tr, i_val in inner_cv.split(X_tr_emb, y_tr):
        n_p, n_n = int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())
        clf_a_i = build_audio_expert(n_p, n_n)
        # Fit on the *clean* embeddings plus one round of noise (X_tr_emb is
        # already noisy, so re-noising it doubled the noise variance), and
        # score the validation part on clean embeddings, matching how the
        # calibration / test / inference inputs are scored.
        clf_a_i.fit(add_embedding_noise(emb_tr[i_tr]), y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_i.predict_proba(emb_tr[i_val])[:, 1]
        
        clf_m_i = build_clinical_expert(n_p, n_n)
        clf_m_i.fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_i.predict_proba(X_tr_m[i_val])[:, 1]

    # Experts refit on the whole training split; these score cal/test rows.
    n_pos, n_neg = int(y_tr.sum()), int((y_tr==0).sum())
    clf_a = build_audio_expert(n_pos, n_neg).fit(X_tr_emb, y_tr)
    clf_m = build_clinical_expert(n_pos, n_neg).fit(X_tr_m, y_tr)

    # Supervisor features: [audio score, clinical score, preprocessed metadata].
    X_tr_stack  = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_cal_stack = np.column_stack([clf_a.predict_proba(X_cal_emb)[:,1], clf_m.predict_proba(X_cal_m)[:,1], X_cal_m])
    X_te_stack  = np.column_stack([clf_a.predict_proba(X_te_emb)[:,1], clf_m.predict_proba(X_te_m)[:,1], X_te_m])

    supervisor = build_supervisor(n_pos, n_neg).fit(X_tr_stack, y_tr)
    # Sigmoid calibration of the already-fitted supervisor on the held-out
    # calibration split.
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid").fit(X_cal_stack, y_cal)

    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:, 1]
    oof_stack[te_idx] = te_prob_stack
    print(f"[*] Fold {fold_i+1} | AUC={roc_auc_score(y_te, te_prob_stack):.4f} | pAUC@90%={partial_auc(y_te, te_prob_stack, 0.90):.4f}")

    # EXPORTING PER-FOLD WEIGHTS
    fold_tag = f"fold{fold_i+1}"
    joblib.dump(clf_a, os.path.join(MODEL_DIR, f"{fold_tag}_audio_expert.pkl"))
    joblib.dump(clf_m, os.path.join(MODEL_DIR, f"{fold_tag}_clinical_expert.pkl"))
    joblib.dump(supervisor, os.path.join(MODEL_DIR, f"{fold_tag}_supervisor.pkl"))
    joblib.dump(cal_supervisor, os.path.join(MODEL_DIR, f"{fold_tag}_calibrated_supervisor.pkl"))
    joblib.dump(meta_prep, os.path.join(MODEL_DIR, f"{fold_tag}_meta_preprocessor.pkl"))

# ── 7. FINAL FULL-DATA MODEL DEPLOYMENT & EVALUATION ─────────────────────────
print("\n" + "="*60)
print("5. SAVING FINAL DEPLOYMENT MODEL & ARTIFACTS")
print("="*60)

# Evaluate the pooled out-of-fold scores: every participant is scored by the
# calibrated supervisor of the fold in which that participant was held out.
participant_df["pred_stack"] = oof_stack
m_part = full_eval(participant_df["label"], participant_df["pred_stack"])

# Save evaluation metrics as JSON / text
with open(os.path.join(EVAL_DIR, "v10_metrics.json"), "w") as f: json.dump(m_part, f, indent=2)

# One-row summary table; values are pre-formatted strings with 4 decimals.
summary_rows = [{"Model": "V10 (Multi-Window + Participant-Level)", "ROC-AUC": f"{m_part.get('roc_auc', 0):.4f}", 
                 "pAUC@85%": f"{m_part.get('pauc_85', 0):.4f}", "pAUC@90%": f"{m_part.get('pauc_90', 0):.4f}",
                 "Spec@Sens=85%": f"{m_part.get('tuned_thresholds',{}).get('sens_85',{}).get('specificity',0):.4f}",
                 "Spec@Sens=90%": f"{m_part.get('tuned_thresholds',{}).get('sens_90',{}).get('specificity',0):.4f}",
                 "Spec@Sens=95%": f"{m_part.get('tuned_thresholds',{}).get('sens_95',{}).get('specificity',0):.4f}"}]
summary_df = pd.DataFrame(summary_rows)

# Human-readable report: the summary table followed by one line per tuned
# sensitivity target.
with open(os.path.join(EVAL_DIR, "v10_metrics_readable.txt"), "w") as f:
    f.write("V10 FINAL EVALUATION\n" + "="*50 + "\n" + summary_df.to_string(index=False) + "\n\n")
    for sk, sv in m_part.get("tuned_thresholds", {}).items():
        f.write(f"[{sk}] threshold={sv['threshold']:.4f}  sens={sv['sensitivity']:.3f}  spec={sv['specificity']:.3f}  f1={sv['f1']:.3f}\n")

# Save Visual Plots
plot_curves(participant_df["label"], participant_df["pred_stack"], f"{FUSION_OUT}/plots/v10_participant", "V10 Participant-Level")
# The confusion matrix is drawn at the threshold tuned for >= 90% sensitivity.
best_t_90 = m_part["tuned_thresholds"]["sens_90"]["threshold"]
plot_confusion_matrix_sns(participant_df["label"], participant_df["pred_stack"], 
                          f"{FUSION_OUT}/plots/v10_participant_cm.png", threshold=best_t_90, title=f"V10 CM (Sens ≥ 90%)")

# Train ONE final deployable model from the full participant set.
# The rows are still split 4:1 (first fold of a stratified 5-fold): experts and
# supervisor are fit on the larger part and the held-out fifth is used only to
# calibrate the supervisor, so no single estimator is fit on 100% of the rows.
print("[*] Training final full-data models for deployment...")
df_all, y_all, X_all_emb = participant_df.copy().reset_index(drop=True), participant_df["label"].values, participant_embs.copy()

cal_fold_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
tr_sub_idx_final, cal_idx_final = next(cal_fold_final.split(df_all, y_all))
df_tr_final, df_cal_final = df_all.iloc[tr_sub_idx_final].reset_index(drop=True), df_all.iloc[cal_idx_final].reset_index(drop=True)
X_tr_emb_final, X_cal_emb_final = X_all_emb[tr_sub_idx_final], X_all_emb[cal_idx_final]
y_tr_final, y_cal_final = df_tr_final["label"].values, df_cal_final["label"].values

# Preprocessor fit on the training split only; X_all_m_final is not used
# further in this section.
meta_prep_final = build_meta_preprocessor(num_cols_p, cat_cols)
X_tr_m_final, X_cal_m_final, X_all_m_final = meta_prep_final.fit_transform(df_tr_final), meta_prep_final.transform(df_cal_final), meta_prep_final.transform(df_all)

# Noise augmentation applies to training embeddings only.
X_tr_emb_final_noisy, X_cal_emb_final_clean, X_all_emb_clean = add_embedding_noise(X_tr_emb_final), X_cal_emb_final, X_all_emb
inner_cv_final = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
tr_oof_a_final, tr_oof_m_final = np.zeros(len(y_tr_final)), np.zeros(len(y_tr_final))

# Inner CV: out-of-fold expert scores that become the supervisor's features.
for i_tr, i_val in inner_cv_final.split(X_tr_emb_final_noisy, y_tr_final):
    n_p_i, n_n_i = int(y_tr_final[i_tr].sum()), int((y_tr_final[i_tr] == 0).sum())
    # Fit on the *clean* embeddings plus one round of noise (the *_noisy array
    # already contains noise, so re-noising it doubled the variance) and score
    # the validation part on clean embeddings, as done for calibration and at
    # inference time.
    clf_a_i = build_audio_expert(n_p_i, n_n_i).fit(add_embedding_noise(X_tr_emb_final[i_tr]), y_tr_final[i_tr])
    tr_oof_a_final[i_val] = clf_a_i.predict_proba(X_tr_emb_final[i_val])[:, 1]
    clf_m_i = build_clinical_expert(n_p_i, n_n_i).fit(X_tr_m_final[i_tr], y_tr_final[i_tr])
    tr_oof_m_final[i_val] = clf_m_i.predict_proba(X_tr_m_final[i_val])[:, 1]

# Class counts of the training split (despite the *_all suffix).
n_pos_all, n_neg_all = int(y_tr_final.sum()), int((y_tr_final == 0).sum())
clf_a_final = build_audio_expert(n_pos_all, n_neg_all).fit(X_tr_emb_final_noisy, y_tr_final)
clf_m_final = build_clinical_expert(n_pos_all, n_neg_all).fit(X_tr_m_final, y_tr_final)

# Supervisor features: [audio score, clinical score, preprocessed metadata].
X_tr_stack_final  = np.column_stack([tr_oof_a_final, tr_oof_m_final, X_tr_m_final])
X_cal_stack_final = np.column_stack([clf_a_final.predict_proba(X_cal_emb_final_clean)[:, 1], clf_m_final.predict_proba(X_cal_m_final)[:, 1], X_cal_m_final])

supervisor_final = build_supervisor(n_pos_all, n_neg_all).fit(X_tr_stack_final, y_tr_final)
cal_supervisor_final = CalibratedClassifierCV(supervisor_final, cv="prefit", method="sigmoid").fit(X_cal_stack_final, y_cal_final)

# EXPORTING MASTER DEPLOYMENT WEIGHTS
joblib.dump(meta_prep_final,        os.path.join(MODEL_DIR, "final_meta_preprocessor.pkl"))
joblib.dump(clf_a_final,            os.path.join(MODEL_DIR, "final_audio_expert.pkl"))
joblib.dump(clf_m_final,            os.path.join(MODEL_DIR, "final_clinical_expert.pkl"))
joblib.dump(supervisor_final,       os.path.join(MODEL_DIR, "final_supervisor.pkl"))
joblib.dump(cal_supervisor_final,   os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl"))

# Inference configuration saved next to the final models.
# NOTE(review): the thresholds stored here were tuned on the pooled
# out-of-fold scores (m_part), i.e. on the per-fold calibrated supervisors,
# not on the final deployment model — confirm they transfer.
deploy_info = {
    "model_name": "TB SCREENING RANKER V10",
    "seed": SEED, "sr": SR, "win_samples": WIN_SAMPLES, "hop_samples": HOP_SAMPLES,
    "embed_dim": EMBED_DIM, "agg_funcs": AGG_FUNCS, "agg_dim": AGG_DIM,
    "num_cols_p": num_cols_p, "cat_cols": cat_cols,
    "target_sens_thresholds": {k: float(v["threshold"]) for k, v in m_part.get("tuned_thresholds", {}).items()}
}
with open(os.path.join(MODEL_DIR, "final_inference_config.json"), "w") as f: json.dump(deploy_info, f, indent=2)

# Summary table and participant-level table (including the "pred_stack"
# out-of-fold score column) as CSV.
summary_df.to_csv(f"{FUSION_OUT}/v10_summary.csv", index=False)
participant_df.to_csv(f"{FUSION_OUT}/v10_oof_predictions.csv", index=False)

# ZIP EVERYTHING FOR DOWNLOAD
# Every file under OUT_ROOT is added with an archive name relative to
# /kaggle/working.
zip_path = "/kaggle/working/outputs_v10.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files: zf.write(os.path.join(root, fn), os.path.relpath(os.path.join(root, fn), "/kaggle/working"))

print("\n✅ V10 PIPELINE COMPLETE & ZIPPED!")
print("\n--- DOWNLOAD LINKS ---")
# Notebook download links for the archive and the key individual artifacts.
display(FileLink(zip_path))
display(FileLink(f"{FUSION_OUT}/plots/v10_participant_roc.png"))
display(FileLink(f"{FUSION_OUT}/plots/v10_participant_cm.png"))
display(FileLink(os.path.join(EVAL_DIR, "v10_metrics_readable.txt")))
display(FileLink(os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl")))

In [None]:
import os
# Must be set to JAX BEFORE importing keras
os.environ["KERAS_BACKEND"] = "jax"
# Cap JAX memory so it doesn't crash Pandas/LightGBM
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.85" 

import joblib
import numpy as np
import pandas as pd
import keras
import keras_hub

# 1. CRITICAL: Force Keras to use half-precision. 
# This shrinks MedGemma 4B from 16GB to 8GB so it fits on a Kaggle T4!
keras.config.set_floatx("float16")

# ============================================================================
# 2. SETUP: LOAD THE V10 ML MODELS
# ============================================================================
print("Loading V10 ML Models...")
MODEL_DIR = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/models"

# SECURITY NOTE: joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source (here: the dataset produced by the V10
# training run).
# Only the pieces needed for scoring are loaded: preprocessor, both experts and
# the calibrated supervisor.
meta_prep      = joblib.load(os.path.join(MODEL_DIR, "final_meta_preprocessor.pkl"))
clf_a          = joblib.load(os.path.join(MODEL_DIR, "final_audio_expert.pkl"))
clf_m          = joblib.load(os.path.join(MODEL_DIR, "final_clinical_expert.pkl"))
cal_supervisor = joblib.load(os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl"))

# ============================================================================
# 3. SIMULATE A WAITING ROOM (Standalone Version)
# ============================================================================
print("Simulating Clinic Waiting Room...")
# Load actual clinical data from your V10 output CSV
csv_path = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/multiwindow_participant/v10_oof_predictions.csv"
df_all = pd.read_csv(csv_path)
# Fixed random_state -> the same 10 participants on every run.
# NOTE(review): sample(10) raises if the CSV has fewer than 10 rows.
df_waiting_room = df_all.sample(10, random_state=99).reset_index(drop=True)

# Generate dummy HeAR embeddings (2560-dim) just for this UI/Agent test
# (In your real deployment later, this will come from the live microphone audio)
# NOTE(review): the RNG is not seeded in this cell, so the audio scores (and
# therefore the ranking) can change between runs; the 10 here must match the
# sample size above.
X_audio_waiting_room = np.random.randn(10, 2560).astype(np.float32)

# ============================================================================
# 4. THE RANKING ENGINE
# ============================================================================
def rank_patients(df_meta, X_audio):
    """Score patients and return them as a triage queue, highest risk first.

    Uses the module-level ``meta_prep``, ``clf_a``, ``clf_m`` and
    ``cal_supervisor``. The supervisor input is
    ``[audio score, clinical score, preprocessed metadata]``.

    Args:
        df_meta: Patient metadata, one row per patient.
        X_audio: Audio feature matrix, row-aligned with ``df_meta``.

    Returns:
        DataFrame with the available display columns plus ``audio_risk``,
        ``clinic_risk`` and ``final_triage_score`` (rounded to 3 decimals),
        sorted by ``final_triage_score`` descending with a 1-based rank index.
    """
    meta_features = meta_prep.transform(df_meta)
    audio_prob = clf_a.predict_proba(X_audio)[:, 1]
    clinic_prob = clf_m.predict_proba(meta_features)[:, 1]

    stacked = np.column_stack([audio_prob, clinic_prob, meta_features])
    triage_prob = cal_supervisor.predict_proba(stacked)[:, 1]

    # Show only the identifying / symptom columns that are actually present.
    wanted = ('participant_id', 'age', 'sex', 'weight_loss', 'night_sweats')
    shown = [name for name in wanted if name in df_meta.columns]

    queue = df_meta[shown].copy()
    for column, scores in (('audio_risk', audio_prob),
                           ('clinic_risk', clinic_prob),
                           ('final_triage_score', triage_prob)):
        queue[column] = np.round(scores, 3)

    ranked = queue.sort_values(by='final_triage_score', ascending=False).reset_index(drop=True)
    ranked.index = ranked.index + 1  # ranks start at 1
    return ranked

# Rank the simulated waiting room and show the score columns.
# NOTE(review): this selection assumes 'participant_id' is present in the CSV;
# rank_patients itself only keeps it when the column exists.
ranked_queue = rank_patients(df_waiting_room, X_audio_waiting_room)
print("\n--- SORTED TRIAGE QUEUE ---")
print(ranked_queue[['participant_id', 'audio_risk', 'clinic_risk', 'final_triage_score']])

# ============================================================================
# 5. LOAD MEDGEMMA 4B (Memory-Optimized)
# ============================================================================
print("\nLoading MedGemma 4B into GPU... (This takes a minute)")
MEDGEMMA_PATH = "/kaggle/input/models/keras/medgemma/keras/medgemma_4b/1"

# Loading is best-effort: on any failure the error is printed and `medgemma`
# is set to None, which generate_clinical_justification checks for.
try:
    # dtype="float16" combined with set_floatx prevents the OOM crash
    medgemma = keras_hub.models.CausalLM.from_preset(MEDGEMMA_PATH, dtype="float16")
    medgemma.compile(sampler="greedy") 
    print("✓ MedGemma 4B Loaded Successfully!")
except Exception as e:
    print(f"⚠ Warning: MedGemma failed to load.\nError: {e}")
    medgemma = None 

# ============================================================================
# 6. EXECUTE THE AGENTIC WORKFLOW
# ============================================================================
def generate_clinical_justification(patient_row):
    """Ask MedGemma for a short justification of a patient's triage priority.

    Args:
        patient_row: Row of the ranked queue (Series or dict-like). Must
            contain ``audio_risk`` and ``clinic_risk``; ``age``, ``sex``,
            ``weight_loss`` and ``night_sweats`` are optional.

    Returns:
        The generated text with the prompt stripped, or "LLM Not Loaded." when
        the module-level ``medgemma`` is None.
    """
    # Bail out before touching the row so a missing LLM never turns into a
    # KeyError on the score columns.
    if medgemma is None: return "LLM Not Loaded."

    def _field(name):
        # Missing values (absent column, NaN, None) are shown as 'Unknown'
        # instead of leaking "nan" / "None" into the prompt.
        value = patient_row.get(name, 'Unknown')
        return 'Unknown' if pd.isna(value) else value

    age = _field('age')
    sex = _field('sex')
    weight_loss = _field('weight_loss')
    night_sweats = _field('night_sweats')
    
    prompt = (
        f"You are an expert AI Triage Assistant in a tuberculosis clinic. "
        f"You have limited GeneXpert tests available today.\n\n"
        f"PATIENT DATA:\n"
        f"- Demographics: {age} year old {sex}.\n"
        f"- Symptoms: Weight loss is {weight_loss}, Night Sweats is {night_sweats}.\n"
        f"- AI Assessment: Acoustic Cough Risk Score is {patient_row['audio_risk']} out of 1.0. "
        f"Clinical Risk Score is {patient_row['clinic_risk']} out of 1.0.\n\n"
        f"TASK: Write a concise, professional 2-sentence clinical justification explaining why this patient "
        f"has been assigned a high priority triage rank for a GeneXpert test today.\n\n"
        f"JUSTIFICATION:\n"
    )
    
    response = medgemma.generate(prompt, max_length=256)
    # Presumably the returned text echoes the prompt; remove it so only the
    # completion remains (verify against the model's generate() behaviour).
    justification = response.replace(prompt, "").strip()
    return justification

print("\n--- MEDGEMMA CLINICAL EXPLANATIONS FOR TOP 3 PATIENTS ---")
# ranked_queue is sorted by final_triage_score (descending) with a 1-based
# index, so the index value doubles as the patient's rank.
for rank, patient in ranked_queue.head(3).iterrows():
    print(f"\nEvaluating Rank #{rank} (ID: {patient.get('participant_id', 'Unknown')})")
    print("Generating LLM Justification...")
    explanation = generate_clinical_justification(patient)
    print(f"MEDGEMMA OUTPUT:\n{explanation}")

In [1]:
"""
=====================================================================================
TB SCREENING RANKER: MASTER TRAINING & EXPORT PIPELINE (VERSION 10)
=====================================================================================

DESCRIPTION:
This script is the final, production-ready build of the CODA-TB screening pipeline. 
It trains decoupled models (Audio and Metadata) using an Out-of-Fold (OOF) Stacking 
architecture. It rigorously evaluates the models using Partial AUC and automatically 
exports all weights, plots, and evaluation metrics for cloud deployment.

KEY FEATURES IN V10:
1. Robust Multi-Window Audio: Dynamically slices audio of any length into overlapping 
   2-second windows, extracting a rich 2560-dim acoustic fingerprint via Google HeAR.
2. Decoupled Experts: Trains independent LightGBM models for Audio and Clinical Metadata.
3. Master Supervisor: Fuses the independent probabilities to mathematically rank patients.
4. Deployment Ready: Trains a final 100%-data master model and exports all `.pkl` 
   weights alongside a downloadable ZIP archive.
=====================================================================================
"""

import os, sys, json, warnings, random, hashlib, zipfile, shutil
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import FileLink, display

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

import sklearn, librosa, joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, accuracy_score,
                             f1_score, confusion_matrix, roc_curve)

try:
    import lightgbm as lgb; HAS_LGB = True
except ImportError:
    HAS_LGB = False

# ── 1. CONFIGURATION & DIRECTORY SETUP ─────────────────────────────────────────
# Input locations (Kaggle dataset layout).
BASE       = "/kaggle/input/tb-audio/Tuberculosis"
META       = f"{BASE}/metadata"
AUDIO_BASE = f"{BASE}/raw_data/solicited_data"

CLINICAL_CSV  = f"{META}/CODA_TB_Clinical_Meta_Info.csv"
SOLICITED_CSV = f"{META}/CODA_TB_Solicited_Meta_Info.csv"

SR          = 16_000   # sample rate audio is resampled to on load
WIN_SAMPLES = 32_000   # 2s @ 16kHz — HeAR hard constraint
HOP_SAMPLES = 16_000   # 50% overlap for multi-window extraction
EMBED_DIM   = 512      # width of one window embedding (presumably HeAR's output size — confirm)
N_SPLITS    = 5        # number of outer cross-validation folds
TARGET_SENS = [0.85, 0.90, 0.95]   # sensitivity targets for threshold tuning
PAUC_LOW    = 0.85     # default TPR floor for the partial AUC

# Aggregation stats computed over the windows of each recording (recordings
# are averaged per participant afterwards).
AGG_FUNCS   = ["mean", "std", "p25", "p50", "p75"]
AGG_DIM     = EMBED_DIM * len(AGG_FUNCS)  # 2560
EMB_NOISE_STD = 0.01   # std of the Gaussian noise added to training embeddings

# Output Directories
OUT_ROOT   = "/kaggle/working/outputs_v10"
FUSION_OUT = os.path.join(OUT_ROOT, "multiwindow_participant")
CACHE_DIR  = os.path.join(OUT_ROOT, "cache")
MODEL_DIR  = os.path.join(OUT_ROOT, "models")
EVAL_DIR   = os.path.join(OUT_ROOT, "eval")

# Create every output directory up front (no error if it already exists).
for d in [FUSION_OUT, CACHE_DIR, f"{FUSION_OUT}/plots", MODEL_DIR, EVAL_DIR]:
    os.makedirs(d, exist_ok=True)

# Parquet file that caches per-recording aggregated embeddings between runs.
EMBED_CACHE = os.path.join(CACHE_DIR, "hear_multiwindow_embeddings.parquet")

# ── 2. DATA LOADING & HARMONISATION ─────────────────────────────────────────
print("\n" + "="*60)
print("1. LOADING & HARMONISING DATA")
print("="*60)

def harmonise_cols(df):
    """Rename known column aliases to canonical names.

    Matching is case-insensitive on the whole column name. For each canonical
    name (``participant_id``, ``filename``, ``label_raw``) the aliases are
    tried in order and only the first one present is renamed. Returns a renamed
    copy; ``df`` itself is left untouched.
    """
    alias_groups = (
        ("participant_id", ("participant_id", "participant", "subject_id")),
        ("filename", ("filename", "file_name", "audio_file", "wav_file", "cough_file")),
        ("label_raw", ("tb_status", "tb", "label", "target", "tb_result")),
    )
    by_lower = {col.lower(): col for col in df.columns}

    mapping = {}
    for canonical, aliases in alias_groups:
        found = next((alias for alias in aliases if alias in by_lower), None)
        if found is not None:
            mapping[by_lower[found]] = canonical
    return df.rename(columns=mapping)

def binarise_label(series):
    """Map raw TB labels to integers, element-wise.

    Missing values stay NaN. Known positive / negative spellings
    (case-insensitive, surrounding whitespace ignored) map to 1 / 0. Any other
    value that parses as a number is truncated to its integer value (so "1.0"
    becomes 1, but e.g. "2" is passed through as 2); everything else becomes
    NaN.
    """
    positive = {"1", "yes", "positive", "tb+", "tb_positive", "true", "pos"}
    negative = {"0", "no", "negative", "tb-", "tb_negative", "false", "neg"}

    def _b(v):
        if pd.isna(v):
            return np.nan
        s = str(v).strip().lower()
        if s in positive:
            return 1
        if s in negative:
            return 0
        try:
            return int(float(s))
        except (ValueError, OverflowError):
            # ValueError: not a number (or "nan"); OverflowError: "inf".
            return np.nan

    return series.apply(_b)

# Load both metadata tables with canonical column names
# (participant_id / filename / label_raw where an alias was found).
df_audio    = harmonise_cols(pd.read_csv(SOLICITED_CSV))
df_clinical = harmonise_cols(pd.read_csv(CLINICAL_CSV))

# If the per-recording table carries no label, take it from the clinical table.
# NOTE(review): if neither table has a recognised label column, the
# df_audio["label_raw"] lookup below raises KeyError.
if "label_raw" not in df_audio.columns and "label_raw" in df_clinical.columns:
    df_audio = df_audio.merge(df_clinical[["participant_id", "label_raw"]], on="participant_id", how="left")

# Recordings whose label cannot be interpreted are dropped.
df_audio["label"] = binarise_label(df_audio["label_raw"])
df_audio = df_audio.dropna(subset=["label"]).reset_index(drop=True)
df_audio["label"] = df_audio["label"].astype(int)

# Clinical feature selection & Auditing
# Any column whose lower-cased name contains one of these keywords is treated
# as post-diagnostic / label information and excluded from the features.
POST_DIAG_KW = ["sputum","culture","smear","xpert","dst","microscopy","molecular",
                "confirmatory","tb_status","label"]
skip_cols = set(POST_DIAG_KW) | {"participant_id"}
num_cols, cat_cols, dropped_cols = [], [], []

print("\n" + "-"*50)
print("🔍 FEATURE AUDIT: METADATA & CLINICAL VARIABLES")
print("-"*50)
print(f"[*] Total Available Features in Metadata ({len(df_clinical.columns)}):")
print(list(df_clinical.columns))

# Partition the remaining clinical columns into numeric and categorical.
for c in df_clinical.columns:
    # Substring match on the keywords, exact match on skip_cols (this is what
    # removes participant_id).
    if any(kw in c.lower() for kw in POST_DIAG_KW) or c in skip_cols:
        dropped_cols.append(c)
        continue
    # NOTE(review): only these four NumPy dtypes count as numeric; any other
    # dtype (e.g. bool, int16, pandas nullable ints) lands in cat_cols.
    if df_clinical[c].dtype in (np.float64, np.float32, np.int64, np.int32): num_cols.append(c)
    else: cat_cols.append(c)

print(f"\n[*] Excluded Features (Data Leakage/IDs) ({len(dropped_cols)}):")
print(dropped_cols)

# Make sure a country / site column (if any) is treated as categorical.
# Only columns that survived the leakage filter above are considered, so a
# column that was dropped as post-diagnostic cannot be re-added here, and a
# numerically coded country column is *moved* from num_cols to cat_cols rather
# than listed in both (which would duplicate it in the merge below).
country_col = None
feature_cols = set(num_cols) | set(cat_cols)
for hint in ["country", "site", "country_id", "collection_country"]:
    matches = [c for c in df_clinical.columns if hint in c.lower() and c in feature_cols]
    if matches:
        country_col = matches[0]
        if country_col in num_cols: num_cols.remove(country_col)
        if country_col not in cat_cols: cat_cols.append(country_col)
        break

# Attach the selected clinical features to every recording.
cough_df = df_audio.merge(df_clinical[["participant_id"] + num_cols + cat_cols],
                          on="participant_id", how="left")

# Audio file mapping
# Index every audio file under AUDIO_BASE both by its file name and by its
# name without extension. If the same name occurs in several directories, the
# one visited last by os.walk wins.
lookup = {}
for dirpath, _, fns in os.walk(AUDIO_BASE):
    for fn in fns:
        if fn.lower().endswith((".wav",".ogg",".flac",".mp3")):
            lookup[fn] = os.path.join(dirpath, fn)
            lookup[os.path.splitext(fn)[0]] = os.path.join(dirpath, fn)

# Resolve each metadata filename (with or without extension) to a path;
# recordings without a matching file are dropped.
cough_df["audio_path"] = cough_df["filename"].apply(
    lambda x: lookup.get(str(x), lookup.get(os.path.splitext(str(x))[0], np.nan)))
cough_df = cough_df.dropna(subset=["audio_path"]).reset_index(drop=True)

# ── 3. HeAR MODEL LOADING WITH KAGGLE AUTHENTICATION ───────────────────────
print("\n" + "="*60)
print("2. LOADING GOOGLE HeAR MODEL (GATED REPO FIX)")
print("="*60)
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login, from_pretrained_keras
import tensorflow as tf

# Authenticate against Hugging Face with the HF_TOKEN Kaggle secret; the model
# download below is only attempted after login() has succeeded.
try:
    _sec = UserSecretsClient()
    hf_token = _sec.get_secret("HF_TOKEN")
    login(token=hf_token)
except Exception as e:
    # NOTE(review): every failure in the block above (secret lookup *or* the
    # login call itself) is reported with this same message; the original
    # exception stays attached as __cause__ through `from e`.
    raise ValueError("Could not find HF_TOKEN in Kaggle Secrets.") from e

# Load the HeAR model and keep a handle on its "serving_default" signature,
# which _infer_batch calls for every batch of windows.
HEAR_MODEL   = from_pretrained_keras("google/hear")
HEAR_SERVING = HEAR_MODEL.signatures["serving_default"]

def _infer_batch(segments: list) -> np.ndarray:
    """Run one batch of audio windows through the HeAR serving signature.

    Args:
        segments: Equal-length 1-D arrays; they are stacked into one 2-D
            float32 batch before the call.

    Returns:
        The first output of the signature as a float32 NumPy array
        (presumably one embedding row per segment — confirm against the
        model's signature).
    """
    batch = tf.constant(np.stack(segments), dtype=tf.float32)
    outputs = HEAR_SERVING(x=batch)
    first_output = next(iter(outputs.values()))
    return first_output.numpy().astype(np.float32)

# ── 4. AUDIO PROCESSING & EMBEDDING ──────────────────────────────────────────
def load_audio(path: str):
    """Load an audio file as a mono waveform resampled to ``SR``.

    Loading is best-effort: if the file cannot be decoded, a silent buffer of
    ``WIN_SAMPLES`` float32 zeros is returned so that one bad recording does
    not abort the whole run. The failure is printed instead of being
    swallowed silently.

    Args:
        path: Location of the audio file (converted with ``str()``).

    Returns:
        1-D NumPy array with the waveform, or zeros on failure.
    """
    try:
        audio, _ = librosa.load(str(path), sr=SR, mono=True)
        return audio
    except Exception as exc:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit still propagate and a long run can be interrupted.
        print(f"[!] Could not load audio '{path}' ({type(exc).__name__}: {exc}); using silence.")
        return np.zeros(WIN_SAMPLES, np.float32)

def extract_windows(audio: np.ndarray, win_samples=None, hop_samples=None) -> list:
    """Slice a waveform into fixed-length float32 windows.

    * Empty input yields a single all-zero window.
    * Input shorter than one window is mirrored (signal + reversed signal) and
      tiled until it fills exactly one window.
    * Otherwise windows start every ``hop_samples`` samples; one extra window
      aligned to the end of the signal is added only when the regular grid
      does not already end on the last sample.

    The end-aligned window used to be appended unconditionally, which
    duplicated the last regular window whenever ``len(audio) - win_samples``
    was a multiple of the hop (e.g. audio of exactly one window length
    produced two identical windows).

    Args:
        audio: 1-D waveform.
        win_samples: Window length in samples; defaults to ``WIN_SAMPLES``.
        hop_samples: Hop between window starts; defaults to ``HOP_SAMPLES``.

    Returns:
        List of float32 arrays, each ``win_samples`` long (never empty).

    Raises:
        ValueError: If the window or hop length is not positive.
    """
    win = WIN_SAMPLES if win_samples is None else int(win_samples)
    hop = HOP_SAMPLES if hop_samples is None else int(hop_samples)
    if win <= 0 or hop <= 0:
        raise ValueError("win_samples and hop_samples must be positive")

    n = len(audio)
    if n == 0:
        return [np.zeros(win, np.float32)]
    if n < win:
        mirrored = np.concatenate((audio, audio[::-1]))
        repeats = int(np.ceil(win / len(mirrored)))
        return [np.tile(mirrored, repeats)[:win].astype(np.float32)]

    last_start = n - win
    starts = list(range(0, last_start + 1, hop))
    # Cover the tail only if the regular grid stopped short of the end.
    if starts[-1] != last_start:
        starts.append(last_start)
    return [audio[s:s + win].astype(np.float32) for s in starts]

def aggregate_embeddings(emb_matrix: np.ndarray) -> np.ndarray:
    """Collapse per-window embeddings (one row each) into one feature vector.

    For two or more rows the layout is ``[mean | std | p25 | p50 | p75]``,
    each computed column-wise and returned as float32. A single row is passed
    through as-is and followed by ``EMBED_DIM * 4`` zeros in place of the four
    spread statistics.
    """
    n_windows = emb_matrix.shape[0]
    if n_windows == 1:
        padding = np.zeros(EMBED_DIM * 4, np.float32)
        return np.concatenate([emb_matrix[0], padding])

    centre = emb_matrix.mean(axis=0)
    spread = emb_matrix.std(axis=0)
    quartiles = np.percentile(emb_matrix, [25, 50, 75], axis=0)
    stacked = np.concatenate([centre, spread, *quartiles])
    return stacked.astype(np.float32)

print("\n[*] Pre-fetching multi-window HeAR embeddings...")
def get_multiwindow_embeddings(df_rows):
    """Return aggregated HeAR embeddings and window counts for each row.

    Results are cached in the parquet file EMBED_CACHE, keyed by the MD5 of
    each row's audio_path. Only recordings missing from the cache are loaded,
    windowed, embedded in batches and appended to the cache.

    Args:
        df_rows: DataFrame with an ``audio_path`` column.

    Returns:
        (agg, nw): ``agg`` is an (N, AGG_DIM) float32 array of aggregated
        embeddings and ``nw`` an (N,) int32 array of window counts, both in
        the row order of ``df_rows``.
    """
    empty_cache = pd.DataFrame(columns=["key", "agg_embedding", "n_windows"])
    if os.path.exists(EMBED_CACHE):
        try:
            cache = pd.read_parquet(EMBED_CACHE)
        except Exception:
            # Best-effort: an unreadable cache just means everything is
            # recomputed. Narrowed from a bare `except` so Ctrl-C still works.
            cache = empty_cache
    else:
        cache = empty_cache

    N = len(df_rows)
    agg = np.zeros((N, AGG_DIM), np.float32)
    nw  = np.zeros(N, np.int32)
    paths = df_rows["audio_path"].tolist()
    keys = [hashlib.md5(str(p).encode()).hexdigest() for p in paths]
    cached_keys = set(cache["key"].tolist()) if not cache.empty else set()

    # Schedule every uncached recording exactly once. Rows sharing an
    # audio_path share a key; processing them twice would reset the pending
    # entry while its windows are still buffered and corrupt the aggregate.
    scheduled = set()
    need = []
    for i, k in enumerate(keys):
        if k in cached_keys or k in scheduled:
            continue
        scheduled.add(k)
        need.append(i)

    BATCH = 64
    buf_segs, buf_meta, new_entries = [], [], {}

    def _flush():
        # Embed the buffered windows and route each embedding to its recording.
        batch_embs = _infer_batch(buf_segs)
        for bk, be in zip(buf_meta, batch_embs):
            new_entries[bk]["embs"].append(be)
        buf_segs.clear()
        buf_meta.clear()

    for i in tqdm(need, desc="Extracting embeddings", leave=False):
        windows = extract_windows(load_audio(paths[i]))
        k = keys[i]
        new_entries[k] = {"n_windows": len(windows), "embs": []}
        for seg in windows:
            buf_segs.append(seg)
            buf_meta.append(k)
            if len(buf_segs) >= BATCH:
                _flush()
    if buf_segs:
        _flush()

    new_rows = [
        {
            "key": k,
            "agg_embedding": aggregate_embeddings(np.stack(v["embs"])).tolist(),
            "n_windows": v["n_windows"],
        }
        for k, v in new_entries.items()
    ]

    if new_rows:
        cache = pd.concat([cache, pd.DataFrame(new_rows)], ignore_index=True)
        cache.to_parquet(EMBED_CACHE, index=False)

    # Fill the outputs from the (now complete) cache in df_rows order.
    cache_dict = {row["key"]: row for _, row in cache.iterrows()}
    for i, k in enumerate(keys):
        if k in cache_dict:
            r = cache_dict[k]
            val = r["agg_embedding"]
            agg[i] = np.array(val, np.float32) if not isinstance(val, np.ndarray) else val
            nw[i]  = int(r["n_windows"])
    return agg, nw

all_agg_embs, cough_df["n_cough_windows"] = get_multiwindow_embeddings(cough_df)

# Participant aggregation: one metadata record and one mean embedding per
# participant, in order of first appearance in cough_df.
print("\n[*] Aggregating to participant level...")
unique_pids = cough_df["participant_id"].unique()
meta_cols = num_cols + cat_cols
pid_records = []
for pid in unique_pids:
    mask = cough_df["participant_id"] == pid
    rows = cough_df.loc[mask]
    first_row = rows.iloc[0]
    rec = {
        "participant_id": pid,
        "label": rows["label"].values[0],
        "n_recordings": mask.sum(),
        "n_cough_windows_total": int(rows["n_cough_windows"].sum()),
    }
    # Metadata is taken from the participant's first recording row.
    for col in meta_cols:
        rec[col] = first_row.get(col, np.nan)
    mean_emb = all_agg_embs[mask].mean(axis=0)
    pid_records.append((rec, mean_emb))

participant_df = pd.DataFrame([rec for rec, _ in pid_records]).reset_index(drop=True)
participant_embs = np.stack([emb for _, emb in pid_records])
num_cols_p = num_cols + ["n_recordings", "n_cough_windows_total"]

print(f"\n[*] Final USED Numerical Features ({len(num_cols_p)}):")
print(num_cols_p)
print(f"\n[*] Final USED Categorical Features ({len(cat_cols)}):")
print(cat_cols)
print("-" * 50)

# ── 5. MODEL BUILDERS & HELPERS ──────────────────────────────────────────────
def build_meta_preprocessor(num_c, cat_c):
    """Build the (unfitted) ColumnTransformer for the metadata columns.

    Numeric columns: median imputation with missing-value indicator columns,
    then standard scaling. Categorical columns: missing values replaced by the
    constant "Missing", then dense one-hot encoding that ignores categories
    unseen at fit time. All other columns are dropped.

    Args:
        num_c: names of the numeric columns.
        cat_c: names of the categorical columns.
    """
    numeric_steps = Pipeline([
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("sc", StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    transformers = [
        ("num", numeric_steps, num_c),
        ("cat", categorical_steps, cat_c),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def add_embedding_noise(X):
    """Return X plus zero-mean Gaussian noise with std EMB_NOISE_STD.

    The noise is drawn from the global NumPy RNG with the shape of X and cast
    to float32 before being added.
    """
    noise = np.random.normal(0, EMB_NOISE_STD, X.shape)
    return X + noise.astype(np.float32)

def build_audio_expert(n_pos, n_neg):
    """Create the unfitted classifier used on audio embeddings.

    Uses LightGBM when it is available, weighting positives by
    n_neg / max(n_pos, 1); otherwise falls back to a class-balanced
    logistic regression.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced", max_iter=3000)
    return lgb.LGBMClassifier(
        n_estimators=400,
        learning_rate=0.02,
        num_leaves=31,
        colsample_bytree=0.4,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )

def build_clinical_expert(n_pos, n_neg):
    """Create the unfitted classifier used on preprocessed metadata.

    Uses a shallow LightGBM model when available, weighting positives by
    n_neg / max(n_pos, 1); otherwise falls back to a class-balanced
    logistic regression.

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(class_weight="balanced")
    return lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.02,
        num_leaves=15,
        max_depth=4,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )

def build_supervisor(n_pos, n_neg):
    """Create the unfitted stacking model that combines the two experts.

    Uses a small LightGBM model when available; otherwise a logistic
    regression whose positive class is weighted by n_neg / max(n_pos, 1).

    Args:
        n_pos: number of positive training samples.
        n_neg: number of negative training samples.
    """
    scale = n_neg / max(n_pos, 1)
    if not HAS_LGB:
        return LogisticRegression(
            class_weight={0: 1.0, 1: scale},
            max_iter=2000,
            random_state=SEED,
        )
    return lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.02,
        num_leaves=7,
        max_depth=3,
        scale_pos_weight=scale,
        random_state=SEED,
        verbose=-1,
        n_jobs=-1,
    )

# Fixed Partial AUC Formulation
def partial_auc(y_true, y_prob, low_tpr=PAUC_LOW):
    """Width-normalised ROC area restricted to the region with TPR >= low_tpr.

    The trapezoidal area under the ROC points whose TPR is at least
    ``low_tpr`` is divided by the FPR span those points cover, so the result
    is the average TPR over that span. Because every retained point has
    TPR >= low_tpr, the value lies in [low_tpr, 1] whenever the span is
    non-degenerate (it is not spread over [0, 1]).

    Args:
        y_true: binary ground-truth labels.
        y_prob: predicted scores for the positive class.
        low_tpr: minimum true-positive rate of the region of interest.

    Returns:
        The normalised partial area as a float; 0.0 when fewer than two ROC
        points satisfy TPR >= low_tpr.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    mask = tpr >= low_tpr
    if mask.sum() < 2:
        return 0.0
    sub_fpr, sub_tpr = fpr[mask], tpr[mask]
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    area = float(trapezoid(sub_tpr, sub_fpr))
    # Guard against a zero-width FPR span to avoid dividing by zero.
    max_area = max(1e-9, 1.0 * (sub_fpr[-1] - sub_fpr[0]))
    return float(area / max_area)

def metrics_at_thresh(y_true, y_prob, t=0.5):
    """Compute binary classification metrics at decision threshold ``t``.

    A sample is predicted positive when its score is >= t. Sensitivity and
    specificity carry a 1e-9 term in the denominator so an absent class gives
    0 rather than a division error.

    Returns:
        dict with keys threshold, accuracy, sensitivity, specificity, f1.
    """
    y_pred = (np.array(y_prob) >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    sensitivity = tp / (tp + fn + 1e-9)
    specificity = tn / (tn + fp + 1e-9)
    result = {"threshold": float(t)}
    result["accuracy"] = float(accuracy_score(y_true, y_pred))
    result["sensitivity"] = sensitivity
    result["specificity"] = specificity
    result["f1"] = float(f1_score(y_true, y_pred, zero_division=0))
    return result

def find_thresh_for_sens(y_true, y_prob, target):
    """Pick the threshold with the best specificity at sensitivity >= target.

    Candidate thresholds are the unique scores rounded to 4 decimals, scanned
    from highest to lowest. On equal specificity the later (lower) candidate
    wins. Returns 0.0 when no candidate reaches the target sensitivity.
    """
    # np.unique already returns sorted values; reverse for a descending scan.
    candidates = np.unique(np.round(y_prob, 4))[::-1]
    chosen_t = 0.0
    chosen_spec = 0.0
    for cand in candidates:
        stats = metrics_at_thresh(y_true, y_prob, cand)
        if stats["sensitivity"] >= target:
            if stats["specificity"] >= chosen_spec:
                chosen_spec = stats["specificity"]
                chosen_t = cand
    return float(chosen_t)

def full_eval(y_true, y_prob):
    """Assemble the evaluation report for one set of predictions.

    Returns:
        dict with ``roc_auc``, ``pauc_85``, ``pauc_90`` and
        ``tuned_thresholds``; the latter maps ``sens_<pct>`` (one entry per
        value in TARGET_SENS) to the threshold found for that sensitivity
        target together with the metrics at that threshold.
    """
    report = {
        "roc_auc": float(roc_auc_score(y_true, y_prob)),
        "pauc_85": partial_auc(y_true, y_prob, 0.85),
        "pauc_90": partial_auc(y_true, y_prob, 0.90),
        "tuned_thresholds": {},
    }
    tuned = report["tuned_thresholds"]
    for ts in TARGET_SENS:
        thresh = find_thresh_for_sens(y_true, y_prob, ts)
        entry = {"threshold": thresh}
        entry.update(metrics_at_thresh(y_true, y_prob, thresh))
        tuned[f"sens_{int(ts*100)}"] = entry
    return report

# Plotting Helpers 
def plot_curves(y_true, y_prob, path_prefix, title_prefix):
    """Save a two-panel ROC figure to ``<path_prefix>_roc.png``.

    Left panel: full ROC curve with the chance diagonal and the AUC in the
    legend. Right panel: the same curve zoomed to TPR in [0.8, 1.0] with the
    region TPR >= 0.85 shaded and its partial AUC in the legend.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    fig, axes = plt.subplots(1, 2, figsize=(11, 4))
    ax_full, ax_zoom = axes[0], axes[1]

    auc_label = f"AUC={roc_auc_score(y_true, y_prob):.3f}"
    ax_full.plot(fpr, tpr, color="#e63946", lw=2, label=auc_label)
    ax_full.plot([0,1],[0,1],"--",color="gray",lw=1)
    ax_full.set(title=f"{title_prefix} — Full ROC", xlabel="FPR", ylabel="TPR")
    ax_full.legend()

    high_tpr = tpr >= 0.85
    pauc_label = f"pAUC@85%={partial_auc(y_true,y_prob,0.85):.3f}"
    ax_zoom.fill_between(fpr[high_tpr], tpr[high_tpr], 0.85, alpha=0.25, color="#457b9d", label=pauc_label)
    ax_zoom.plot(fpr, tpr, color="#e63946", lw=2)
    ax_zoom.set(xlim=[0, 1], ylim=[0.8, 1.0], title="Partial AUC (TPR≥85%)", xlabel="FPR", ylabel="TPR")
    ax_zoom.legend(fontsize=8)

    fig.tight_layout()
    fig.savefig(f"{path_prefix}_roc.png", dpi=150)
    plt.close(fig)

def plot_confusion_matrix_sns(y_true, y_prob, path, threshold=0.5, title="Confusion Matrix"):
    """Save a confusion-matrix heatmap for predictions at ``threshold``.

    Args:
        y_true: binary ground-truth labels (0 = TB negative, 1 = TB positive).
        y_prob: predicted scores; a sample is positive when score >= threshold.
        path: output image path.
        threshold: decision threshold.
        title: figure title.
    """
    y_pred = (np.array(y_prob) >= threshold).astype(int)
    # Pin the label order so the matrix is always 2x2 and matches the two
    # tick labels below, even if one class is absent (same convention as
    # metrics_at_thresh).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax,
                xticklabels=['TB Negative', 'TB Positive'], yticklabels=['TB Negative', 'TB Positive'])
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

# ── 6. MAIN TRAINING & FOLD EXPORT LOOP ──────────────────────────────────────
# Participant-level cross-validation of the stacked model (audio expert +
# clinical expert -> supervisor -> sigmoid calibration). Each fold's
# calibrated test predictions are written into oof_stack, and every fitted
# component of the fold is pickled to MODEL_DIR.
# NOTE: add_embedding_noise draws from the global NumPy RNG, so results depend
# on the order of the calls below.
print("\n" + "="*60)
print("4. STARTING V10 TRAINING & WEIGHT EXPORT")
print("="*60)

# One row per participant, so a plain stratified split on the label is used.
sgkf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
folds = list(sgkf.split(participant_df, participant_df["label"]))
oof_stack = np.zeros(len(participant_df))

for fold_i, (tr_idx, te_idx) in enumerate(folds):
    print(f"\n--- FOLD {fold_i+1}/{N_SPLITS} ---")
    df_tr_full, df_te = participant_df.iloc[tr_idx].reset_index(drop=True), participant_df.iloc[te_idx].reset_index(drop=True)
    emb_tr_full, emb_te = participant_embs[tr_idx], participant_embs[te_idx]
    y_tr_full, y_te = df_tr_full["label"].values, df_te["label"].values

    # Hold out the first of five stratified splits of the training fold
    # (one fifth of it) for probability calibration.
    cal_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    tr_sub_idx, cal_idx = next(cal_fold.split(df_tr_full, y_tr_full))

    df_tr, df_cal = df_tr_full.iloc[tr_sub_idx].reset_index(drop=True), df_tr_full.iloc[cal_idx].reset_index(drop=True)
    emb_tr, emb_cal = emb_tr_full[tr_sub_idx], emb_tr_full[cal_idx]
    y_tr, y_cal = df_tr["label"].values, df_cal["label"].values

    # The metadata preprocessor is fitted on the training sub-split only and
    # then applied to the calibration and test rows.
    meta_prep = build_meta_preprocessor(num_cols_p, cat_cols)
    X_tr_m = meta_prep.fit_transform(df_tr)
    X_cal_m, X_te_m = meta_prep.transform(df_cal), meta_prep.transform(df_te)
    # Noise is added to the training embeddings only; calibration and test
    # embeddings stay clean.
    X_tr_emb, X_cal_emb, X_te_emb = add_embedding_noise(emb_tr), emb_cal, emb_te

    # Inner 4-fold CV: out-of-fold expert probabilities on the training
    # sub-split, which become the supervisor's training inputs.
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    tr_oof_a, tr_oof_m = np.zeros(len(y_tr)), np.zeros(len(y_tr))

    for i_tr, i_val in inner_cv.split(X_tr_emb, y_tr):
        n_p, n_n = int(y_tr[i_tr].sum()), int((y_tr[i_tr]==0).sum())
        clf_a_i = build_audio_expert(n_p, n_n)
        # NOTE(review): X_tr_emb is already noised, so the inner audio fit
        # sees a second layer of noise, and the inner validation predictions
        # are made on noised embeddings -- confirm this is intended.
        clf_a_i.fit(add_embedding_noise(X_tr_emb[i_tr]), y_tr[i_tr])
        tr_oof_a[i_val] = clf_a_i.predict_proba(X_tr_emb[i_val])[:, 1]
        
        clf_m_i = build_clinical_expert(n_p, n_n)
        clf_m_i.fit(X_tr_m[i_tr], y_tr[i_tr])
        tr_oof_m[i_val] = clf_m_i.predict_proba(X_tr_m[i_val])[:, 1]

    # Refit both experts on the whole training sub-split; these are the models
    # that score the calibration and test rows and that get exported.
    n_pos, n_neg = int(y_tr.sum()), int((y_tr==0).sum())
    clf_a = build_audio_expert(n_pos, n_neg).fit(X_tr_emb, y_tr)
    clf_m = build_clinical_expert(n_pos, n_neg).fit(X_tr_m, y_tr)

    # Supervisor inputs: [audio prob, clinical prob, preprocessed metadata].
    X_tr_stack  = np.column_stack([tr_oof_a, tr_oof_m, X_tr_m])
    X_cal_stack = np.column_stack([clf_a.predict_proba(X_cal_emb)[:,1], clf_m.predict_proba(X_cal_m)[:,1], X_cal_m])
    X_te_stack  = np.column_stack([clf_a.predict_proba(X_te_emb)[:,1], clf_m.predict_proba(X_te_m)[:,1], X_te_m])

    # Fit the supervisor, then fit a sigmoid calibrator on the held-out
    # calibration split around the already-fitted supervisor (cv="prefit").
    supervisor = build_supervisor(n_pos, n_neg).fit(X_tr_stack, y_tr)
    cal_supervisor = CalibratedClassifierCV(supervisor, cv="prefit", method="sigmoid").fit(X_cal_stack, y_cal)

    # Store this fold's calibrated test probabilities as out-of-fold scores.
    te_prob_stack = cal_supervisor.predict_proba(X_te_stack)[:, 1]
    oof_stack[te_idx] = te_prob_stack
    print(f"[*] Fold {fold_i+1} | AUC={roc_auc_score(y_te, te_prob_stack):.4f} | pAUC@90%={partial_auc(y_te, te_prob_stack, 0.90):.4f}")

    # EXPORTING PER-FOLD WEIGHTS
    fold_tag = f"fold{fold_i+1}"
    joblib.dump(clf_a, os.path.join(MODEL_DIR, f"{fold_tag}_audio_expert.pkl"))
    joblib.dump(clf_m, os.path.join(MODEL_DIR, f"{fold_tag}_clinical_expert.pkl"))
    joblib.dump(supervisor, os.path.join(MODEL_DIR, f"{fold_tag}_supervisor.pkl"))
    joblib.dump(cal_supervisor, os.path.join(MODEL_DIR, f"{fold_tag}_calibrated_supervisor.pkl"))
    joblib.dump(meta_prep, os.path.join(MODEL_DIR, f"{fold_tag}_meta_preprocessor.pkl"))

# ── 7. FINAL FULL-DATA MODEL DEPLOYMENT & EVALUATION ─────────────────────────
# Evaluate the pooled out-of-fold predictions from the CV loop and write the
# metrics (JSON + readable text) and plots to disk.
print("\n" + "="*60)
print("5. SAVING FINAL DEPLOYMENT MODEL & ARTIFACTS")
print("="*60)

# pred_stack holds, for each participant, the calibrated score from the fold
# in which that participant was in the test split.
participant_df["pred_stack"] = oof_stack
m_part = full_eval(participant_df["label"], participant_df["pred_stack"])

# Save evaluation metrics as JSON / text
with open(os.path.join(EVAL_DIR, "v10_metrics.json"), "w") as f: json.dump(m_part, f, indent=2)

# One-row summary table; missing entries fall back to 0 via .get defaults.
summary_rows = [{"Model": "V10 (Multi-Window + Participant-Level)", "ROC-AUC": f"{m_part.get('roc_auc', 0):.4f}", 
                 "pAUC@85%": f"{m_part.get('pauc_85', 0):.4f}", "pAUC@90%": f"{m_part.get('pauc_90', 0):.4f}",
                 "Spec@Sens=85%": f"{m_part.get('tuned_thresholds',{}).get('sens_85',{}).get('specificity',0):.4f}",
                 "Spec@Sens=90%": f"{m_part.get('tuned_thresholds',{}).get('sens_90',{}).get('specificity',0):.4f}",
                 "Spec@Sens=95%": f"{m_part.get('tuned_thresholds',{}).get('sens_95',{}).get('specificity',0):.4f}"}]
summary_df = pd.DataFrame(summary_rows)

with open(os.path.join(EVAL_DIR, "v10_metrics_readable.txt"), "w") as f:
    f.write("V10 FINAL EVALUATION\n" + "="*50 + "\n" + summary_df.to_string(index=False) + "\n\n")
    for sk, sv in m_part.get("tuned_thresholds", {}).items():
        f.write(f"[{sk}] threshold={sv['threshold']:.4f}  sens={sv['sensitivity']:.3f}  spec={sv['specificity']:.3f}  f1={sv['f1']:.3f}\n")

# Save Visual Plots
plot_curves(participant_df["label"], participant_df["pred_stack"], f"{FUSION_OUT}/plots/v10_participant", "V10 Participant-Level")
# The confusion matrix is drawn at the threshold tuned for sensitivity >= 90%.
best_t_90 = m_part["tuned_thresholds"]["sens_90"]["threshold"]
plot_confusion_matrix_sns(participant_df["label"], participant_df["pred_stack"], 
                          f"{FUSION_OUT}/plots/v10_participant_cm.png", threshold=best_t_90, title=f"V10 CM (Sens ≥ 90%)")

# Train ONE final deployable model from the full participant table.
# NOTE: the experts and the supervisor are fitted on four fifths of the data;
# the remaining fifth (first split of a 5-fold stratified split) is held out
# and used only to fit the sigmoid calibrator. The procedure mirrors a single
# iteration of the CV loop, without a test split.
print("[*] Training final full-data models for deployment...")
df_all, y_all, X_all_emb = participant_df.copy().reset_index(drop=True), participant_df["label"].values, participant_embs.copy()

cal_fold_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
tr_sub_idx_final, cal_idx_final = next(cal_fold_final.split(df_all, y_all))
df_tr_final, df_cal_final = df_all.iloc[tr_sub_idx_final].reset_index(drop=True), df_all.iloc[cal_idx_final].reset_index(drop=True)
X_tr_emb_final, X_cal_emb_final = X_all_emb[tr_sub_idx_final], X_all_emb[cal_idx_final]
y_tr_final, y_cal_final = df_tr_final["label"].values, df_cal_final["label"].values

# Preprocessor is fitted on the training portion only.
# X_all_m_final (and X_all_emb_clean below) are computed but not used again in
# this cell.
meta_prep_final = build_meta_preprocessor(num_cols_p, cat_cols)
X_tr_m_final, X_cal_m_final, X_all_m_final = meta_prep_final.fit_transform(df_tr_final), meta_prep_final.transform(df_cal_final), meta_prep_final.transform(df_all)

# Noise is applied to the training embeddings only.
X_tr_emb_final_noisy, X_cal_emb_final_clean, X_all_emb_clean = add_embedding_noise(X_tr_emb_final), X_cal_emb_final, X_all_emb
# Inner 4-fold CV: out-of-fold expert probabilities used to train the supervisor.
inner_cv_final = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
tr_oof_a_final, tr_oof_m_final = np.zeros(len(y_tr_final)), np.zeros(len(y_tr_final))

for i_tr, i_val in inner_cv_final.split(X_tr_emb_final_noisy, y_tr_final):
    n_p_i, n_n_i = int(y_tr_final[i_tr].sum()), int((y_tr_final[i_tr] == 0).sum())
    # As in the CV loop, the inner audio fit adds noise on top of the already
    # noised embeddings and validates on the noised embeddings.
    clf_a_i = build_audio_expert(n_p_i, n_n_i).fit(add_embedding_noise(X_tr_emb_final_noisy[i_tr]), y_tr_final[i_tr])
    tr_oof_a_final[i_val] = clf_a_i.predict_proba(X_tr_emb_final_noisy[i_val])[:, 1]
    clf_m_i = build_clinical_expert(n_p_i, n_n_i).fit(X_tr_m_final[i_tr], y_tr_final[i_tr])
    tr_oof_m_final[i_val] = clf_m_i.predict_proba(X_tr_m_final[i_val])[:, 1]

# Despite the "_all" suffix these are the class counts of the training
# portion (y_tr_final), not of the whole dataset.
n_pos_all, n_neg_all = int(y_tr_final.sum()), int((y_tr_final == 0).sum())
clf_a_final = build_audio_expert(n_pos_all, n_neg_all).fit(X_tr_emb_final_noisy, y_tr_final)
clf_m_final = build_clinical_expert(n_pos_all, n_neg_all).fit(X_tr_m_final, y_tr_final)

# Supervisor inputs: [audio prob, clinical prob, preprocessed metadata].
X_tr_stack_final  = np.column_stack([tr_oof_a_final, tr_oof_m_final, X_tr_m_final])
X_cal_stack_final = np.column_stack([clf_a_final.predict_proba(X_cal_emb_final_clean)[:, 1], clf_m_final.predict_proba(X_cal_m_final)[:, 1], X_cal_m_final])

# Fit the supervisor, then calibrate it (sigmoid, prefit) on the held-out fifth.
supervisor_final = build_supervisor(n_pos_all, n_neg_all).fit(X_tr_stack_final, y_tr_final)
cal_supervisor_final = CalibratedClassifierCV(supervisor_final, cv="prefit", method="sigmoid").fit(X_cal_stack_final, y_cal_final)

# EXPORTING MASTER DEPLOYMENT WEIGHTS
joblib.dump(meta_prep_final,        os.path.join(MODEL_DIR, "final_meta_preprocessor.pkl"))
joblib.dump(clf_a_final,            os.path.join(MODEL_DIR, "final_audio_expert.pkl"))
joblib.dump(clf_m_final,            os.path.join(MODEL_DIR, "final_clinical_expert.pkl"))
joblib.dump(supervisor_final,       os.path.join(MODEL_DIR, "final_supervisor.pkl"))
joblib.dump(cal_supervisor_final,   os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl"))

# Inference configuration saved next to the final model files.
# NOTE: target_sens_thresholds are the thresholds tuned on the pooled
# out-of-fold predictions (m_part), not on scores of the final model itself.
deploy_info = {
    "model_name": "TB SCREENING RANKER V10",
    "seed": SEED, "sr": SR, "win_samples": WIN_SAMPLES, "hop_samples": HOP_SAMPLES,
    "embed_dim": EMBED_DIM, "agg_funcs": AGG_FUNCS, "agg_dim": AGG_DIM,
    "num_cols_p": num_cols_p, "cat_cols": cat_cols,
    "target_sens_thresholds": {k: float(v["threshold"]) for k, v in m_part.get("tuned_thresholds", {}).items()}
}
with open(os.path.join(MODEL_DIR, "final_inference_config.json"), "w") as f: json.dump(deploy_info, f, indent=2)

# Summary table and per-participant table (including the pred_stack
# out-of-fold scores) as CSV.
summary_df.to_csv(f"{FUSION_OUT}/v10_summary.csv", index=False)
participant_df.to_csv(f"{FUSION_OUT}/v10_oof_predictions.csv", index=False)

# ZIP EVERYTHING FOR DOWNLOAD
# Every file under OUT_ROOT is added with a path relative to /kaggle/working.
zip_path = "/kaggle/working/outputs_v10.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUT_ROOT):
        for fn in files: zf.write(os.path.join(root, fn), os.path.relpath(os.path.join(root, fn), "/kaggle/working"))

print("\n✅ V10 PIPELINE COMPLETE & ZIPPED!")
print("\n--- DOWNLOAD LINKS ---")
# Notebook download links for the archive and the key individual artifacts.
display(FileLink(zip_path))
display(FileLink(f"{FUSION_OUT}/plots/v10_participant_roc.png"))
display(FileLink(f"{FUSION_OUT}/plots/v10_participant_cm.png"))
display(FileLink(os.path.join(EVAL_DIR, "v10_metrics_readable.txt")))
display(FileLink(os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl")))


1. LOADING & HARMONISING DATA

--------------------------------------------------
🔍 FEATURE AUDIT: METADATA & CLINICAL VARIABLES
--------------------------------------------------
[*] Total Available Features in Metadata (18):
['participant_id', 'sex', 'age', 'height', 'weight', 'reported_cough_dur', 'tb_prior', 'tb_prior_Pul', 'tb_prior_Extrapul', 'tb_prior_Unknown', 'hemoptysis', 'heart_rate', 'temperature', 'weight_loss', 'smoke_lweek', 'fever', 'night_sweats', 'label_raw']

[*] Excluded Features (Data Leakage/IDs) (2):
['participant_id', 'label_raw']

2. LOADING GOOGLE HeAR MODEL (GATED REPO FIX)


2026-02-22 15:25:33.671475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771773933.847503      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771773933.897366      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771773934.336809      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771773934.336861      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771773934.336864      55 computation_placer.cc:177] computation placer alr

Fetching 24 files:   0%|          | 0/24 [00:00<?, ?it/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

event_detector/event_detector_large/fing(…):   0%|          | 0.00/79.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

event_detector/event_detector_large/save(…):   0%|          | 0.00/4.89M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

event_detector/event_detector_large/vari(…):   0%|          | 0.00/12.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

event_detector/event_detector_large/kera(…):   0%|          | 0.00/760k [00:00<?, ?B/s]

variables.index:   0%|          | 0.00/5.08k [00:00<?, ?B/s]

event_detector/event_detector_small/kera(…):   0%|          | 0.00/644k [00:00<?, ?B/s]

event_detector/event_detector_small/save(…):   0%|          | 0.00/4.01M [00:00<?, ?B/s]

event_detector/event_detector_small/fing(…):   0%|          | 0.00/76.0 [00:00<?, ?B/s]

event_detector/event_detector_small/vari(…):   0%|          | 0.00/3.95M [00:00<?, ?B/s]

variables.index:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

event_detector/spectrogram_frontend/kera(…):   0%|          | 0.00/10.7k [00:00<?, ?B/s]

event_detector/spectrogram_frontend/fing(…):   0%|          | 0.00/55.0 [00:00<?, ?B/s]

event_detector/spectrogram_frontend/save(…):   0%|          | 0.00/340k [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/3.98M [00:00<?, ?B/s]

variables.index:   0%|          | 0.00/286 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/24.1k [00:00<?, ?B/s]

gitattributes:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

variables/variables.data-00000-of-00001:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

variables.index:   0%|          | 0.00/6.57k [00:00<?, ?B/s]



I0000 00:00:1771773952.298320      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15511 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0



[*] Pre-fetching multi-window HeAR embeddings...


Extracting embeddings:   0%|          | 0/9772 [00:00<?, ?it/s]

I0000 00:00:1771773974.183892     164 service.cc:152] XLA service 0x7d04c98ab8f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1771773974.183933     164 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1771773974.517430     164 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1771773976.758527     164 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



[*] Aggregating to participant level...

[*] Final USED Numerical Features (8):
['age', 'height', 'weight', 'reported_cough_dur', 'heart_rate', 'temperature', 'n_recordings', 'n_cough_windows_total']

[*] Final USED Categorical Features (10):
['sex', 'tb_prior', 'tb_prior_Pul', 'tb_prior_Extrapul', 'tb_prior_Unknown', 'hemoptysis', 'weight_loss', 'smoke_lweek', 'fever', 'night_sweats']
--------------------------------------------------

4. STARTING V10 TRAINING & WEIGHT EXPORT

--- FOLD 1/5 ---
[*] Fold 1 | AUC=0.8364 | pAUC@90%=0.9681

--- FOLD 2/5 ---
[*] Fold 2 | AUC=0.8297 | pAUC@90%=0.9682

--- FOLD 3/5 ---
[*] Fold 3 | AUC=0.7926 | pAUC@90%=0.9815

--- FOLD 4/5 ---
[*] Fold 4 | AUC=0.7877 | pAUC@90%=0.9619

--- FOLD 5/5 ---
[*] Fold 5 | AUC=0.7427 | pAUC@90%=0.9480

5. SAVING FINAL DEPLOYMENT MODEL & ARTIFACTS
[*] Training final full-data models for deployment...

✅ V10 PIPELINE COMPLETE & ZIPPED!

--- DOWNLOAD LINKS ---


In [3]:
# ── EXPORTING HeAR MODEL FOR LOCAL DEPLOYMENT ──────────────────────────────
# Downloads the raw google/hear repository files to the working directory,
# zips them, and shows a download link for offline use.
import os, shutil
from huggingface_hub import snapshot_download
from IPython.display import FileLink, display

banner = "=" * 60
print("\n" + banner)
print("EXPORTING GOOGLE HeAR MODEL (RAW FILES)")
print(banner)

hear_save_dir = "/kaggle/working/hear_saved_model"
hear_zip_path = "/kaggle/working/hear_model_offline.zip"
os.makedirs(hear_save_dir, exist_ok=True)

# Step 1: fetch the repository snapshot straight to disk, skipping markdown
# and git attribute files.
print("[*] Downloading raw model files directly to disk...")
skip_patterns = ["*.md", ".gitattributes"]
snapshot_download(repo_id="google/hear", local_dir=hear_save_dir, ignore_patterns=skip_patterns)

# Step 2: archive the folder; make_archive appends the ".zip" suffix itself,
# so it is given the path without the extension.
print("[*] Zipping the model files for download...")
archive_base = os.path.splitext(hear_zip_path)[0]
shutil.make_archive(archive_base, 'zip', hear_save_dir)

# Step 3: clickable download link.
print("\n✅ HeAR Model Downloaded and Zipped for Offline Use!")
display(FileLink("hear_model_offline.zip"))


EXPORTING GOOGLE HeAR MODEL (RAW FILES)
[*] Downloading raw model files directly to disk...


Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]

[*] Zipping the model files for download...

✅ HeAR Model Downloaded and Zipped for Offline Use!


In [None]:
# Quietly upgrade keras and keras-hub to their latest releases.
# Run this cell, then immediately go to Run -> Restart Session in the top menu
# (presumably so the upgraded packages are the ones imported afterwards).
!pip install --upgrade -q keras keras-hub

In [None]:
# Demo cell: load the exported V10 models and build a small simulated
# "waiting room" to exercise the ranking + LLM explanation flow.
import os

# Must be set to JAX BEFORE importing keras
os.environ["KERAS_BACKEND"] = "jax"
# Cap JAX memory so it doesn't crash Pandas/LightGBM (Adjusted to 0.75 for safety)
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.75" 

import joblib
import numpy as np
import pandas as pd
import keras
import keras_hub

# 1. CRITICAL: Force Keras to use half-precision. 
# This shrinks MedGemma 4B from 16GB to 8GB so it fits on a Kaggle T4!
keras.config.set_floatx("float16")

# ============================================================================
# 2. SETUP: LOAD THE V10 ML MODELS
# ============================================================================
print("Loading V10 ML Models...")
MODEL_DIR = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/models"

# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# from a source you trust.
meta_prep      = joblib.load(os.path.join(MODEL_DIR, "final_meta_preprocessor.pkl"))
clf_a          = joblib.load(os.path.join(MODEL_DIR, "final_audio_expert.pkl"))
clf_m          = joblib.load(os.path.join(MODEL_DIR, "final_clinical_expert.pkl"))
cal_supervisor = joblib.load(os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl"))

# ============================================================================
# 3. SIMULATE A WAITING ROOM (Standalone Version)
# ============================================================================
print("Simulating Clinic Waiting Room...")
# Load actual clinical data from your V10 output CSV
csv_path = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/multiwindow_participant/v10_oof_predictions.csv"
df_all = pd.read_csv(csv_path)
# Ten participants drawn with a fixed seed so the demo is repeatable.
df_waiting_room = df_all.sample(10, random_state=99).reset_index(drop=True)

# Generate dummy HeAR embeddings (2560-dim) just for this UI/Agent test
# These are random values (one row per sampled participant), so the audio
# scores produced from them carry no clinical meaning.
X_audio_waiting_room = np.random.randn(10, 2560).astype(np.float32)

# ============================================================================
# 4. THE RANKING ENGINE
# ============================================================================
def rank_patients(df_meta, X_audio):
    """Score patients with the stacked model and return them as a ranked queue.

    Args:
        df_meta: DataFrame with the metadata columns expected by meta_prep.
        X_audio: audio feature matrix, one row per row of df_meta.

    Returns:
        DataFrame with the available display columns plus audio_risk,
        clinic_risk and final_triage_score (all rounded to 3 decimals),
        sorted by final_triage_score descending and indexed from 1.
    """
    meta_features = meta_prep.transform(df_meta)

    audio_prob = clf_a.predict_proba(X_audio)[:, 1]
    clinic_prob = clf_m.predict_proba(meta_features)[:, 1]

    # Same feature layout the supervisor was trained on:
    # [audio prob, clinical prob, preprocessed metadata].
    stacked = np.column_stack([audio_prob, clinic_prob, meta_features])
    triage_prob = cal_supervisor.predict_proba(stacked)[:, 1]

    wanted = ['participant_id', 'age', 'sex', 'weight_loss', 'night_sweats']
    shown = [col for col in wanted if col in df_meta.columns]

    queue = df_meta[shown].copy()
    queue['audio_risk'] = np.round(audio_prob, 3)
    queue['clinic_risk'] = np.round(clinic_prob, 3)
    queue['final_triage_score'] = np.round(triage_prob, 3)

    queue = queue.sort_values(by='final_triage_score', ascending=False)
    queue = queue.reset_index(drop=True)
    # Shift to a 1-based index so it reads as a rank.
    queue.index = queue.index + 1
    return queue

# Rank the simulated waiting room and print the score columns.
ranked_queue = rank_patients(df_waiting_room, X_audio_waiting_room)
print("\n--- SORTED TRIAGE QUEUE ---")
print(ranked_queue[['participant_id', 'audio_risk', 'clinic_risk', 'final_triage_score']])

# ============================================================================
# 5. LOAD MEDGEMMA 4B (Memory-Optimized)
# ============================================================================
print("\nLoading MedGemma 4B into GPU... (This takes a minute)")
MEDGEMMA_PATH = "/kaggle/input/models/keras/medgemma/keras/medgemma_4b/1"

# Loading is best-effort: on any failure the error is printed and medgemma is
# set to None, which generate_clinical_justification checks before generating.
try:
    # dtype="float16" combined with set_floatx prevents the OOM crash
    medgemma = keras_hub.models.CausalLM.from_preset(MEDGEMMA_PATH, dtype="float16")
    medgemma.compile(sampler="greedy") 
    print("✓ MedGemma 4B Loaded Successfully!")
except Exception as e:
    print(f"⚠ Warning: MedGemma failed to load.\nError: {e}")
    medgemma = None 

# ============================================================================
# 6. EXECUTE THE AGENTIC WORKFLOW
# ============================================================================
def generate_clinical_justification(patient_row):
    """Ask MedGemma for a short justification of a patient's triage priority.

    Args:
        patient_row: mapping/Series for one ranked patient. ``audio_risk`` and
            ``clinic_risk`` are required; ``age``, ``sex``, ``weight_loss``
            and ``night_sweats`` default to 'Unknown' when absent.

    Returns:
        The generated text with the prompt removed and whitespace stripped,
        or "LLM Not Loaded." when the model is unavailable.
    """
    age = patient_row.get('age', 'Unknown')
    sex = patient_row.get('sex', 'Unknown')
    weight_loss = patient_row.get('weight_loss', 'Unknown')
    night_sweats = patient_row.get('night_sweats', 'Unknown')
    audio_risk = patient_row['audio_risk']
    clinic_risk = patient_row['clinic_risk']

    prompt_parts = [
        "You are an expert AI Triage Assistant in a tuberculosis clinic. ",
        "You have limited GeneXpert tests available today.\n\n",
        "PATIENT DATA:\n",
        f"- Demographics: {age} year old {sex}.\n",
        f"- Symptoms: Weight loss is {weight_loss}, Night Sweats is {night_sweats}.\n",
        f"- AI Assessment: Acoustic Cough Risk Score is {audio_risk} out of 1.0. ",
        f"Clinical Risk Score is {clinic_risk} out of 1.0.\n\n",
        "TASK: Write a concise, professional 2-sentence clinical justification explaining why this patient ",
        "has been assigned a high priority triage rank for a GeneXpert test today.\n\n",
        "JUSTIFICATION:\n",
    ]
    prompt = "".join(prompt_parts)

    if medgemma is None:
        return "LLM Not Loaded."

    response = medgemma.generate(prompt, max_length=256)
    # Drop the echoed prompt so only the newly generated text remains.
    return response.replace(prompt, "").strip()

# Generate and print an LLM justification for the three highest-ranked
# patients; `rank` is the 1-based index assigned by rank_patients.
print("\n--- MEDGEMMA CLINICAL EXPLANATIONS FOR TOP 3 PATIENTS ---")
for rank, patient in ranked_queue.head(3).iterrows():
    print(f"\nEvaluating Rank #{rank} (ID: {patient.get('participant_id', 'Unknown')})")
    print("Generating LLM Justification...")
    explanation = generate_clinical_justification(patient)
    print(f"MEDGEMMA OUTPUT:\n{explanation}")

In [None]:
# Clean reinstall of the Keras stack.
# 1. Remove the pre-installed keras-hub and keras-nlp packages
!pip uninstall -y -q keras-hub keras-nlp

# 2. Install the latest keras-hub, then the latest keras
!pip install -q -U keras-hub
!pip install -q -U keras

In [None]:
# Print the keras_hub version that is actually importable in this session.
import keras_hub
print(keras_hub.__version__)

In [1]:
# Demo cell (second revision): load the exported V10 models and build a small
# simulated "waiting room" for the ranking + LLM explanation flow.
import os

# 1. ENVIRONMENT SETUP
# Must be set BEFORE importing keras
os.environ["KERAS_BACKEND"] = "jax"
# Cap JAX memory so it doesn't crash LightGBM on the P100
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.75" 

import joblib
import numpy as np
import pandas as pd
import keras
import keras_hub

# Force Keras to use half-precision to fit MedGemma 4B into VRAM
keras.config.set_floatx("float16")

# ============================================================================
# 2. LOAD THE V10 ML MODELS
# ============================================================================
print("Loading V10 ML Models...")
MODEL_DIR = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/models"

# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# from a source you trust.
meta_prep      = joblib.load(os.path.join(MODEL_DIR, "final_meta_preprocessor.pkl"))
clf_a          = joblib.load(os.path.join(MODEL_DIR, "final_audio_expert.pkl"))
clf_m          = joblib.load(os.path.join(MODEL_DIR, "final_clinical_expert.pkl"))
cal_supervisor = joblib.load(os.path.join(MODEL_DIR, "final_calibrated_supervisor.pkl"))

# ============================================================================
# 3. SIMULATE A WAITING ROOM
# ============================================================================
print("Simulating Clinic Waiting Room...")
csv_path = "/kaggle/input/datasets/jroot888/medgemma-trained-weights/outputs_v10/multiwindow_participant/v10_oof_predictions.csv"
df_all = pd.read_csv(csv_path)
# Ten participants drawn with a fixed seed so the demo is repeatable.
df_waiting_room = df_all.sample(10, random_state=99).reset_index(drop=True)

# Dummy HeAR embeddings (2560-dim)
# Random values, one row per sampled participant: audio scores derived from
# them carry no clinical meaning.
X_audio_waiting_room = np.random.randn(10, 2560).astype(np.float32)

# ============================================================================
# 4. THE RANKING ENGINE
# ============================================================================
def rank_patients(df_meta, X_audio):
    """Score each patient and return the triage queue, highest risk first.

    The audio expert scores ``X_audio``; the clinical expert scores the
    preprocessed metadata; the calibrated supervisor scores the column-wise
    stack of both probabilities followed by the preprocessed metadata.

    Args:
        df_meta: Patient metadata frame, fed to ``meta_prep.transform``.
        X_audio: Feature matrix for the audio expert. It needs one row per
            row of ``df_meta`` because the scores are stacked side by side.

    Returns:
        A new DataFrame holding whichever of ``participant_id``, ``age``,
        ``sex``, ``weight_loss`` and ``night_sweats`` exist in ``df_meta``,
        plus ``audio_risk``, ``clinic_risk`` and ``final_triage_score``
        (each rounded to 3 decimals), sorted by ``final_triage_score``
        descending and indexed from 1.
    """
    meta_features = meta_prep.transform(df_meta)

    # Positive-class probability from each expert.
    audio_p = clf_a.predict_proba(X_audio)[:, 1]
    clinic_p = clf_m.predict_proba(meta_features)[:, 1]

    # Supervisor input layout: [audio prob, clinical prob, metadata features].
    triage_p = cal_supervisor.predict_proba(
        np.column_stack([audio_p, clinic_p, meta_features])
    )[:, 1]

    # Keep only the display columns this particular frame actually carries.
    shown = [
        col
        for col in ('participant_id', 'age', 'sex', 'weight_loss', 'night_sweats')
        if col in df_meta.columns
    ]

    queue = df_meta[shown].assign(
        audio_risk=np.round(audio_p, 3),
        clinic_risk=np.round(clinic_p, 3),
        final_triage_score=np.round(triage_p, 3),
    )
    queue = queue.sort_values(
        by='final_triage_score', ascending=False, ignore_index=True
    )
    # Shift the fresh 0-based index so it reads as a 1-based rank.
    queue.index += 1
    return queue

# Rank the simulated cohort, then show only the ID and the three risk scores.
ranked_queue = rank_patients(df_waiting_room, X_audio_waiting_room)
print("\n--- SORTED TRIAGE QUEUE ---")
print(ranked_queue.loc[:, ['participant_id', 'audio_risk', 'clinic_risk', 'final_triage_score']])

# ============================================================================
# 5. LOAD MEDGEMMA 4B (Optimized Sampler)
# ============================================================================
print("\nLoading MedGemma 4B into GPU... (This takes a minute)")
# Preset location handed to `from_preset` below.
MEDGEMMA_PATH = "/kaggle/input/models/keras/medgemma/keras/medgemma_4b/1"

# Best-effort load: any exception raised while loading or compiling is
# printed and leaves `medgemma` set to None, which the generation step
# checks before running.
# NOTE(review): a successful load does not mean generation works -- the
# recorded output prints the success message and then fails at the first
# `generate` call with an XlaRuntimeError (ALG_DOT_F16_F16_F32) on the P100.
try:
    # The model is requested in float16.
    medgemma = keras_hub.models.CausalLM.from_preset(MEDGEMMA_PATH, dtype="float16")
    
    # BEST ACCURACY SETUP: Top-P sampler stops the repeating loops
    # (top-p sampling with p=0.9 at temperature 0.2).
    sampler = keras_hub.samplers.TopPSampler(p=0.9, temperature=0.2)
    medgemma.compile(sampler=sampler) 
    print("✓ MedGemma 4B Loaded Successfully!")
except Exception as e:
    print(f"⚠ Warning: MedGemma failed to load.\nError: {e}")
    medgemma = None 

# ============================================================================
# 6. EXECUTE BILINGUAL AGENTIC WORKFLOW (P100 Safe - Sequential)
# ============================================================================
def _prompt_field(patient_row, key):
    """Return ``patient_row[key]`` for display, or 'Unknown' if absent or null."""
    value = patient_row.get(key, 'Unknown')
    # A present-but-missing value (None / NaN / pd.NA) would otherwise be
    # rendered into the prompt as the literal text "None" or "nan".
    return 'Unknown' if pd.isna(value) else value


def build_bilingual_prompt(patient_row):
    """Build the one-shot English + Hindi justification prompt for a patient.

    Args:
        patient_row: Mapping-like row (a dict or pandas Series) supporting
            ``.get`` and item access. ``age``, ``sex``, ``weight_loss`` and
            ``night_sweats`` are optional and shown as 'Unknown' when absent
            or null. ``audio_risk`` and ``clinic_risk`` are required numbers.

    Returns:
        The prompt text, ending with the ``REAL OUTPUT:`` cue on its own line.

    Raises:
        KeyError: If ``audio_risk`` or ``clinic_risk`` is missing.
    """
    age = _prompt_field(patient_row, 'age')
    sex = _prompt_field(patient_row, 'sex')
    weight_loss = _prompt_field(patient_row, 'weight_loss')
    night_sweats = _prompt_field(patient_row, 'night_sweats')

    # Always three decimals, matching the worked example in the prompt
    # (e.g. "0.850") rather than a variable-width repr such as "0.85".
    audio_risk = f"{float(patient_row['audio_risk']):.3f}"
    clinic_risk = f"{float(patient_row['clinic_risk']):.3f}"

    prompt = f"""You are an expert AI Triage Assistant in a tuberculosis clinic. Write a strict, 2-sentence clinical justification for GeneXpert testing in English, followed immediately by its Hindi translation.

EXAMPLE INPUT:
- Demographics: 45 year old Male.
- Symptoms: Weight loss is Yes, Night Sweats is No.
- AI Assessment: Acoustic Cough Risk Score is 0.850. Clinical Risk Score is 0.720.

EXAMPLE OUTPUT:
English: This 45-year-old male presents with weight loss and highly elevated risk scores indicating probable tuberculosis. Immediate GeneXpert testing is prioritized to confirm active pulmonary infection.
Hindi: यह 45 वर्षीय पुरुष वजन कम होने और अत्यधिक जोखिम स्कोर के साथ प्रस्तुत होता है जो संभावित तपेदिक का संकेत देता है। सक्रिय फुफ्फुसीय संक्रमण की पुष्टि करने के लिए तत्काल जीनएक्सपर्ट परीक्षण को प्राथमिकता दी जाती है।

REAL INPUT:
- Demographics: {age} year old {sex}.
- Symptoms: Weight loss is {weight_loss}, Night Sweats is {night_sweats}.
- AI Assessment: Acoustic Cough Risk Score is {audio_risk}. Clinical Risk Score is {clinic_risk}.

REAL OUTPUT:
"""
    return prompt

if medgemma is not None:
    print("\n--- GENERATING BILINGUAL EXPLANATIONS (SEQUENTIAL) ---")

    # Only the three highest-ranked patients get an LLM justification.
    top_patients = ranked_queue.head(3)

    # The queue is indexed from 1 by rank, so `idx` doubles as the rank label.
    for idx, patient in top_patients.iterrows():
        print(f"\nEvaluating Rank #{idx} (ID: {patient.get('participant_id', 'Unknown')})")
        print("Generating LLM Justification...")

        # 1. Build the prompt for just this ONE patient
        prompt = build_bilingual_prompt(patient)

        # 2. Generate sequentially (Safe for P100 architecture)
        # `strip_prompt=True` makes the model return only the newly generated
        # text. The earlier `response.replace(prompt, "")` silently left the
        # whole prompt in place whenever the decoded text did not match the
        # prompt string byte-for-byte.
        # NOTE(review): the recorded run fails on this call with
        # `XlaRuntimeError: ... ALG_DOT_F16_F16_F32` on the Tesla P100 --
        # TODO confirm a dtype/backend combination this GPU supports.
        response = medgemma.generate(prompt, max_length=512, strip_prompt=True)

        # 3. Clean and print
        clean_output = response.strip()
        print(f"MEDGEMMA OUTPUT:\n{clean_output}")
else:
    print("LLM Not Loaded. Cannot execute agent workflow.")

2026-02-21 17:55:09.319068: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771696509.340156     725 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771696509.346909     725 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771696509.363514     725 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771696509.363535     725 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771696509.363537     725 computation_placer.cc:177] computation placer alr

Loading V10 ML Models...
Simulating Clinic Waiting Room...

--- SORTED TRIAGE QUEUE ---
   participant_id  audio_risk  clinic_risk  final_triage_score
1    CODA_TB_0575       0.091        0.847               0.590
2    CODA_TB_0287       0.103        0.756               0.547
3    CODA_TB_0492       0.096        0.622               0.381
4    CODA_TB_1056       0.057        0.191               0.084
5    CODA_TB_0590       0.063        0.172               0.069
6    CODA_TB_0807       0.052        0.206               0.067
7    CODA_TB_0156       0.053        0.079               0.062
8    CODA_TB_0889       0.080        0.097               0.058
9    CODA_TB_0133       0.116        0.232               0.057
10   CODA_TB_1107       0.086        0.069               0.052

Loading MedGemma 4B into GPU... (This takes a minute)


I0000 00:00:1771696521.281371     725 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3297 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


✓ MedGemma 4B Loaded Successfully!

--- GENERATING BILINGUAL EXPLANATIONS (SEQUENTIAL) ---

Evaluating Rank #1 (ID: CODA_TB_0575)
Generating LLM Justification...


XlaRuntimeError: UNIMPLEMENTED: Unsupported algorithm on the current device(s): ALG_DOT_F16_F16_F32