Baseline model

a. CatBoost for tabular data

In [None]:
import pandas as pd
import numpy as np

# Mount Drive
# Colab-only: exposes Google Drive under /content/drive so the project
# folder below can be read and written like a local directory.
from google.colab import drive
drive.mount('/content/drive')

# Project folder as a plain string (note the trailing slash).
save_dir = "/content/drive/MyDrive/CIS_678_final_project/"

# Same project folder as a Path; read_tabular/read_text resolve file names
# against it.
from pathlib import Path
DATA = Path('/content/drive/MyDrive/CIS_678_final_project')

import pandas as pd
def read_tabular(fname):
    """Read a tabular CSV from the DATA folder.

    Column "t" is parsed as a timestamp and the literal string "NULL" is
    treated as a missing value.
    """
    csv_path = DATA / fname
    return pd.read_csv(csv_path, parse_dates=["t"], na_values=["NULL"])

def read_text(fname):
    """Read a notes CSV from the DATA folder, treating "NULL" as missing."""
    csv_path = DATA / fname
    return pd.read_csv(csv_path, na_values=["NULL"])

from sklearn.metrics import roc_auc_score, average_precision_score
def report_metrics(y_true, p):
    """Return {"AUROC": ..., "AUPRC": ...} for scores p against labels y_true."""
    scorers = (("AUROC", roc_auc_score), ("AUPRC", average_precision_score))
    return {name: scorer(y_true, p) for name, scorer in scorers}


In [None]:
import numpy as np
import pandas as pd


# Load the 6-hour "rich" feature tables for the train and test splits.
train_tab = read_tabular("features6h_rich_train.csv")
test_tab = read_tabular("features6h_rich_test.csv")


# counts of 0 and 1
# Label distribution in the training split, ordered by label value.
y_tr = train_tab["label6h"].astype(int)
counts_tr = y_tr.value_counts().sort_index()
print(counts_tr)

# Same distribution for the test split.
y_te = test_tab["label6h"].astype(int)
counts_te = y_te.value_counts().sort_index()
print(counts_te)

In [None]:
# Display the training table (notebook cell output).
train_tab

In [None]:
# Peek at rows 15-19 and the first 15 columns.
train_tab.iloc[15:20, :15]

In [None]:
# Keep the training table's column index in a variable and display it.
train_tab_columns = train_tab.columns
train_tab_columns

In [None]:
!pip install -q catboost
import numpy as np, pandas as pd, os, json, hashlib
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

# Absolute paths of the train/test feature CSVs on the mounted Drive.
TRAIN_CSV = "/content/drive/MyDrive/CIS_678_final_project/features6h_rich_train.csv"
TEST_CSV  = "/content/drive/MyDrive/CIS_678_final_project/features6h_rich_test.csv"

# Re-read both tables for this cell. Unlike read_tabular, no
# na_values=["NULL"] override is passed here.
train_tab = pd.read_csv(TRAIN_CSV, parse_dates=["t"], low_memory=False)
test_tab  = pd.read_csv(TEST_CSV,  parse_dates=["t"], low_memory=False)

# Column roles: stay identifiers, timestamp column, and target column.
GROUP = ["subject_id","hadm_id","icustay_id"]
TIME  = "t"
LABEL = "label6h"

# Safety: no train/test overlap
# Each identifier column is checked on its own: no value of subject_id,
# hadm_id or icustay_id may occur in both splits.
for key in GROUP:
    overlap = set(train_tab[key]).intersection(set(test_tab[key]))
    assert len(overlap) == 0, f"Train/Test leakage via {key}: {len(overlap)} overlaps"

# cast categoricals to string
# Only columns that actually exist are cast; the same two names are later
# collected in CAT_KEEP and handed to CatBoost as categorical features.
for c in ["sex", "dbsource"]:
    if c in train_tab.columns: train_tab[c] = train_tab[c].astype(str)
    if c in test_tab.columns:  test_tab[c]  = test_tab[c].astype(str)

# speed: float32 and replace inf
# Identifier, timestamp and label columns keep their original dtype:
# float32 represents integers exactly only up to 2**24, and float-typed IDs
# stringify as "123.0", which changes any key or hash built from them
# (e.g. the row ids written next to the predictions).
_KEEP_DTYPE = set(GROUP + [TIME, LABEL])
for df in (train_tab, test_tab):
    num_cols_df = [
        c for c in df.columns
        if c not in _KEEP_DTYPE and pd.api.types.is_numeric_dtype(df[c])
    ]
    df[num_cols_df] = df[num_cols_df].astype("float32")
    # +/-inf become NaN so they are treated as ordinary missing values.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Columns never used as model inputs: identifiers, timestamp, target, and a
# "split" column if one is present.
EXCLUDE = set(GROUP + [TIME, LABEL, "split"])
# Categorical columns that exist in the training table.
CAT_KEEP = [c for c in ["sex","dbsource"] if c in train_tab.columns]

# True when the training-table column has a numeric dtype.
def is_num(col): return pd.api.types.is_numeric_dtype(train_tab[col])
# Numeric model inputs, in training-table column order.
num_cols = [c for c in train_tab.columns if c not in EXCLUDE and is_num(c) and c not in CAT_KEEP]

# order-preserving dedup; ensure test has same columns
def dedup(seq):
    """Return the items of seq as a list without repeats, keeping first-seen order."""
    first_seen = {}
    for item in seq:
        first_seen.setdefault(item, None)
    return list(first_seen)
# Final feature list: numeric columns followed by the categoricals,
# restricted to columns the test table also has.
FEATURES = [c for c in dedup(num_cols + CAT_KEEP) if c in test_tab.columns]

def cat_idx(all_cols, cats):
    """Return the positions in all_cols whose column name appears in cats."""
    wanted = frozenset(cats)
    positions = []
    for pos, col in enumerate(all_cols):
        if col in wanted:
            positions.append(pos)
    return positions

# Positions of the categorical columns inside FEATURES (passed to Pool below).
cat_features_idx = cat_idx(FEATURES, [c for c in CAT_KEEP if c in FEATURES])

# Feature frames and integer label arrays for train (X, y) and test (Xt, y_test).
X      = train_tab[FEATURES]
Xt     = test_tab[FEATURES]
y      = train_tab[LABEL].astype(int).to_numpy()
y_test = test_tab[LABEL].astype(int).to_numpy()


# Time-ordered folds: sort row positions by timestamp and cut that order
# into N_CHUNKS contiguous pieces; fold_ids[i] is the chunk of row i
# (0 = earliest timestamps).
tvals = pd.to_datetime(train_tab[TIME]).to_numpy()
order = np.argsort(tvals)
N_CHUNKS = 5
fold_ids = np.full(len(train_tab), -1, int)
for k, idx in enumerate(np.array_split(order, N_CHUNKS)):
    fold_ids[idx] = k
# Admission id per row; used below to keep training and validation
# admissions disjoint.
groups = train_tab["hadm_id"].to_numpy()

# Fraction of positive labels in train and the negatives-to-positives ratio
# used as the positive-class weight; max(...) guards against division by
# zero when there are no positives.
pos_rate = float(y.mean())
scale_pos_weight = (1.0 - pos_rate) / max(pos_rate, 1e-12)

# Try GPU; fallback to CPU if it errors later
# NOTE(review): no automatic fallback is implemented in this cell; USE_GPU
# has to be flipped by hand if GPU training fails.
USE_GPU = True  # set False if you want to force CPU

# Shared CatBoost settings for the per-chunk models; the full-data refit
# further down overrides use_best_model/od_type.
cb_params = dict(
    task_type=("GPU" if USE_GPU else "CPU"),
    devices="0" if USE_GPU else None,
    loss_function="Logloss",
    eval_metric="PRAUC",
    depth=6,
    learning_rate=0.05,
    iterations=2000,
    l2_leaf_reg=3.0,
    # class 0 keeps weight 1, class 1 is up-weighted by the imbalance ratio
    class_weights=[1.0, scale_pos_weight],
    bootstrap_type="Bernoulli",
    subsample=0.6,
    # rsm=0.8,
    leaf_estimation_iterations=4,
    # keep the best iteration on the eval set; od_* configure CatBoost's
    # overfitting detector (see the CatBoost parameter reference)
    use_best_model=True,
    od_type="Iter",
    od_wait=120,
    random_seed=42,
    verbose=False,
    allow_writing_files=False,
)

# If you want RSM for CPU runs, add it conditionally:
if not USE_GPU:
    cb_params["rsm"] = 0.8

def metrics(y_true, p):
    """Return AUROC and AUPRC of scores p against labels y_true as a dict."""
    auroc = roc_auc_score(y_true, p)
    auprc = average_precision_score(y_true, p)
    return {"AUROC": auroc, "AUPRC": auprc}

# Out-of-fold predictions; rows that are never validated (chunk 0 and any
# skipped chunk) stay NaN.
oof = np.full(len(train_tab), np.nan, float)

# Forward-chaining validation: chunk k is scored by a model trained only on
# earlier chunks (fold_ids < k), so chunk 0 is never a validation chunk.
for k in range(1, N_CHUNKS):
    va_idx = np.where(fold_ids == k)[0]
    tr_pool_idx = np.where(fold_ids <  k)[0]

    # HADM-disjoint
    # Drop training rows whose admission also appears in the validation chunk.
    va_hadm = set(groups[va_idx])
    tr_idx = tr_pool_idx[~np.isin(groups[tr_pool_idx], list(va_hadm))]

    if len(tr_idx)==0 or len(va_idx)==0:
        print(f"[Base] skip chunk {k}: tr={len(tr_idx)} va={len(va_idx)}")
        continue

    # The validation chunk doubles as eval_set for best-iteration selection
    # and early stopping.
    cb = CatBoostClassifier(**cb_params)
    cb.fit(
        Pool(X.iloc[tr_idx], y[tr_idx], cat_features=cat_features_idx),
        eval_set=Pool(X.iloc[va_idx], y[va_idx], cat_features=cat_features_idx)
    )
    oof[va_idx] = cb.predict_proba(X.iloc[va_idx])[:,1]
    print(f"[Base] chunk {k}:", metrics(y[va_idx], oof[va_idx]))

# Pooled metrics over every row that received an out-of-fold prediction.
used = ~np.isnan(oof)
print("[Base] OOF (valid rows):", metrics(y[used], oof[used]))


def sweep_thresholds(y_true, p, grid=None):
    """Tabulate precision, recall and F1 of thresholded scores over a grid.

    Parameters
    ----------
    y_true : array-like
        Ground-truth 0/1 labels.
    p : array-like
        Scores; a row is predicted positive when ``p >= threshold``.
    grid : iterable of float, optional
        Thresholds to evaluate. When omitted (or empty) 19 evenly spaced
        values from 0.05 to 0.95 are used.

    Returns
    -------
    pandas.DataFrame
        One row per threshold, in grid order, with columns ``threshold``,
        ``precision``, ``recall`` and ``f1``.
    """
    # `grid or default` raises ValueError for a NumPy array with more than
    # one element (ambiguous truth value), so test for "not provided"
    # explicitly. An empty grid still falls back to the default.
    if grid is None or np.size(grid) == 0:
        grid = np.linspace(0.05, 0.95, 19)
    # Accept plain sequences as well as arrays/Series.
    p = np.asarray(p)
    rows = []
    for thr in grid:
        yhat = (p >= thr).astype(int)
        rows.append({
            "threshold": float(thr),
            "precision": precision_score(y_true, yhat, zero_division=0),
            "recall":    recall_score(y_true, yhat, zero_division=0),
            "f1":        f1_score(y_true, yhat, zero_division=0)
        })
    return pd.DataFrame(rows)

# Sweep thresholds on the out-of-fold predictions and keep the one with the
# highest F1 (ties broken by recall, then precision).
thr_table = sweep_thresholds(y[used], oof[used])
best_row = thr_table.sort_values(["f1","recall","precision"], ascending=False).iloc[0]
BEST_THR = float(best_row["threshold"])
print("\n[Base] OOF threshold table:\n", thr_table)
print(f"\nSelected THRESH={BEST_THR:.3f} | "
      f"P={best_row['precision']:.3f} R={best_row['recall']:.3f} F1={best_row['f1']:.3f}")

# Train full model and score TEST
# Refit on all training rows. No eval_set is passed here, so
# use_best_model is disabled and od_type is cleared for this fit.
cb_full = CatBoostClassifier(**{**cb_params, "use_best_model": False, "od_type": None, "verbose": False})
cb_full.fit(Pool(X, y, cat_features=cat_features_idx))
p_test = cb_full.predict_proba(Xt)[:,1]

# Threshold-free metrics on the test split.
print("\nTEST prob-metrics:", {"AUROC": roc_auc_score(y_test, p_test),
                              "AUPRC": average_precision_score(y_test, p_test)})
# Confusion matrix and P/R/F1 at the threshold chosen on OOF predictions
# (the test labels are not used to pick it).
yhat = (p_test >= BEST_THR).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, yhat, labels=[0,1]).ravel()
print("\nTEST @ best OOF threshold:",
      {"threshold": BEST_THR, "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp),
       "precision": precision_score(y_test, yhat, zero_division=0),
       "recall":    recall_score(y_test, yhat, zero_division=0),
       "f1":        f1_score(y_test, yhat, zero_division=0)})

In [None]:
# SAVE for single-pass CatBoost run
import os, json, hashlib
import numpy as np, pandas as pd

# Output folder for every CatBoost artifact written by this cell; created
# if it does not exist yet.
OUTDIR = "/content/drive/MyDrive/CIS_678_final_project/catboost_rich6h"
os.makedirs(OUTDIR, exist_ok=True)

# Target column name (same value as in the training cell).
LABEL = "label6h"

def mk_row_id(df: pd.DataFrame) -> pd.Series:
    """Build a deterministic per-row identifier from the stay keys and timestamp.

    The id is the MD5 hex digest of ``"subject_id|hadm_id|icustay_id|t"``.

    Identifier columns stored as floats holding whole numbers (for example
    after a float32 cast or a CSV round-trip) are rendered as integers
    first, so ``123.0`` and ``123`` produce the same id. Without this, ids
    computed from float-typed and int-typed copies of the same rows never
    match. Missing values in such a float column render as ``"<NA>"``.

    Parameters
    ----------
    df : DataFrame with columns subject_id, hadm_id, icustay_id and t.

    Returns
    -------
    Series of 32-character hex strings aligned with ``df``'s index.
    """
    def id_text(col: str) -> pd.Series:
        values = df[col]
        if pd.api.types.is_float_dtype(values):
            as_float = values.astype("float64")
            present = as_float.dropna()
            # Only whole, finite numbers can be shown as integers safely.
            if np.isfinite(present).all() and (present == np.floor(present)).all():
                values = as_float.astype("Int64")
        return values.astype(str)

    # Unparseable timestamps become NaT instead of raising.
    ts = pd.to_datetime(df["t"], errors="coerce").astype(str)
    keys = id_text("subject_id") + "|" + id_text("hadm_id") + "|" + id_text("icustay_id") + "|" + ts
    return keys.apply(lambda s: hashlib.md5(s.encode()).hexdigest())

# Optional predicted-risk lags
# Kept only when present in BOTH tables; empty list otherwise.
PRED_LAG_COLS = [c for c in ["pred_lag1","pred_lag2","pred_lag1_missing","pred_lag2_missing"]
                 if c in train_tab.columns and c in test_tab.columns]

# TRAIN OOF table (keep only rows with OOF preds)
# Columns: keys + label, optional lag columns, then the model features.
train_keys = ["subject_id","hadm_id","icustay_id","t", LABEL]
keep_tr = [c for c in train_keys + PRED_LAG_COLS + FEATURES if c in train_tab.columns]
oof_df = train_tab.loc[:, keep_tr].copy()
oof_df["row_id"]   = mk_row_id(oof_df)
oof_df["p_cb_oof"] = oof
# fold_ids comes from the training cell; -1 is written if it is not defined.
oof_df["fold_id"]  = fold_ids if "fold_ids" in globals() else -1
# Rows without an out-of-fold prediction are dropped.
oof_df = oof_df.loc[~pd.isna(oof_df["p_cb_oof"])].reset_index(drop=True)

oof_path = os.path.join(OUTDIR, "cb_level1_oof_train_single.csv")
oof_df.to_csv(oof_path, index=False)

# TEST predictions table
# Same layout as the OOF table, with the full-model test probability in
# p_cb_test; every test row is kept.
test_keys = ["subject_id","hadm_id","icustay_id","t", LABEL]
keep_te = [c for c in test_keys + PRED_LAG_COLS + FEATURES if c in test_tab.columns]
test_df = test_tab.loc[:, keep_te].copy()
test_df["row_id"]    = mk_row_id(test_df)
test_df["p_cb_test"] = p_test

test_path = os.path.join(OUTDIR, "cb_level1_preds_test_single.csv")
test_df.to_csv(test_path, index=False)

# Threshold sweep + chosen threshold
# Written only when the training cell has produced all three objects.
if "thr_table" in globals() and "BEST_THR" in globals() and "best_row" in globals():
    thr_path = os.path.join(OUTDIR, "cb_oof_threshold_sweep_single.csv")
    thr_table.to_csv(thr_path, index=False)
    # Selected threshold with the OOF precision/recall/F1 achieved at it.
    with open(os.path.join(OUTDIR, "cb_selected_threshold_single.json"), "w") as f:
        json.dump({
            "best_threshold": float(BEST_THR),
            "oof_precision": float(best_row["precision"]),
            "oof_recall": float(best_row["recall"]),
            "oof_f1": float(best_row["f1"])
        }, f, indent=2)

# Features actually used
# One feature name per line, in the order they were given to the model.
feat_list_path = os.path.join(OUTDIR, "cb_features_used_single.txt")
with open(feat_list_path, "w") as f:
    for c in FEATURES:
        f.write(c + "\n")

# Column-name/dtype table for the keys, label and features as they are
# currently typed in train_tab.
schema_cols = ["subject_id","hadm_id","icustay_id","t", LABEL] + FEATURES
schema_cols = [c for c in schema_cols if c in train_tab.columns]
schema = train_tab[schema_cols].dtypes.astype(str).reset_index()
schema.columns = ["column","dtype"]
schema_path = os.path.join(OUTDIR, "cb_feature_schema_single.csv")
schema.to_csv(schema_path, index=False)


# Full-data model in CatBoost's native .cbm format.
model_path = os.path.join(OUTDIR, "cb_full.cbm")
cb_full.save_model(model_path)

# Training parameters; class_weights is rewritten with a plain float so the
# dict is JSON-serialisable.
params_path = os.path.join(OUTDIR, "cb_params_single.json")
with open(params_path, "w") as f:
    json.dump({**cb_params, "class_weights":[1.0, float(scale_pos_weight)]}, f, indent=2)

# Run summary: row counts, positive rates and feature count.
meta = {
    "stage": "single",
    "rows_train_oof": int(len(oof_df)),
    "rows_test": int(len(test_df)),
    "positive_rate_train": float(train_tab[LABEL].mean()),
    "positive_rate_test": float(test_tab[LABEL].mean()),
    "features_used_count": int(len(FEATURES)),
    "has_pred_lags": bool(len(PRED_LAG_COLS) > 0)
}
with open(os.path.join(OUTDIR, "cb_metadata_single.json"), "w") as f:
    json.dump(meta, f, indent=2)

# README describing the files written by this cell. `readme_txt` is reused
# when an earlier cell defined it; otherwise a short default is built here,
# so the cell no longer stops with a NameError after most files are saved.
if "readme_txt" not in globals():
    readme_txt = "\n".join([
        "CatBoost single-pass run: saved artifacts",
        "",
        "cb_level1_oof_train_single.csv   - train rows with out-of-fold prediction (p_cb_oof), fold_id, row_id",
        "cb_level1_preds_test_single.csv  - test rows with full-model prediction (p_cb_test), row_id",
        "cb_oof_threshold_sweep_single.csv / cb_selected_threshold_single.json - OOF threshold sweep and chosen threshold",
        "cb_features_used_single.txt      - feature names, one per line, in model order",
        "cb_feature_schema_single.csv     - column name / dtype table",
        "cb_full.cbm                      - CatBoost model fitted on all training rows",
        "cb_params_single.json            - CatBoost parameters",
        "cb_metadata_single.json          - row counts, positive rates, feature count",
        "",
        "row_id = md5 of 'subject_id|hadm_id|icustay_id|t'",
        "",
    ])
with open(os.path.join(OUTDIR, "README_single.txt"), "w") as f:
    f.write(readme_txt)

# Summary of the artifact paths written above.
print("Saved:")
print(" -", oof_path)
print(" -", test_path)
print(" -", feat_list_path, " & ", schema_path)
print(" -", model_path, " & ", params_path)
print(" - cb_metadata_single.json, README_single.txt")


b. TF-IDF + LR model for notes

In [None]:
import os, numpy as np, pandas as pd, warnings, json, hashlib
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score, confusion_matrix
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Project folder; the notes CSVs are read from it.
OUTDIR = "/content/drive/MyDrive/CIS_678_final_project"
os.makedirs(OUTDIR, exist_ok=True)

# NOTE(review): the doubled ".csv.csv" suffix is kept as written; confirm
# it matches the file names on Drive.
TRAIN_FILE = os.path.join(OUTDIR, "notes_train_sample.csv.csv")
TEST_FILE  = os.path.join(OUTDIR, "notes_test_sample.csv.csv")

def read_text(path):
    """Load a notes CSV and normalise the columns used downstream.

    Asserts that the key, timestamp, notes and label columns exist, replaces
    missing notes with the empty string and parses "t" as datetime.
    """
    required = ("subject_id", "hadm_id", "icustay_id", "t", "notes_24h", "label6h")
    df = pd.read_csv(path)
    for c in required:
        assert c in df.columns, f"Missing {c}"
    filled_notes = df["notes_24h"].fillna("")
    parsed_t = pd.to_datetime(df["t"])
    df["notes_24h"] = filled_notes
    df["t"] = parsed_t
    return df

# Load both note tables and display the training one.
tr = read_text(TRAIN_FILE)
te = read_text(TEST_FILE)
tr

In [None]:
import os, numpy as np, pandas as pd, warnings, json, hashlib
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score, confusion_matrix
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Project folder; the notes CSVs are read from it.
OUTDIR = "/content/drive/MyDrive/CIS_678_final_project"
os.makedirs(OUTDIR, exist_ok=True)

# NOTE(review): the doubled ".csv.csv" suffix is kept as written; confirm
# it matches the file names on Drive.
TRAIN_FILE = os.path.join(OUTDIR, "notes_train_sample.csv.csv")
TEST_FILE  = os.path.join(OUTDIR, "notes_test_sample.csv.csv")

def read_text(path):
    """Read a notes CSV, check its schema, blank out missing notes, parse "t".

    Raises AssertionError naming the first required column that is absent.
    """
    df = pd.read_csv(path)
    needed = ["subject_id", "hadm_id", "icustay_id", "t", "notes_24h", "label6h"]
    for c in needed:
        assert c in df.columns, f"Missing {c}"
    return df.assign(
        notes_24h=df["notes_24h"].fillna(""),
        t=pd.to_datetime(df["t"]),
    )

# Load the train and test note tables.
tr = read_text(TRAIN_FILE)
te = read_text(TEST_FILE)

GROUP = ["subject_id","hadm_id","icustay_id"]

def add_text_lags(df):
    """Return df sorted by stay and time with lagged-note columns added.

    Within each (subject_id, hadm_id, icustay_id) stay, ordered by ``t``:

    * ``notes_prev1`` / ``notes_prev2`` hold the notes of the previous and
      second-previous row ("" when there is none);
    * ``notes_combo`` is ``prev2 [SEP] prev1 [SEP] current``.

    The result has a fresh 0..n-1 index. Sorting alone keeps the original
    labels in shuffled order, so label-based ``.loc`` lookups with
    positional indices (as the fold code in this notebook does) would pick
    different rows than the positional label arrays refer to.

    The input frame is not modified.
    """
    df = df.sort_values(GROUP + ["t"]).reset_index(drop=True)
    g = df.groupby(GROUP, sort=False)
    df["notes_prev1"] = g["notes_24h"].shift(1).fillna("")
    df["notes_prev2"] = g["notes_24h"].shift(2).fillna("")
    df["notes_combo"] = (df["notes_prev2"] + " [SEP] " +
                         df["notes_prev1"] + " [SEP] " +
                         df["notes_24h"])
    return df

# Add the lagged-note columns to both splits (each is also re-sorted by
# stay and time).
tr = add_text_lags(tr)
te = add_text_lags(te)

# time-ordered, admission-disjoint folds
# Row positions are sorted by timestamp and cut into K contiguous chunks;
# fold_ids[i] is the chunk of the row at position i (0 = earliest).
K = 5
order = np.argsort(tr["t"].values)
fold_ids = np.full(len(tr), -1, int)
for k, idx in enumerate(np.array_split(order, K)):
    fold_ids[idx] = k

# Positional label array and out-of-fold prediction buffer (NaN = no prediction).
y_tr = tr["label6h"].astype(int).values
oof  = np.full(len(tr), np.nan, float)

# fixed hashing space avoids tiny per-fold vocab
# The vectorizer needs no fitting, so one instance is shared by all folds
# and the final model; norm=None leaves the counts unnormalised for the
# TF-IDF step that follows.
hv = HashingVectorizer(
    n_features=2**20,
    alternate_sign=False,
    ngram_range=(1,2),
    norm=None,                 
    strip_accents="unicode",
    lowercase=True
)

# fold_ids, y_tr and oof are positional, so rows are selected by position
# (NumPy indexing / .iloc). Label-based .loc agrees with positions only
# when tr's index is exactly 0..n-1, which a sorted frame does not
# guarantee; a mismatch would pair the wrong notes with the labels and
# with the OOF slots.
hadm_all  = tr["hadm_id"].to_numpy()
notes_all = tr["notes_combo"].astype(str)

# Forward-chaining validation: chunk k is scored by a model trained on
# earlier chunks only, so chunk 0 never receives a prediction.
for k in range(1, K):
    va_idx = np.where(fold_ids == k)[0]
    tr_pool_idx = np.where(fold_ids <  k)[0]
    # admission-disjoint: drop training rows whose admission is also validated
    va_hadm = set(hadm_all[va_idx])
    tr_idx = tr_pool_idx[~np.isin(hadm_all[tr_pool_idx], list(va_hadm))]
    if len(tr_idx)==0 or len(va_idx)==0:
        print(f"[Text] skip chunk {k}: tr={len(tr_idx)} va={len(va_idx)}")
        continue

    X_tr_h = hv.transform(notes_all.iloc[tr_idx])
    X_va_h = hv.transform(notes_all.iloc[va_idx])

    # IDF weights are fitted on the training rows of this fold only.
    tfidf = TfidfTransformer(sublinear_tf=True, smooth_idf=True)
    X_tr  = tfidf.fit_transform(X_tr_h)
    X_va  = tfidf.transform(X_va_h)

    y_tr_fold = y_tr[tr_idx]
    lr = LogisticRegression(
        solver="saga", penalty="l2", C=2.0,
        max_iter=1000, class_weight="balanced", n_jobs=-1,
        random_state=42
    )
    lr.fit(X_tr, y_tr_fold)
    oof[va_idx] = lr.predict_proba(X_va)[:,1]

# Rows that received an out-of-fold prediction, and pooled OOF metrics on them.
used = ~np.isnan(oof)
print(f"OOF coverage: used={used.sum()}  nan={(~used).sum()}")
print("TFIDF-Hash LR OOF AUROC/AUPRC:",
      roc_auc_score(y_tr[used], oof[used]),
      average_precision_score(y_tr[used], oof[used]))

# final fit on full training to TEST 
# IDF weights are fitted on the training notes only and then applied to both splits.
X_full_tr_h = hv.transform(tr["notes_combo"].astype(str))
X_full_te_h = hv.transform(te["notes_combo"].astype(str))
tfidf_full  = TfidfTransformer(sublinear_tf=True, smooth_idf=True).fit(X_full_tr_h)
X_full_tr   = tfidf_full.transform(X_full_tr_h)
X_full_te   = tfidf_full.transform(X_full_te_h)

# Same hyper-parameters as the per-fold models.
lr_final = LogisticRegression(
    solver="saga", penalty="l2", C=2.0,
    max_iter=1000, class_weight="balanced", n_jobs=-1, random_state=42
).fit(X_full_tr, y_tr)

y_te = te["label6h"].astype(int).values
# In-sample train scores are reported for reference only (the model has
# seen these rows); the OOF scores above are the honest train estimate.
p_text_tr_insample = lr_final.predict_proba(X_full_tr)[:,1] 
p_text_te          = lr_final.predict_proba(X_full_te)[:,1]

print("Text TRAIN (in-sample):",
      {"AUROC": roc_auc_score(y_tr, p_text_tr_insample),
       "AUPRC": average_precision_score(y_tr, p_text_tr_insample)})
print("Text TEST:",
      {"AUROC": roc_auc_score(y_te, p_text_te),
       "AUPRC": average_precision_score(y_te, p_text_te)})

# threshold sweep
def sweep_thresholds(y_true, p, grid=None):
    """Compute precision, recall and F1 for each threshold in a grid.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p : array-like of scores; a row counts as positive when ``p >= thr``.
    grid : iterable of float, optional
        Thresholds to try. When omitted (or empty) the default is
        ``np.arange(0.1, 1.0, 0.1)``, i.e. 0.1 through 0.9.

    Returns
    -------
    pandas.DataFrame with columns ``threshold``, ``precision``, ``recall``
    and ``f1``, one row per threshold in grid order.
    """
    # `grid or default` raises ValueError when grid is a NumPy array with
    # more than one element, so check for a missing/empty grid explicitly.
    if grid is None or np.size(grid) == 0:
        grid = np.arange(0.1, 1.0, 0.1)
    # Accept plain sequences as well as arrays/Series.
    p = np.asarray(p)
    rows = []
    for thr in grid:
        yhat = (p >= thr).astype(int)
        rows.append({
            "threshold": float(thr),
            "precision": precision_score(y_true, yhat, zero_division=0),
            "recall":    recall_score(y_true, yhat, zero_division=0),
            "f1":        f1_score(y_true, yhat, zero_division=0)
        })
    return pd.DataFrame(rows)

# OOF sweep, choose best-F1 threshold
# Ties on F1 are broken by recall, then precision.
thr_oof = sweep_thresholds(y_tr[used], oof[used])
best_row = thr_oof.sort_values(["f1","recall","precision"], ascending=False).iloc[0]
BEST_THR = float(best_row["threshold"])

print("\n[OOF] threshold sweep (0.1..0.9):")
print(thr_oof)
print(f"\nSelected OOF THRESH={BEST_THR:.2f} | "
      f"P={best_row['precision']:.3f} R={best_row['recall']:.3f} F1={best_row['f1']:.3f}")

# apply OOF-best threshold to TEST 
# The threshold comes from OOF predictions only; test labels are used
# solely to score the result.
yhat_test = (p_text_te >= BEST_THR).astype(int)
tn, fp, fn, tp = confusion_matrix(y_te, yhat_test, labels=[0,1]).ravel()
prec = precision_score(y_te, yhat_test, zero_division=0)
rec  = recall_score(y_te, yhat_test, zero_division=0)
f1   = f1_score(y_te, yhat_test, zero_division=0)

print("\n[TEST] @ OOF-best threshold:",
      {"threshold": BEST_THR, "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp),
       "precision": round(prec,4), "recall": round(rec,4), "f1": round(f1,4)})

In [None]:
# save the files for further use
import os, json, hashlib
import pandas as pd, numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Text-model artifacts are written directly into the project folder.
OUTDIR = "/content/drive/MyDrive/CIS_678_final_project"
os.makedirs(OUTDIR, exist_ok=True)

def sweep_thresholds(y_true, p, grid=None):
    """Return a precision/recall/F1 table for a grid of decision thresholds.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p : array-like of scores; predictions are ``p >= threshold``.
    grid : iterable of float, optional
        Thresholds to evaluate. When omitted (or empty) nine evenly spaced
        values from 0.1 to 0.9 are used.

    Returns
    -------
    pandas.DataFrame with one row per threshold (grid order) and columns
    ``threshold``, ``precision``, ``recall``, ``f1``.
    """
    # A NumPy array has no single truth value, so `grid or default` would
    # raise ValueError for array grids; check for missing/empty explicitly.
    if grid is None or np.size(grid) == 0:
        grid = np.linspace(0.1, 0.9, 9)
    # Accept plain sequences as well as arrays/Series.
    p = np.asarray(p)
    rows = []
    for thr in grid:
        yhat = (p >= thr).astype(int)
        rows.append({
            "threshold": float(thr),
            "precision": precision_score(y_true, yhat, zero_division=0),
            "recall":    recall_score(y_true, yhat, zero_division=0),
            "f1":        f1_score(y_true, yhat, zero_division=0),
        })
    return pd.DataFrame(rows)

def mk_row_id(df: pd.DataFrame) -> pd.Series:
    """Return the MD5 hex digest of "subject_id|hadm_id|icustay_id|t" per row.

    Identifier columns that were read as floats but hold whole numbers
    (pandas does this when a column contains missing values, or after a
    float cast) are rendered as integers, so ``123.0`` and ``123`` give the
    same id and the ids can be joined against ids computed from
    integer-typed copies of the same rows. Missing values in such a float
    column render as ``"<NA>"``.
    """
    def id_text(col: str) -> pd.Series:
        values = df[col]
        if pd.api.types.is_float_dtype(values):
            as_float = values.astype("float64")
            present = as_float.dropna()
            # Only whole, finite numbers can be shown as integers safely.
            if np.isfinite(present).all() and (present == np.floor(present)).all():
                values = as_float.astype("Int64")
        return values.astype(str)

    keys = (id_text("subject_id") + "|" +
            id_text("hadm_id")    + "|" +
            id_text("icustay_id") + "|" +
            pd.to_datetime(df["t"]).astype(str))
    return keys.apply(lambda s: hashlib.md5(s.encode()).hexdigest())

# Sanity checks before writing: key columns exist, and the prediction
# arrays line up with their tables row for row.
assert {"subject_id","hadm_id","icustay_id","t","label6h"}.issubset(tr.columns)
assert {"subject_id","hadm_id","icustay_id","t","label6h"}.issubset(te.columns)
assert len(oof) == len(tr), "oof length != train rows"
assert len(p_text_te) == len(te), "test probs length != test rows"


# Recompute the OOF threshold sweep with this cell's sweep_thresholds and
# pick the best-F1 threshold (ties: recall, then precision).
thr_table = sweep_thresholds(y_tr[used], oof[used])
best_row = thr_table.sort_values(["f1","recall","precision"], ascending=False).iloc[0]
BEST_THR = float(best_row["threshold"])
print("[OOF] threshold sweep (0.1..0.9):\n", thr_table)
print(f"\nSelected OOF THRESH={BEST_THR:.2f} | "
      f"P={best_row['precision']:.3f} R={best_row['recall']:.3f} F1={best_row['f1']:.3f}")


# Train rows that received an out-of-fold prediction, with row_id and the
# OOF probability in p_text. `used` is a boolean mask, so the selection is
# positional.
tr_oof = tr.loc[used, ["subject_id","hadm_id","icustay_id","t","label6h"]].copy()
tr_oof["row_id"] = mk_row_id(tr_oof)
tr_oof["p_text"] = oof[used]
tr_oof_path = os.path.join(OUTDIR, "oof_text_hash_tfidf_lr.csv")
tr_oof.to_csv(tr_oof_path, index=False)


# All test rows with row_id and the final model's probability in p_text.
te_out = te[["subject_id","hadm_id","icustay_id","t","label6h"]].copy()
te_out["row_id"] = mk_row_id(te_out)
te_out["p_text"] = p_text_te
te_out_path = os.path.join(OUTDIR, "pred_text_hash_tfidf_lr_test.csv")
te_out.to_csv(te_out_path, index=False)


# Full OOF threshold sweep.
thr_oof_path = os.path.join(OUTDIR, "text_oof_thresholds.csv")
thr_table.to_csv(thr_oof_path, index=False)

# Selected threshold and the OOF metrics at it.
# NOTE(review): "pinned_test_threshold" is a hard-coded 0.10 that nothing in
# this cell reads; confirm where (and whether) it is consumed.
selected = {
    "selected_oof_threshold": float(BEST_THR),
    "oof_precision": float(best_row["precision"]),
    "oof_recall": float(best_row["recall"]),
    "oof_f1": float(best_row["f1"]),
    "pinned_test_threshold": 0.10
}
with open(os.path.join(OUTDIR, "text_selected_threshold.json"), "w") as f:
    json.dump(selected, f, indent=2)

print("Saved:")
print(" -", tr_oof_path)
print(" -", te_out_path)
print(" -", thr_oof_path, " + text_selected_threshold.json")


c. Fusion model (CatBoost + TF-IDF LR)

In [None]:
import os, json, hashlib
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

# BASE: project folder (text-model outputs live here);
# CB_DIR: CatBoost level-1 outputs; OUTDIR: where fusion artifacts go.
BASE   = "/content/drive/MyDrive/CIS_678_final_project"
CB_DIR = os.path.join(BASE, "catboost_rich6h")       
OUTDIR = os.path.join(BASE, "fusion_level2")
os.makedirs(OUTDIR, exist_ok=True)

# Path form of BASE used by read_tabular/read_text.
DATA = Path(BASE)

def read_tabular(fname):
    """Load DATA/fname as a DataFrame, parsing "t" as datetime and "NULL" as NaN."""
    options = {"parse_dates": ["t"], "na_values": ["NULL"]}
    return pd.read_csv(DATA / fname, **options)

def read_text(fname):
    """Load DATA/fname as a DataFrame, reading the string "NULL" as NaN."""
    source = DATA / fname
    frame = pd.read_csv(source, na_values=["NULL"])
    return frame

def report_metrics(y_true, p):
    """Score p against y_true and return a dict with keys "AUROC" and "AUPRC"."""
    auroc = roc_auc_score(y_true, p)
    auprc = average_precision_score(y_true, p)
    return {"AUROC": auroc, "AUPRC": auprc}

def mk_row_id(df):
    """Return the MD5 hex digest of "subject_id|hadm_id|icustay_id|t" per row.

    Identifier columns that come back from CSV as floats holding whole
    numbers (e.g. written as ``123.0``) are rendered as integers before
    hashing, so the id does not depend on whether a file stored the keys as
    ``123`` or ``123.0``. Missing values in such a float column render as
    ``"<NA>"``. Unparseable timestamps are coerced to NaT rather than
    raising.
    """
    def id_text(col):
        values = df[col]
        if pd.api.types.is_float_dtype(values):
            as_float = values.astype("float64")
            present = as_float.dropna()
            # Only whole, finite numbers can be shown as integers safely.
            if np.isfinite(present).all() and (present == np.floor(present)).all():
                values = as_float.astype("Int64")
        return values.astype(str)

    ts = pd.to_datetime(df["t"], errors="coerce").astype(str)
    keys = (id_text("subject_id") + "|" +
            id_text("hadm_id")    + "|" +
            id_text("icustay_id") + "|" +
            ts)
    return keys.apply(lambda s: hashlib.md5(s.encode()).hexdigest())



# Level-1 inputs: CatBoost OOF/test predictions and text-model OOF/test predictions.
cb_tr_path = os.path.join(CB_DIR, "cb_level1_oof_train_single.csv")
cb_te_path = os.path.join(CB_DIR, "cb_level1_preds_test_single.csv")
tx_tr_path = os.path.join(BASE, "oof_text_hash_tfidf_lr.csv")
tx_te_path = os.path.join(BASE, "pred_text_hash_tfidf_lr_test.csv")

cb_tr = pd.read_csv(cb_tr_path, parse_dates=["t"])
cb_te = pd.read_csv(cb_te_path, parse_dates=["t"])
tx_tr = pd.read_csv(tx_tr_path, parse_dates=["t"])
tx_te = pd.read_csv(tx_te_path, parse_dates=["t"])

# ensure row_id
# Only computed for a file that lacks the column; an existing row_id is
# used as stored.
for df in (cb_tr, cb_te, tx_tr, tx_te):
    if "row_id" not in df.columns:
        df["row_id"] = mk_row_id(df)


# Left-join the text probability onto the CatBoost rows; rows without a
# matching text prediction get NaN in p_text.
# NOTE(review): the join relies on both level-1 files carrying identically
# computed row_id values. If the key columns were formatted differently
# when hashed (e.g. "123.0" vs "123"), nothing matches and p_text is all
# NaN. Verify the match rate via has_text.
tr = cb_tr.merge(tx_tr[["row_id","p_text"]], on="row_id", how="left")
te = cb_te.merge(tx_te[["row_id","p_text"]], on="row_id", how="left")

# basic sanity
assert {"subject_id","hadm_id","icustay_id","t","label6h","p_cb_oof"}.issubset(tr.columns), "Missing columns in cb OOF train"
assert {"subject_id","hadm_id","icustay_id","t","label6h","p_cb_test"}.issubset(te.columns), "Missing columns in cb TEST"


# Use one column name, p_cb, for the CatBoost probability in both splits.
tr = tr.rename(columns={"p_cb_oof": "p_cb"})  
te = te.rename(columns={"p_cb_test": "p_cb"}) 

# has_text + neutral imputation
# has_text flags rows that had a text prediction before imputation.
tr["has_text"] = (~tr["p_text"].isna()).astype(int)
te["has_text"] = (~te["p_text"].isna()).astype(int)

y_tr = tr["label6h"].astype(int).to_numpy()
y_te = te["label6h"].astype(int).to_numpy()
# Training positive rate, used as the fill value for missing text
# probabilities in both splits.
prior = float(y_tr.mean())

tr["p_text"] = tr["p_text"].fillna(prior)
te["p_text"] = te["p_text"].fillna(prior)

# save aligned inputs
tr.to_csv(os.path.join(OUTDIR, "aligned_level1_train.csv"), index=False)
te.to_csv(os.path.join(OUTDIR, "aligned_level1_test.csv"), index=False)

# Level-2 (fusion) inputs: the two level-1 probabilities plus the text
# availability flag. (A stray "-" line that made this cell a SyntaxError
# was removed here.)
FUSED_FEATS = ["p_cb", "p_text", "has_text"]
# +/-inf are treated as missing, and anything still missing becomes 0.0.
X_tr = tr[FUSED_FEATS].replace([np.inf,-np.inf], np.nan).fillna(0.0)
X_te = te[FUSED_FEATS].replace([np.inf,-np.inf], np.nan).fillna(0.0)

# tiny extra guard
assert list(X_tr.columns) == list(X_te.columns), "Train/Test feature columns differ"


# Time-ordered folds over the fusion training rows: positions sorted by
# timestamp and cut into K contiguous chunks (0 = earliest).
tvals = pd.to_datetime(tr["t"]).to_numpy()
order = np.argsort(tvals)
K = 5
fold_ids = np.full(len(tr), -1, int)
for k, idx in enumerate(np.array_split(order, K)):
    fold_ids[idx] = k

# Forward-chaining OOF for the meta-learner: chunk k is scored by a model
# trained on earlier chunks only, so chunk 0 stays NaN.
oof_fused = np.full(len(tr), np.nan, float)
for k in range(1, K):
    va_idx = np.where(fold_ids == k)[0]
    tr_pool_idx = np.where(fold_ids <  k)[0]
    # HADM disjoint
    # tr comes from a merge and so carries a default 0..n-1 index, which
    # makes these .loc lookups with positional indices line up.
    va_hadm = set(tr.loc[va_idx, "hadm_id"])
    tr_idx = tr_pool_idx[~tr.loc[tr_pool_idx, "hadm_id"].isin(va_hadm)]
    if len(tr_idx)==0 or len(va_idx)==0:
        continue
    meta = LogisticRegression(solver="liblinear", penalty="l2", C=1.0,
                              class_weight="balanced", max_iter=1000, random_state=42)
    meta.fit(X_tr.iloc[tr_idx], y_tr[tr_idx])
    oof_fused[va_idx] = meta.predict_proba(X_tr.iloc[va_idx])[:,1]

# Rows with a fused OOF prediction; abort if there are none.
used = ~np.isnan(oof_fused)
print("FUSION OOF coverage:", used.sum(), "/", len(tr))
if used.sum() == 0:
    raise RuntimeError("No OOF_fused produced; check joins/columns.")

print("FUSION OOF AUROC/AUPRC:",
      roc_auc_score(y_tr[used], oof_fused[used]),
      average_precision_score(y_tr[used], oof_fused[used]))


# Meta-learner refit on all fusion training rows (same settings as the
# per-fold models), then scored on the test split.
meta_full = LogisticRegression(solver="liblinear", penalty="l2", C=1.0,
                               class_weight="balanced", max_iter=1000, random_state=42).fit(X_tr, y_tr)
p_fused_test = meta_full.predict_proba(X_te)[:,1]
print("FUSION TEST AUROC/AUPRC:",
      roc_auc_score(y_te, p_fused_test),
      average_precision_score(y_te, p_fused_test))


def sweep_thresholds(y_true, p, grid=None):
    """Evaluate precision, recall and F1 at each threshold of a grid.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p : array-like of scores; a row is positive when ``p >= threshold``.
    grid : iterable of float, optional
        Thresholds to evaluate. When omitted (or empty) 19 evenly spaced
        values from 0.05 to 0.95 are used.

    Returns
    -------
    pandas.DataFrame with columns ``threshold``, ``precision``, ``recall``
    and ``f1``; one row per threshold, in grid order.
    """
    # `grid or default` cannot be used: the truth value of a multi-element
    # NumPy array is ambiguous and raises ValueError.
    if grid is None or np.size(grid) == 0:
        grid = np.linspace(0.05, 0.95, 19)
    # Accept plain sequences as well as arrays/Series.
    p = np.asarray(p)
    rows=[]
    for thr in grid:
        yhat = (p >= thr).astype(int)
        rows.append({"threshold": float(thr),
                     "precision": precision_score(y_true, yhat, zero_division=0),
                     "recall":    recall_score(y_true, yhat, zero_division=0),
                     "f1":        f1_score(y_true, yhat, zero_division=0)})
    return pd.DataFrame(rows)

# Threshold sweep on the fused OOF predictions.
thr_table = sweep_thresholds(y_tr[used], oof_fused[used])
if thr_table.empty:
    raise RuntimeError("Threshold table is empty; check OOF predictions.")

# Best-F1 threshold; ties broken by recall, then precision.
best_row = thr_table.sort_values(["f1","recall","precision"], ascending=False).iloc[0]
BEST_THR = float(best_row["threshold"])
print("\n[OOF_fused] threshold sweep:\n", thr_table)
print(f"\nSelected THR={BEST_THR:.2f} | "
      f"P={float(best_row['precision']):.3f} R={float(best_row['recall']):.3f} F1={float(best_row['f1']):.3f}")


# Apply the OOF-selected threshold to the test predictions and report the
# confusion matrix with P/R/F1.
yhat = (p_fused_test >= BEST_THR).astype(int)
tn, fp, fn, tp = confusion_matrix(y_te, yhat, labels=[0,1]).ravel()
print("\nTEST @ OOF_fused threshold:",
      {"threshold": BEST_THR, "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp),
       "precision": precision_score(y_te, yhat, zero_division=0),
       "recall":    recall_score(y_te, yhat, zero_division=0),
       "f1":        f1_score(y_te, yhat, zero_division=0)})

# save artifacts
# Train rows that received a fused OOF prediction.
tr_out = tr.loc[used, ["subject_id","hadm_id","icustay_id","t","label6h","row_id"]].copy()
tr_out["p_fused_oof"] = oof_fused[used]
tr_out.to_csv(os.path.join(OUTDIR, "oof_fused_train.csv"), index=False)

# All test rows with the fused test probability.
te_out = te[["subject_id","hadm_id","icustay_id","t","label6h","row_id"]].copy()
te_out["p_fused_test"] = p_fused_test
te_out.to_csv(os.path.join(OUTDIR, "pred_fused_test.csv"), index=False)

# Threshold sweep and the selected threshold with its OOF metrics.
thr_table.to_csv(os.path.join(OUTDIR, "fusion_oof_thresholds.csv"), index=False)
with open(os.path.join(OUTDIR, "fusion_selected_threshold.json"), "w") as f:
    json.dump({
        "best_threshold": BEST_THR,
        "oof_precision": float(best_row["precision"]),
        "oof_recall": float(best_row["recall"]),
        "oof_f1": float(best_row["f1"])
    }, f, indent=2)

print("\nSaved fusion files to:", OUTDIR)