Deep Learning Model

a. ClinicalBERT model for notes

In [None]:
import pandas as pd
import numpy as np

# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project folder as a plain string (note the trailing slash).
save_dir = "/content/drive/MyDrive/CIS_678_final_project/"

from pathlib import Path
# Same project folder as a Path; read_tabular/read_text join file names onto it.
DATA = Path('/content/drive/MyDrive/CIS_678_final_project')

import pandas as pd
def read_tabular(fname):
    """Read a CSV from the project folder, parsing ``t`` as datetimes.

    The literal string ``NULL`` is treated as a missing value.
    """
    csv_path = DATA / fname
    frame = pd.read_csv(csv_path, na_values=["NULL"], parse_dates=["t"])
    return frame

def read_text(fname):
    """Read a CSV from the project folder, treating ``NULL`` as missing."""
    csv_path = DATA / fname
    frame = pd.read_csv(csv_path, na_values=["NULL"])
    return frame

from sklearn.metrics import roc_auc_score, average_precision_score
def report_metrics(y_true, p):
    """Return AUROC and AUPRC of scores ``p`` against labels ``y_true`` as a dict."""
    scorers = {"AUROC": roc_auc_score, "AUPRC": average_precision_score}
    return {name: scorer(y_true, p) for name, scorer in scorers.items()}

In [None]:
!pip install -q "transformers==4.44.2" "accelerate>=0.33.0" "datasets>=2.20.0" "evaluate>=0.4.1"

In [None]:
import os, numpy as np, pandas as pd, torch, evaluate, hashlib
from datasets import Dataset, Value
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score
)


# Folder used both for reading the input CSVs and for writing all outputs.
OUTDIR    = "/content/drive/MyDrive/CIS_678_final_project"
os.makedirs(OUTDIR, exist_ok=True)

# Hugging Face model id and the input file names (relative to OUTDIR).
MODEL      = "emilyalsentzer/Bio_ClinicalBERT"
TRAIN_FILE = "notes_train_sample.csv.csv"
TEST_FILE  = "notes_test_sample.csv.csv"

# Column names: raw note text, binary target, and the row identifier columns.
BASE_TEXT_COL = "notes_24h"   
LABEL_COL     = "label6h"
ID_COLS       = ["subject_id","hadm_id","icustay_id","t"]

# Tokenizer sequence length and the global random seed.
MAX_LEN    = 256
SEED       = 42
set_seed(SEED)
# Turn off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"


def read_text(fname):
    """Read the CSV named ``fname`` from ``OUTDIR`` with pandas defaults."""
    csv_path = os.path.join(OUTDIR, fname)
    frame = pd.read_csv(csv_path)
    return frame

# Load the raw train/test note tables.
raw_tr = read_text(TRAIN_FILE)
raw_te = read_text(TEST_FILE)

# keep base text + label + IDs
keep_cols = [BASE_TEXT_COL, LABEL_COL] + ID_COLS
tr = raw_tr[keep_cols].copy()
te = raw_te[keep_cols].copy()

# Missing notes become empty strings; "t" is parsed to datetimes so rows can be time-ordered.
for df in (tr, te):
    df[BASE_TEXT_COL] = df[BASE_TEXT_COL].fillna("")
    df["t"] = pd.to_datetime(df["t"])


# Keys identifying one group of rows for the lag features (used by add_text_lags).
GROUP = ["subject_id","hadm_id","icustay_id"]

def add_text_lags(df):
    """Return a time-sorted copy of ``df`` with lagged and combined note columns.

    Rows are sorted by ``GROUP`` then ``t``. Within each ``GROUP``, the notes
    of the previous one and two rows are stored in ``notes_prev1`` and
    ``notes_prev2`` (empty string when there is no such row), and
    ``notes_combo`` joins prev2, prev1 and the current note with `` [SEP] ``.
    The original index labels are kept (the result is not re-indexed).
    """
    ordered = df.sort_values(GROUP + ["t"]).copy()
    notes_by_group = ordered.groupby(GROUP, sort=False)[BASE_TEXT_COL]

    for lag in (1, 2):
        ordered[f"notes_prev{lag}"] = notes_by_group.shift(lag).fillna("")

    # Oldest note first, current note last.
    # NOTE(review): with right-side truncation the current note is the first
    # part to be cut when the combined text is long; confirm that is intended.
    oldest = ordered["notes_prev2"]
    previous = ordered["notes_prev1"]
    current = ordered[BASE_TEXT_COL]
    ordered["notes_combo"] = oldest + " [SEP] " + previous + " [SEP] " + current
    return ordered

# Add lagged/combined note columns; both frames come back sorted by GROUP then t.
tr = add_text_lags(tr)
te = add_text_lags(te)

# The model is fed the combined (prev2 + prev1 + current) text.
TEXT_COL = "notes_combo"   


# Order patients by their earliest timestamp in the training table.
patient_time = tr.groupby("subject_id")["t"].min().sort_values()
patients_ordered = patient_time.index.to_numpy()

# Cut the ordered patient list into K contiguous chunks; chunk k gets fold id k.
K = 5
patient_folds = np.full(len(patients_ordered), -1, int)
for k, idx in enumerate(np.array_split(np.arange(len(patients_ordered)), K)):
    patient_folds[idx] = k

# Last chunk (latest-starting patients) is validation; all earlier chunks are training.
val_fold = K - 1
val_patients   = set(patients_ordered[patient_folds == val_fold])
train_patients = set(patients_ordered[patient_folds <  val_fold])

tr_train = tr[tr["subject_id"].isin(train_patients)].copy().reset_index(drop=True)
tr_val   = tr[tr["subject_id"].isin(val_patients)].copy().reset_index(drop=True)

# Sanity checks: no patient, admission or ICU stay appears on both sides of the split.
assert set(tr_train["subject_id"]).isdisjoint(set(tr_val["subject_id"]))
assert set(tr_train["hadm_id"]).isdisjoint(set(tr_val["hadm_id"]))
assert set(tr_train["icustay_id"]).isdisjoint(set(tr_val["icustay_id"]))

print("Train rows:", len(tr_train), " Val rows:", len(tr_val), " Test rows:", len(te))


# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tok(batch):
    """Tokenize the ``TEXT_COL`` entries of ``batch``.

    Every sequence is truncated and padded to exactly ``MAX_LEN`` tokens.
    """
    texts = batch[TEXT_COL]
    encode_options = dict(truncation=True, padding="max_length", max_length=MAX_LEN)
    return tokenizer(texts, **encode_options)

def to_hf(ds_df: pd.DataFrame) -> Dataset:
    """Convert a notes DataFrame into a tokenized, torch-formatted ``Dataset``.

    ``LABEL_COL`` is renamed to ``labels`` and cast to int64 if needed. The
    combined text column and the ID columns are dropped once tokenization is
    done; any other columns of ``ds_df`` are left in the dataset.
    """
    frame = ds_df.rename(columns={LABEL_COL: "labels"}).reset_index(drop=True)
    dataset = Dataset.from_pandas(frame, preserve_index=False)

    # remove text + ID columns (only those actually present)
    candidates = [TEXT_COL, *ID_COLS, "__index_level_0__"]
    removable = [col for col in candidates if col in dataset.column_names]
    dataset = dataset.map(tok, batched=True, remove_columns=removable)

    label_feature = dataset.features.get("labels")
    if label_feature is not None and label_feature.dtype != "int64":
        dataset = dataset.cast_column("labels", Value("int64"))

    dataset.set_format("torch")
    return dataset

# Tokenized datasets; row order follows tr_train / tr_val / te after to_hf's reset_index.
ds_tr = to_hf(tr_train)
ds_va = to_hf(tr_val)
ds_te = to_hf(te)


# Pretrained encoder with a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

# Weight for the positive class = negatives / positives in the training split
# (the denominator is floored at 1e-6 to avoid division by zero).
pos_rate = float((tr_train[LABEL_COL] == 1).mean())
w_pos = (1.0 - pos_rate) / max(pos_rate, 1e-6)
print("Train positive rate:", pos_rate, "  pos_weight:", w_pos)

class MyTrainer(Trainer):
    """``Trainer`` that uses class-weighted cross-entropy for the 2-class head.

    Args:
        w_pos: Weight applied to class 1 in the loss; class 0 has weight 1.0.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, w_pos=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor([1.0, w_pos], dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy between the model logits and ``labels``.

        ``num_items_in_batch`` is accepted (and ignored) so that the override
        also works with Trainer versions that pass this keyword; the call
        ``compute_loss(model, inputs, return_outputs)`` behaves as before.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Labels are removed from the inputs so the model does not compute its
        # own (unweighted) loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weight tensor is created on CPU in __init__; move it next to the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ROC-AUC metric object from the `evaluate` library, used by compute_metrics.
metric_auc = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into AUROC / AUPRC for the Trainer.

    The score for each row is the softmax probability of class 1.
    """
    logits, labels = eval_pred
    class_probs = torch.softmax(torch.tensor(logits), dim=1)
    pos_probs = class_probs[:, 1].numpy()
    roc_result = metric_auc.compute(prediction_scores=pos_probs, references=labels)
    return {
        "AUROC": roc_result["roc_auc"],
        "AUPRC": average_precision_score(labels, pos_probs),
    }

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest validation AUPRC at the end of training.
# NOTE(review): `evaluation_strategy` is the argument name for the pinned
# transformers version; confirm the name before upgrading the library.
args = TrainingArguments(
    output_dir=os.path.join(OUTDIR, "bert_out"),
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="AUPRC",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=100,
    seed=SEED,
    report_to="none",
    save_safetensors=False,
    overwrite_output_dir=True,
)

# Trainer with the class-weighted loss; validation uses the held-out patient fold.
trainer = MyTrainer(
    model=model,
    args=args,
    train_dataset=ds_tr,
    eval_dataset=ds_va,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    w_pos=w_pos,
)

trainer.train()


def get_probs(ds):
    """Run the trained model on ``ds`` and return the class-1 softmax probabilities."""
    logits = torch.tensor(trainer.predict(ds).predictions)
    return torch.softmax(logits, dim=1)[:, 1].numpy()

# Class-1 probabilities for each split. p_tr is computed on the same rows the
# model was trained on, so it is an in-sample prediction.
p_tr = get_probs(ds_tr)
p_va = get_probs(ds_va)
p_te = get_probs(ds_te)

def quick_metrics(y, p):
    """Return ``{"AUROC": ..., "AUPRC": ...}`` for labels ``y`` and scores ``p``."""
    auroc = roc_auc_score(y, p)
    auprc = average_precision_score(y, p)
    return {"AUROC": auroc, "AUPRC": auprc}

# Report ranking metrics per split (labels are taken in the same row order as the datasets).
print("ClinicalBERT TRAIN(split):", quick_metrics(tr_train[LABEL_COL].values, p_tr))
print("ClinicalBERT VAL(split):  ", quick_metrics(tr_val[LABEL_COL].values,   p_va))
print("ClinicalBERT TEST:        ", quick_metrics(te[LABEL_COL].values,       p_te))


def threshold_sweep(y, p, split_name):
    """Tabulate precision / recall / F1 at thresholds 0.1, 0.2, ..., 0.9.

    A row is predicted positive when ``p >= threshold``. Returns a DataFrame
    with columns ``split``, ``threshold``, ``precision``, ``recall``, ``f1``.
    """
    def score_at(th):
        yhat = (p >= th).astype(int)
        return {
            "split": split_name,
            "threshold": th,
            "precision": precision_score(y, yhat, zero_division=0),
            "recall": recall_score(y, yhat, zero_division=0),
            "f1": f1_score(y, yhat, zero_division=0),
        }

    return pd.DataFrame([score_at(step / 10) for step in range(1, 10)])

# Stack the per-split threshold sweeps into one long table.
df_thr = pd.concat([
    threshold_sweep(tr_train[LABEL_COL].values, p_tr, "train"),
    threshold_sweep(tr_val[LABEL_COL].values,   p_va, "val"),
    threshold_sweep(te[LABEL_COL].values,       p_te, "test"),
], axis=0)

print("\nThreshold sweep (0.1..0.9):")
# Display indexed by (split, threshold) for readability.
print(df_thr.pivot_table(index=["split","threshold"], values=["precision","recall","f1"]))

df_thr.to_csv(os.path.join(OUTDIR, "bert_threshold_sweep_prf1.csv"), index=False)


# save probs for fusion 
def mk_row_id(df: pd.DataFrame) -> pd.Series:
    """Build a per-row key: MD5 hex digest of ``subject_id|hadm_id|icustay_id|t``.

    ``t`` is passed through ``pd.to_datetime(..., errors="coerce")`` before it
    is stringified, so a frame whose ``t`` is still text yields the same key as
    one whose ``t`` is already parsed. This matches how the fusion step builds
    its ``row_id``. For an already-parsed datetime column the result is
    unchanged. Unparseable timestamps become ``NaT``.

    Args:
        df: Frame with ``subject_id``, ``hadm_id``, ``icustay_id`` and ``t`` columns.

    Returns:
        Series of 32-character hex strings aligned with ``df``'s index.
    """
    ts = pd.to_datetime(df["t"], errors="coerce").astype(str)
    keys = (
        df["subject_id"].astype(str) + "|" +
        df["hadm_id"].astype(str)    + "|" +
        df["icustay_id"].astype(str) + "|" +
        ts
    )
    return keys.apply(lambda s: hashlib.md5(s.encode()).hexdigest())

# train-split predictions
# NOTE(review): p_tr comes from the model that was trained on these same rows,
# so the "train" rows written below are in-sample predictions even though the
# output file name starts with "oof_".
df_tr_pred = pd.concat(
    [
        tr_train[ID_COLS].reset_index(drop=True),
        pd.Series(p_tr, name="p_text_bert"),
        tr_train[LABEL_COL].reset_index(drop=True).rename(LABEL_COL),
    ],
    axis=1,
)
df_tr_pred["row_id"] = mk_row_id(df_tr_pred)
df_tr_pred["split"] = "train"

# val-split predictions (rows the model was not trained on)
df_va_pred = pd.concat(
    [
        tr_val[ID_COLS].reset_index(drop=True),
        pd.Series(p_va, name="p_text_bert"),
        tr_val[LABEL_COL].reset_index(drop=True).rename(LABEL_COL),
    ],
    axis=1,
)
df_va_pred["row_id"] = mk_row_id(df_va_pred)
df_va_pred["split"] = "val"

# Train and val predictions go into one file; the "split" column tells them apart.
df_train_full = pd.concat([df_tr_pred, df_va_pred], ignore_index=True)
df_train_full.to_csv(os.path.join(OUTDIR, "oof_text_bert_train_full.csv"), index=False)

# test predictions
df_te_pred = pd.concat(
    [
        te[ID_COLS].reset_index(drop=True),
        pd.Series(p_te, name="p_text_bert"),
        te[LABEL_COL].reset_index(drop=True).rename(LABEL_COL),
    ],
    axis=1,
)
df_te_pred["row_id"] = mk_row_id(df_te_pred)
df_te_pred["split"] = "test"
df_te_pred.to_csv(os.path.join(OUTDIR, "pred_text_bert_test.csv"), index=False)

print("\nSaved:")
print(" - oof_text_bert_train_full.csv  (train + val merged, for fusion)")
print(" - pred_text_bert_test.csv       (test probs)")
print(" - bert_threshold_sweep_prf1.csv")


b. MLP Fusion model (CatBoost + ClinicalBERT notes)

In [None]:
import os, json, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score, confusion_matrix
)

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


# Project root, folder holding the CatBoost level-1 predictions, and the fusion output folder.
BASE   = "/content/drive/MyDrive/CIS_678_final_project"
CB_DIR = os.path.join(BASE, "catboost_rich6h") 
OUTDIR = os.path.join(BASE, "fusion_cb_bert_mlp")
os.makedirs(OUTDIR, exist_ok=True)

# Fix the torch and NumPy seeds.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


def mk_row_id(df: pd.DataFrame) -> pd.Series:
    """Return one MD5 hex key per row from ``subject_id|hadm_id|icustay_id|t``.

    ``t`` is parsed with ``pd.to_datetime`` (unparseable values become ``NaT``)
    before being converted to text.
    """
    def _digest(key: str) -> str:
        return hashlib.md5(key.encode()).hexdigest()

    stamp_text = pd.to_datetime(df["t"], errors="coerce").astype(str)
    joined = df["subject_id"].astype(str)
    for piece in (df["hadm_id"].astype(str), df["icustay_id"].astype(str), stamp_text):
        joined = joined + "|" + piece
    return joined.apply(_digest)


# CatBoost level-1 predictions (train and test).
cb_tr_path = os.path.join(CB_DIR, "cb_level1_oof_train_single.csv")
cb_te_path = os.path.join(CB_DIR, "cb_level1_preds_test_single.csv")

cb_tr = pd.read_csv(cb_tr_path, parse_dates=["t"])
cb_te = pd.read_csv(cb_te_path, parse_dates=["t"])


# Give the CatBoost probability the same column name ("p_cb") in both tables.
cb_tr = cb_tr.rename(columns={"p_cb_oof": "p_cb"})
cb_te = cb_te.rename(columns={"p_cb_test": "p_cb"})


# Build the join key only when the file did not already carry one.
for df in (cb_tr, cb_te):
    if "row_id" not in df.columns:
        df["row_id"] = mk_row_id(df)

# ClinicalBERT note-model predictions (train+val file and test file).
bert_tr_path = os.path.join(BASE, "oof_text_bert_train_full.csv")
bert_te_path = os.path.join(BASE, "pred_text_bert_test.csv")

bert_tr = pd.read_csv(bert_tr_path, parse_dates=["t"])
bert_te = pd.read_csv(bert_te_path, parse_dates=["t"])

for df in (bert_tr, bert_te):
    if "row_id" not in df.columns:
        df["row_id"] = mk_row_id(df)


# Left-join the BERT probability onto the CatBoost table by row_id; CatBoost
# rows with no matching BERT row get NaN in p_text_bert.
tr = cb_tr.merge(
    bert_tr[["row_id", "p_text_bert"]],
    on="row_id",
    how="left"
)

te = cb_te.merge(
    bert_te[["row_id", "p_text_bert"]],
    on="row_id",
    how="left"
)


# Fail early if any column needed downstream is missing.
for col in ["subject_id","hadm_id","icustay_id","t","label6h","p_cb"]:
    assert col in tr.columns, f"Missing {col} in train fusion table"
    assert col in te.columns, f"Missing {col} in test fusion table"


# Indicator: 1 when a BERT probability was found for the row, else 0.
tr["has_text"] = (~tr["p_text_bert"].isna()).astype(int)
te["has_text"] = (~te["p_text_bert"].isna()).astype(int)

y_tr = tr["label6h"].astype(int).to_numpy()
y_te = te["label6h"].astype(int).to_numpy()
# Training-set positive rate, used as the fill value for missing text probabilities.
prior = float(y_tr.mean())

# Neutral imputation for missing text probs
tr["p_text_bert"] = tr["p_text_bert"].fillna(prior)
te["p_text_bert"] = te["p_text_bert"].fillna(prior)

FUSED_FEATS = ["p_cb", "p_text_bert", "has_text"]

# Feature matrices as float32; infinities and any remaining NaN are replaced by 0.
X_tr = tr[FUSED_FEATS].replace([np.inf,-np.inf], np.nan).fillna(0.0).to_numpy().astype("float32")
X_te = te[FUSED_FEATS].replace([np.inf,-np.inf], np.nan).fillna(0.0).to_numpy().astype("float32")

print("Fusion features:", FUSED_FEATS)
print("Train shape:", X_tr.shape, " Test shape:", X_te.shape)


# time-aware, HADM-disjoint folds for OOF
# Rows are ordered by timestamp and cut into K contiguous chunks; fold_ids[i]
# is the chunk index of row i. The chunks themselves are purely time-based:
# admission disjointness is enforced later, when each training pool is built.
tvals = pd.to_datetime(tr["t"]).to_numpy()
order = np.argsort(tvals)
K = 5

fold_ids = np.full(len(tr), -1, int)
for k, idx in enumerate(np.array_split(order, K)):
    fold_ids[idx] = k


# small MLP model
class FusionMLP(nn.Module):
    """Small feed-forward network that maps fused features to a single logit.

    Layout: Linear -> ReLU -> Dropout, twice, followed by a Linear layer with
    one output. ``forward`` returns raw logits (no sigmoid applied).
    """

    def __init__(self, in_dim=3, hidden1=16, hidden2=8, dropout=0.2):
        super().__init__()
        layers = []
        for n_in, n_out in ((in_dim, hidden1), (hidden1, hidden2)):
            layers.extend([nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(dropout)])
        layers.append(nn.Linear(hidden2, 1))  # logits
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        # Drop the trailing size-1 dimension: (batch, 1) -> (batch,)
        return logits.squeeze(-1)

def train_one_mlp(X_train, y_train, X_val, y_val,
                  epochs=8, batch_size=2048, lr=1e-3):
    """Train a ``FusionMLP`` and return it with its validation probabilities.

    The loss is ``BCEWithLogitsLoss`` with ``pos_weight`` set to the
    negative/positive ratio of ``y_train``. After every epoch the validation
    AUROC and AUPRC are printed; they are shown as NaN when scikit-learn raises
    ``ValueError`` for them.

    Args:
        X_train, X_val: NumPy feature arrays (converted with ``torch.from_numpy``).
        y_train, y_val: NumPy label arrays.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for the training loader.
        lr: Adam learning rate.

    Returns:
        ``(model, p_va)`` where ``p_va`` holds the sigmoid probabilities of the
        trained model on ``X_val`` as a NumPy array.
    """
    # pos_weight for imbalance
    pos_rate = float(y_train.mean())
    pos_weight = (1.0 - pos_rate) / max(pos_rate, 1e-8)
    pos_weight_t = torch.tensor(pos_weight, dtype=torch.float32, device=device)

    model = FusionMLP(in_dim=X_train.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)

    # The whole training set is placed on the device once and batched from there.
    X_tr_t = torch.from_numpy(X_train).to(device)
    y_tr_t = torch.from_numpy(y_train.astype("float32")).to(device)
    ds_tr = TensorDataset(X_tr_t, y_tr_t)
    dl_tr = DataLoader(ds_tr, batch_size=batch_size, shuffle=True, drop_last=False)

    # Only the validation features are needed on the device; metrics are
    # computed from the NumPy labels.
    X_va_t = torch.from_numpy(X_val).to(device)

    def _predict_val():
        """Return validation probabilities from the current weights (eval mode)."""
        model.eval()
        with torch.no_grad():
            return torch.sigmoid(model(X_va_t)).cpu().numpy()

    p_va = None
    for ep in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        for xb, yb in dl_tr:
            opt.zero_grad()
            logits = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward()
            opt.step()
            # Weight the batch mean by batch size to get a per-sample average below.
            total_loss += loss.item() * len(xb)
        total_loss /= len(ds_tr)

        # quick val AUROC/AUPRC each epoch
        p_va = _predict_val()
        try:
            auroc = roc_auc_score(y_val, p_va)
            auprc = average_precision_score(y_val, p_va)
        except ValueError:
            auroc, auprc = np.nan, np.nan

        print(f"  [ep {ep}] train_loss={total_loss:.4f}  val_AUROC={auroc:.4f}  val_AUPRC={auprc:.4f}")

    # The last epoch's probabilities already reflect the final weights; only
    # predict again when no epoch ran (epochs < 1).
    if p_va is None:
        p_va = _predict_val()
    return model, p_va

def metrics(y, p):
    """Return AUROC and AUPRC of scores ``p`` against labels ``y`` as a dict."""
    auroc = roc_auc_score(y, p)
    auprc = average_precision_score(y, p)
    return dict(AUROC=auroc, AUPRC=auprc)


# Out-of-fold fused probabilities; rows never used for validation stay NaN.
oof_fused = np.full(len(tr), np.nan, float)

# Forward-chaining: validate on chunk k, train only on earlier chunks (< k).
# The loop starts at 1, so chunk 0 is never validated and keeps NaN.
for k in range(1, K):
    va_idx = np.where(fold_ids == k)[0]
    tr_pool_idx = np.where(fold_ids < k)[0]

    # HADM-disjoint
    # Drop training rows whose admission also appears in the validation chunk.
    va_hadm = set(tr.loc[va_idx, "hadm_id"])
    mask = ~tr.loc[tr_pool_idx, "hadm_id"].isin(va_hadm)
    tr_idx = tr_pool_idx[mask.to_numpy()]

    if len(tr_idx) == 0 or len(va_idx) == 0:
        print(f"[FusionMLP] Skip chunk {k}: tr={len(tr_idx)} va={len(va_idx)}")
        continue

    print(f"\n[FusionMLP] Training chunk {k}: tr={len(tr_idx)}, va={len(va_idx)}")
    X_tr_fold = X_tr[tr_idx]
    y_tr_fold = y_tr[tr_idx]
    X_va_fold = X_tr[va_idx]
    y_va_fold = y_tr[va_idx]

    model_k, p_va = train_one_mlp(X_tr_fold, y_tr_fold, X_va_fold, y_va_fold,
                                  epochs=8, batch_size=2048, lr=1e-3)

    oof_fused[va_idx] = p_va
    print(f"[FusionMLP] chunk {k} metrics:", metrics(y_va_fold, p_va))

# Evaluate only rows that actually received an out-of-fold prediction.
used = ~np.isnan(oof_fused)
print("\nFUSION-MLP OOF coverage:", used.sum(), "/", len(tr))
y_oof = tr.loc[used, "label6h"].astype(int).to_numpy()
p_oof = oof_fused[used]

print("FUSION-MLP OOF AUROC/AUPRC:",
      roc_auc_score(y_oof, p_oof),
      average_precision_score(y_oof, p_oof))

def sweep_thresholds(y_true, p, grid=None):
    """Tabulate precision, recall and F1 over a grid of decision thresholds.

    A row is predicted positive when ``p >= threshold``.

    Args:
        y_true: Array of binary labels.
        p: NumPy array of predicted probabilities, same length as ``y_true``.
        grid: Iterable of thresholds (list or NumPy array). ``None`` or an
            empty sequence selects the default grid 0.05, 0.10, ..., 0.95.

    Returns:
        DataFrame with columns ``threshold``, ``precision``, ``recall``, ``f1``,
        one row per threshold in grid order.
    """
    # `grid or default` would raise for a multi-element NumPy array (ambiguous
    # truth value), so test for None / emptiness explicitly.
    if grid is None or len(grid) == 0:
        grid = np.linspace(0.05, 0.95, 19)
    rows = []
    for thr in grid:
        yhat = (p >= thr).astype(int)
        rows.append({
            "threshold": float(thr),
            "precision": precision_score(y_true, yhat, zero_division=0),
            "recall":    recall_score(y_true, yhat, zero_division=0),
            "f1":        f1_score(y_true, yhat, zero_division=0),
        })
    return pd.DataFrame(rows)

# Sweep thresholds on the out-of-fold predictions only.
thr_table = sweep_thresholds(y_tr[used], oof_fused[used])
# Best row = highest F1, with ties broken by recall and then precision.
best_row = thr_table.sort_values(["f1","recall","precision"], ascending=False).iloc[0]
BEST_THR = float(best_row["threshold"])

print("\n[FUSION-MLP OOF] threshold sweep:")
print(thr_table)
print(f"\nSelected THR={BEST_THR:.2f} | "
      f"P={best_row['precision']:.3f} R={best_row['recall']:.3f} F1={best_row['f1']:.3f}")


print("\n[FusionMLP] Training final model on full train...")

# The training data is also passed as the "validation" set, so the per-epoch
# val metrics printed during this fit are in-sample.
final_model, _ = train_one_mlp(X_tr, y_tr, X_tr, y_tr,
                               epochs=8, batch_size=2048, lr=1e-3)

# Score the held-out test set with the final model.
final_model.eval()
with torch.no_grad():
    logits_te = final_model(torch.from_numpy(X_te).to(device))
    p_fused_test = torch.sigmoid(logits_te).cpu().numpy()

print("\nFUSION-MLP TEST AUROC/AUPRC:",
      roc_auc_score(y_te, p_fused_test),
      average_precision_score(y_te, p_fused_test))

# Apply the threshold chosen on out-of-fold training predictions to the test set.
yhat = (p_fused_test >= BEST_THR).astype(int)
tn, fp, fn, tp = confusion_matrix(y_te, yhat, labels=[0,1]).ravel()
print("\nTEST @ FUSION-MLP OOF threshold:",
      {"threshold": BEST_THR,
       "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp),
       "precision": precision_score(y_te, yhat, zero_division=0),
       "recall":    recall_score(y_te, yhat, zero_division=0),
       "f1":        f1_score(y_te, yhat, zero_division=0)})

# Save out-of-fold train predictions (only rows that received one).
tr_oof = tr.loc[used, ["subject_id","hadm_id","icustay_id","t","label6h","row_id"]].copy()
tr_oof["p_fused_oof"] = oof_fused[used]
tr_oof.to_csv(os.path.join(OUTDIR, "oof_fused_cb_bert_mlp_train.csv"), index=False)

# Test predictions
te_out = te[["subject_id","hadm_id","icustay_id","t","label6h","row_id"]].copy()
te_out["p_fused_test"] = p_fused_test
te_out.to_csv(os.path.join(OUTDIR, "pred_fused_cb_bert_mlp_test.csv"), index=False)

# Threshold sweep table and the selected operating point (with its OOF scores).
thr_table.to_csv(os.path.join(OUTDIR, "fusion_cb_bert_mlp_oof_thresholds.csv"), index=False)
with open(os.path.join(OUTDIR, "fusion_cb_bert_mlp_selected_threshold.json"), "w") as f:
    json.dump({
        "best_threshold": BEST_THR,
        "oof_precision": float(best_row["precision"]),
        "oof_recall": float(best_row["recall"]),
        "oof_f1": float(best_row["f1"]),
    }, f, indent=2)

print("\nSaved fusion files to:", OUTDIR)