In [1]:
# Kaggle config (KHÔNG dò đường, dùng đúng tên dataset)
from pathlib import Path
import pandas as pd

# ---- Đặt đúng tên dataset như bạn đã thêm vào notebook ----
BASE_INPUT = Path("/kaggle/input/mimic-iv-proc-revita-2025-09-222")   # <-- tên đúng như bạn nói
PROC = BASE_INPUT                                     # file unified ở GỐC dataset

# ---- File unified ----
UNIFIED_PQT = PROC / "train_unified.parquet"
UNIFIED_CSV = PROC / "train_unified.csv"

# ==== Tham số bạn đang dùng ====
MIN_LABEL_FREQ = 100
MAX_LABELS = 3000
LIMIT_TRAIN = 200000

# Vector hoá
N_FEATURES = 2**18
USE_CHAR_NGRAMS = False
NGRAM_WORD = (1, 2)
NGRAM_CHAR = (3, 5)
MAX_TOKENS_PER_DOC = 4000

# Train loop
BATCH_SIZE = 512
EPOCHS = 1
LOG_EVERY = 20
SEED = 42

# Checkpoint/model (ghi được ở Kaggle)
CKPT_PATH = Path("/kaggle/working") / "ovr_lazy_sgd.joblib"

# ---- Load unified ----
if UNIFIED_PQT.exists():
    df = pd.read_parquet(UNIFIED_PQT)
elif UNIFIED_CSV.exists():
    df = pd.read_csv(UNIFIED_CSV)
else:
    raise FileNotFoundError("Không thấy train_unified.{parquet|csv} ở gốc dataset 'mimic iv dataset'.")

print({
    "dataset_dir": str(BASE_INPUT),
    "CKPT_PATH": str(CKPT_PATH),
    "Loaded": df.shape,
    "MIN_LABEL_FREQ": MIN_LABEL_FREQ,
    "MAX_LABELS": MAX_LABELS,
    "N_FEATURES": N_FEATURES,
    "BATCH_SIZE": BATCH_SIZE,
})


{'dataset_dir': '/kaggle/input/mimic-iv-proc-revita-2025-09-222', 'CKPT_PATH': '/kaggle/working/ovr_lazy_sgd.joblib', 'Loaded': (331062, 6), 'MIN_LABEL_FREQ': 100, 'MAX_LABELS': 3000, 'N_FEATURES': 262144, 'BATCH_SIZE': 512}


In [2]:
# ƯU TIÊN tái dùng file tần suất đã xuất sẵn; nếu không có thì tính từ df
import pandas as pd

KEEP_LABELS = None
freq_csv = (PROC/"top_icd_coverage.csv")
if not freq_csv.exists():
    freq_csv = (PROC/"icd_hadm_freq.csv")

if freq_csv.exists():
    freq = pd.read_csv(freq_csv)
    # Chuẩn hoá tên cột
    if "icd_full" not in freq.columns:
        if freq.columns.tolist() == ["index","hadm_freq"]:
            freq = freq.rename(columns={"index":"icd_full"})
        else:
            freq.columns = ["icd_full","hadm_freq"]
    keep_df = (freq[freq["hadm_freq"] >= MIN_LABEL_FREQ]
               .sort_values("hadm_freq", ascending=False)
               .head(MAX_LABELS))
    KEEP_LABELS = set(keep_df["icd_full"].tolist())
    print(f"Reuse nhãn từ {freq_csv.name}: {len(KEEP_LABELS)} labels")
else:
    # fallback: lấy từ chính df unified
    from collections import Counter
    codes = df["icd_codes"].str.split(";")
    cnt = Counter(c for row in codes for c in row)
    keep = [c for c,n in cnt.items() if n >= MIN_LABEL_FREQ]
    keep = sorted(keep, key=lambda c: cnt[c], reverse=True)[:MAX_LABELS]
    KEEP_LABELS = set(keep)
    print(f"Tính nhãn từ unified: {len(KEEP_LABELS)} labels")


Reuse nhãn từ top_icd_coverage.csv: 3000 labels


In [3]:
from sklearn.model_selection import GroupShuffleSplit

codes = df["icd_codes"].str.split(";")
mask = codes.map(lambda L: any(c in KEEP_LABELS for c in L))
df = df.loc[mask].copy()
df["labels"] = codes.map(lambda L: [c for c in L if c in KEEP_LABELS])

# thử nhanh (nếu muốn)
if LIMIT_TRAIN is not None and len(df) > LIMIT_TRAIN:
    df = df.sample(LIMIT_TRAIN, random_state=SEED).reset_index(drop=True)

# split theo subject_id (không rò rỉ bệnh nhân)
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=SEED)
idx_tr, idx_te = next(gss1.split(df, groups=df["subject_id"]))
train_val = df.iloc[idx_tr].reset_index(drop=True)
test = df.iloc[idx_te].reset_index(drop=True)

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.15/(1-0.15), random_state=SEED)
idx_tr2, idx_va = next(gss2.split(train_val, groups=train_val["subject_id"]))
train = train_val.iloc[idx_tr2].reset_index(drop=True)
val   = train_val.iloc[idx_va].reset_index(drop=True)

print("Split sizes:", len(train), len(val), len(test))


Split sizes: 69 16 15


In [6]:
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import hstack

def truncate_tokens(text, mx=MAX_TOKENS_PER_DOC):
    return " ".join(str(text).split()[:mx])

hv_word = HashingVectorizer(
    n_features=N_FEATURES,
    ngram_range=NGRAM_WORD,
    analyzer="word",
    alternate_sign=False,
    norm="l2",
    dtype=np.float32,
    preprocessor=lambda t: truncate_tokens(t, MAX_TOKENS_PER_DOC),
)

hv_char = None
if USE_CHAR_NGRAMS:
    hv_char = HashingVectorizer(
        n_features=N_FEATURES//2,
        ngram_range=NGRAM_CHAR,
        analyzer="char",
        alternate_sign=False,
        norm="l2",
        dtype=np.float32,
        preprocessor=lambda t: truncate_tokens(t, MAX_TOKENS_PER_DOC),
    )

def X_from_text(series):
    Xw = hv_word.transform(series)
    if hv_char is None:
        return Xw.tocsr()
    Xc = hv_char.transform(series)
    return hstack([Xw, Xc], format="csr")


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

mlb = MultiLabelBinarizer()
_ = mlb.fit(train["labels"])   # chỉ để cố định trật tự nhãn

def eval_f1k_pk_streaming(model, mlb, df_eval, K=5, batch_size=512):
    tp=fp=fn=correct_k_total=total_preds=0
    n=len(df_eval)
    for i in range(0, n, batch_size):
        j=min(i+batch_size, n)
        Xb = X_from_text(df_eval["text_clean"].iloc[i:j])
        Yb = mlb.transform(df_eval["labels"].iloc[i:j]).astype(np.int8)
        Pb = model.predict_proba(Xb)
        topk_idx = np.argsort(-Pb, axis=1)[:, :K]
        pred_b = np.zeros_like(Pb, dtype=np.int8)
        rows = np.arange(Pb.shape[0])[:, None]
        pred_b[rows, topk_idx] = 1
        tp += int((pred_b & Yb).sum())
        fp += int((pred_b & (1 - Yb)).sum())
        fn += int(((1 - pred_b) & Yb).sum())
        correct_k_total += int((pred_b & Yb).sum())
        total_preds += pred_b.shape[0] * K
        del Xb, Yb, Pb, pred_b
    prec = tp / (tp + fp + 1e-12)
    rec  = tp / (tp + fn + 1e-12)
    f1k  = 2 * prec * rec / (prec + rec + 1e-12)
    pk   = correct_k_total / (total_preds + 1e-12)
    return {"f1@{}".format(K): f1k, "p@{}".format(K): pk}

# đánh giá chỉ trên các nhãn đã được huấn luyện (phòng case LIMIT_TRAIN nhỏ)
def eval_on_trained_labels(model, mlb, df_eval, K=5, batch_size=512):
    trained_idx = sorted(getattr(model, "trained_label_indices", range(len(mlb.classes_))))
    trained_labels = [mlb.classes_[i] for i in trained_idx]
    eval_mlb = MultiLabelBinarizer(classes=trained_labels)
    eval_mlb.fit([trained_labels])
    tp=fp=fn=correct_k_total=total_preds=0
    n=len(df_eval)
    for i in range(0, n, batch_size):
        j=min(i+batch_size, n)
        Xb = X_from_text(df_eval["text_clean"].iloc[i:j])
        Yb = eval_mlb.transform(df_eval["labels"].iloc[i:j]).astype(np.int8)
        Pb_full = model.predict_proba(Xb)
        Pb = Pb_full[:, trained_idx]
        topk_idx = np.argsort(-Pb, axis=1)[:, :K]
        pred_b = np.zeros_like(Pb, dtype=np.int8)
        rows = np.arange(Pb.shape[0])[:, None]
        pred_b[rows, topk_idx] = 1
        tp += int((pred_b & Yb).sum())
        fp += int((pred_b & (1 - Yb)).sum())
        fn += int(((1 - pred_b) & Yb).sum())
        correct_k_total += int((pred_b & Yb).sum())
        total_preds += pred_b.shape[0] * K
        del Xb, Yb, Pb_full, Pb, pred_b
    prec = tp / (tp + fp + 1e-12)
    rec  = tp / (tp + fn + 1e-12)
    f1k  = 2 * prec * rec / (prec + rec + 1e-12)
    pk   = correct_k_total / (total_preds + 1e-12)
    return {"f1@{}".format(K): f1k, "p@{}".format(K): pk}


In [9]:
import numpy as np, time, joblib, gc
from sklearn.linear_model import SGDClassifier

class LazyOVR:
    """
    - Mỗi nhãn là 1 SGDClassifier nhị phân, chỉ tạo khi nhãn xuất hiện trong batch.
    - predict_proba: chỉ tính cho head đã train; nhãn chưa train -> 0.0.
    """
    def __init__(self, n_labels, alpha=1e-5, random_state=42):
        self.n_labels = n_labels
        self.alpha = alpha
        self.random_state = random_state
        self.heads = {}           # k -> SGDClassifier
        self._init_done = set()   # k đã init
        self.trained_label_indices = set()

    def _get_head(self, k):
        if k not in self.heads:
            self.heads[k] = SGDClassifier(loss="log_loss", penalty="l2", alpha=self.alpha,
                                          learning_rate="optimal", max_iter=1, tol=None,
                                          random_state=self.random_state)
        return self.heads[k]

    def partial_fit(self, X, Y):
        # Y shape: (n_samples, n_labels)
        active = np.where(Y.sum(axis=0) > 0)[0]
        if active.size == 0:
            return self
        classes = np.array([0,1], dtype=np.int8)
        for k in active:
            yk = Y[:, k].astype(np.int8)
            head = self._get_head(k)
            if k not in self._init_done:
                head.partial_fit(X, yk, classes=classes)
                self._init_done.add(k)
            else:
                head.partial_fit(X, yk)
            self.trained_label_indices.add(k)
        return self

    def predict_proba(self, X):
        n = X.shape[0]
        P = np.zeros((n, self.n_labels), dtype=np.float32)
        for k, head in self.heads.items():
            P[:, k] = head.predict_proba(X)[:, 1]
        return P

# ===== Train loop =====
rng = np.random.default_rng(SEED)
def batches_idx(n, bs):
    idx = np.arange(n); rng.shuffle(idx)
    for i in range(0, n, bs):
        yield idx[i:i+bs]

n_labels = len(mlb.classes_)
model = LazyOVR(n_labels=n_labels, alpha=1e-5, random_state=SEED)

start = time.time(); seen = 0
for ep in range(EPOCHS):
    for bi, idx in enumerate(batches_idx(len(train), BATCH_SIZE), 1):
        X_b = X_from_text(train.loc[idx, "text_clean"])
        Y_b = mlb.transform(train.loc[idx, "labels"]).astype(np.int8)

        model.partial_fit(X_b, Y_b)
        seen += len(idx)

        del X_b, Y_b; gc.collect()

        if bi % LOG_EVERY == 0:
            m = eval_on_trained_labels(model, mlb, val, K=5, batch_size=512)
            print(f"[ep {ep+1} | batch {bi}] seen={seen}  F1@5={m['f1@5']:.4f}  P@5={m['p@5']:.4f}  heads={len(model.heads)}  t={time.time()-start:.1f}s")

print("Done in {:.1f}s".format(time.time()-start))

joblib.dump({"heads": model.heads, "mlb": mlb,
             "cfg": {"N_FEATURES": N_FEATURES, "NGRAM_WORD": NGRAM_WORD,
                     "USE_CHAR_NGRAMS": USE_CHAR_NGRAMS,
                     "MAX_TOKENS_PER_DOC": MAX_TOKENS_PER_DOC}},
            CKPT_PATH)
print("Saved:", CKPT_PATH)


Done in 5.5s
Saved: /kaggle/working/ovr_lazy_sgd.joblib


In [10]:
import numpy as np, time, joblib, gc
from sklearn.linear_model import SGDClassifier

class LazyOVR:
    """
    - Mỗi nhãn là 1 SGDClassifier nhị phân, chỉ tạo khi nhãn xuất hiện trong batch.
    - predict_proba: chỉ tính cho head đã train; nhãn chưa train -> 0.0.
    """
    def __init__(self, n_labels, alpha=1e-5, random_state=42):
        self.n_labels = n_labels
        self.alpha = alpha
        self.random_state = random_state
        self.heads = {}           # k -> SGDClassifier
        self._init_done = set()   # k đã init
        self.trained_label_indices = set()

    def _get_head(self, k):
        if k not in self.heads:
            self.heads[k] = SGDClassifier(loss="log_loss", penalty="l2", alpha=self.alpha,
                                          learning_rate="optimal", max_iter=1, tol=None,
                                          random_state=self.random_state)
        return self.heads[k]

    def partial_fit(self, X, Y):
        # Y shape: (n_samples, n_labels)
        active = np.where(Y.sum(axis=0) > 0)[0]
        if active.size == 0:
            return self
        classes = np.array([0,1], dtype=np.int8)
        for k in active:
            yk = Y[:, k].astype(np.int8)
            head = self._get_head(k)
            if k not in self._init_done:
                head.partial_fit(X, yk, classes=classes)
                self._init_done.add(k)
            else:
                head.partial_fit(X, yk)
            self.trained_label_indices.add(k)
        return self

    def predict_proba(self, X):
        n = X.shape[0]
        P = np.zeros((n, self.n_labels), dtype=np.float32)
        for k, head in self.heads.items():
            P[:, k] = head.predict_proba(X)[:, 1]
        return P

# ===== Train loop =====
rng = np.random.default_rng(SEED)
def batches_idx(n, bs):
    idx = np.arange(n); rng.shuffle(idx)
    for i in range(0, n, bs):
        yield idx[i:i+bs]

n_labels = len(mlb.classes_)
model = LazyOVR(n_labels=n_labels, alpha=1e-5, random_state=SEED)

start = time.time(); seen = 0
for ep in range(EPOCHS):
    for bi, idx in enumerate(batches_idx(len(train), BATCH_SIZE), 1):
        X_b = X_from_text(train.loc[idx, "text_clean"])
        Y_b = mlb.transform(train.loc[idx, "labels"]).astype(np.int8)

        model.partial_fit(X_b, Y_b)
        seen += len(idx)

        del X_b, Y_b; gc.collect()

        if bi % LOG_EVERY == 0:
            m = eval_on_trained_labels(model, mlb, val, K=5, batch_size=512)
            print(f"[ep {ep+1} | batch {bi}] seen={seen}  F1@5={m['f1@5']:.4f}  P@5={m['p@5']:.4f}  heads={len(model.heads)}  t={time.time()-start:.1f}s")

print("Done in {:.1f}s".format(time.time()-start))

joblib.dump({"heads": model.heads, "mlb": mlb,
             "cfg": {"N_FEATURES": N_FEATURES, "NGRAM_WORD": NGRAM_WORD,
                     "USE_CHAR_NGRAMS": USE_CHAR_NGRAMS,
                     "MAX_TOKENS_PER_DOC": MAX_TOKENS_PER_DOC}},
            CKPT_PATH)
print("Saved:", CKPT_PATH)


Done in 6.1s
Saved: /kaggle/working/ovr_lazy_sgd.joblib


In [11]:
import numpy as np, joblib

def predict_topk(texts, K=5):
    Xq = X_from_text(pd.Series(texts))
    P  = model.predict_proba(Xq)
    codes = mlb.classes_
    out = []
    for i in range(len(texts)):
        idx = np.argsort(-P[i])[:K]
        out.append([(codes[j], float(P[i,j])) for j in idx])
    return out

demo = [
    "Service: MEDICINE\nHistory: chest pain, HTN, DM, hyperlipidemia...",
    "Service: SURGERY\nPost-op day #2, fever, wound infection, antibiotics..."
]
for i, preds in enumerate(predict_topk(demo, K=5), 1):
    print(f"\nCase {i}:")
    for code, prob in preds:
        print(f"  {code}: {prob:.3f}")

# Lưu lại lần nữa vào working để đảm bảo artifact xuất hiện
joblib.dump({"heads": model.heads, "mlb": mlb}, CKPT_PATH)
print("Model @", CKPT_PATH)



Case 1:
  9-4019: 0.845
  9-42731: 0.745
  9-42789: 0.693
  9-311: 0.671
  9-27651: 0.625

Case 2:
  9-04111: 0.809
  9-99592: 0.790
  9-78552: 0.790
  9-0389: 0.790
  9-V1254: 0.763
Model @ /kaggle/working/ovr_lazy_sgd.joblib


In [12]:
# Xuất top-5 dự đoán cho một batch test nhỏ
out = []
K = 5
for i in range(min(50, len(test))):
    s = test.iloc[i]
    top5 = predict_topk([s["text_clean"]], K=K)[0]
    out.append({
        "subject_id": s["subject_id"],
        "hadm_id": s["hadm_id"],
        "gold": ";".join(s["labels"]),
        "pred_top5": ";".join([f"{c}:{p:.3f}" for c,p in top5])
    })
pd.DataFrame(out).to_csv("/kaggle/working/preds_sample.csv", index=False)
print("Saved /kaggle/working/preds_sample.csv")


Saved /kaggle/working/preds_sample.csv


In [13]:
# ĐÁNH GIÁ NHANH: Top-K có trúng bao nhiêu mã thật trên 100 ca?
import numpy as np, pandas as pd
from pathlib import Path

K = 5                     # Top-K cần kiểm tra
N_EVAL = 100              # số ca để đánh giá nhanh
BATCH = 256               # batch suy luận để tiết kiệm RAM
OUT_CSV = Path("/kaggle/working/eval_topk_sample.csv")

# lấy 100 ca ngẫu nhiên (hoặc ít hơn nếu test nhỏ)
eval_df = test.sample(min(N_EVAL, len(test)), random_state=SEED).reset_index(drop=True)

codes_all = mlb.classes_
hits_per_case = []
rows_out = []

for i in range(0, len(eval_df), BATCH):
    j = min(i + BATCH, len(eval_df))
    Xb = X_from_text(eval_df.loc[i:j-1, "text_clean"])
    Pb = model.predict_proba(Xb)                 # (batch, n_labels)
    topk_idx = np.argsort(-Pb, axis=1)[:, :K]    # chỉ số nhãn top-K
    for r in range(j - i):
        gold = set(eval_df.at[i+r, "labels"])    # nhãn thật (list -> set)
        pred_idx = topk_idx[r].tolist()
        pred_codes = [codes_all[t] for t in pred_idx]
        # đếm số mã trùng giữa Top-K và gold
        hit = len(gold.intersection(pred_codes))
        hits_per_case.append(hit)
        rows_out.append({
            "subject_id": eval_df.at[i+r, "subject_id"],
            "hadm_id": eval_df.at[i+r, "hadm_id"],
            "hits@{}".format(K): hit,
            "gold_codes": ";".join(sorted(gold)),
            "pred_top{}".format(K): ";".join(pred_codes),
            # (tuỳ chọn) kèm xác suất cho dễ soi
            "pred_top{}_probs".format(K): ";".join([f"{float(Pb[r, t]):.3f}" for t in pred_idx])
        })

# TỔNG KẾT
hits_arr = np.array(hits_per_case)
hit_rate = (hits_arr > 0).mean()          # % ca có ít nhất 1 mã đúng trong Top-K
avg_hits = hits_arr.mean()                 # trung bình số mã đúng trong Top-K
print({
    "K": K,
    "n_cases": len(eval_df),
    "hit_rate@K (>=1 đúng)": round(float(hit_rate), 4),
    "avg_hits@K (trung bình số mã đúng)": round(float(avg_hits), 4),
    "cases_0hit": int((hits_arr==0).sum()),
    "cases_1+": int((hits_arr>0).sum()),
})

# LƯU chi tiết để xem ngoài
df_out = pd.DataFrame(rows_out)
df_out.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)

# IN 5 ví dụ: 3 ca có trúng & 2 ca trượt để bạn quan sát
ok_idx = np.where(hits_arr>0)[0][:3].tolist()
ko_idx = np.where(hits_arr==0)[0][:2].tolist()
show_idx = ok_idx + ko_idx
print("\n=== Ví dụ nhanh ===")
for idx in show_idx:
    row = df_out.iloc[idx]
    print(f"- hadm_id={row['hadm_id']}  hits@{K}={row[f'hits@{K}']}")
    print(f"  gold: {row['gold_codes']}")
    print(f"  pred: {row[f'pred_top{K}']}")
    print(f"  prob: {row[f'pred_top{K}_probs']}")


{'K': 5, 'n_cases': 15, 'hit_rate@K (>=1 đúng)': 0.2667, 'avg_hits@K (trung bình số mã đúng)': 0.6, 'cases_0hit': 11, 'cases_1+': 4}
Saved: /kaggle/working/eval_topk_sample.csv

=== Ví dụ nhanh ===
- hadm_id=24345926  hits@5=2
  gold: 10-C787;10-E039;10-E8339;10-E876;10-F329;10-F419;10-G893;10-K219;10-K3184;10-M1990;10-R110;10-Z170;10-Z7982;10-Z853;10-Z8673;10-Z934
  pred: 10-Z7902;10-I480;10-E8342;10-K219;10-E8339
  prob: 1.000;0.999;0.999;0.996;0.995
- hadm_id=23709687  hits@5=4
  gold: 10-B3781;10-D62;10-E039;10-E440;10-E785;10-E8342;10-E8351;10-F17210;10-G43909;10-G4700;10-I10;10-I2510;10-I714;10-I739;10-J449;10-K219;10-K3189;10-K6389;10-K922;10-L89150;10-M62838;10-N179;10-N390;10-R911;10-Z6821;10-Z7902
  pred: 10-Z7902;10-K219;10-N179;10-I480;10-E8342
  prob: 1.000;0.999;0.997;0.995;0.994
- hadm_id=25140310  hits@5=2
  gold: 10-G4733;10-I10;10-I350;10-I480;10-J398;10-J40;10-K219;10-M160;10-M170;10-M479;10-M8580;10-R911;10-Z85828
  pred: 10-E8342;10-I480;10-Z7902;10-K219;10-N179
  