In [None]:
# 1) Load model bundle
import joblib, numpy as np, pandas as pd
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load bundle files that come from a trusted source.
bundle = joblib.load("../models/ovr_sgd_tfidf.joblib")
# Pull the five bundle entries used below into module-level names.
clf, word_vec, char_vec, mlb, cfg = (
    bundle[key] for key in ("clf", "word_vec", "char_vec", "mlb", "cfg")
)
# Per-document token cap; 8000 is the fallback when cfg has no such entry.
MAX_TOKENS = cfg.get("MAX_TOKENS_PER_DOC", 8000)

def _truncate(s, mx=MAX_TOKENS):
    """Return ``s`` as a string limited to its first ``mx`` whitespace-separated tokens.

    The input is converted with ``str()`` first, and the kept tokens are
    re-joined with single spaces.
    """
    tokens = str(s).split()
    return " ".join(tokens[:mx])

def _to_X(texts):
    """Turn raw documents into the feature matrix passed to the classifier.

    Every document is truncated with ``_truncate`` and transformed by
    ``word_vec``. When ``char_vec`` is set, its features are appended
    column-wise and the combined matrix is returned in CSR format;
    otherwise the ``word_vec`` output is returned as-is.
    """
    docs = pd.Series(texts).map(_truncate)
    word_feats = word_vec.transform(docs)
    if char_vec is None:
        return word_feats
    from scipy.sparse import hstack
    char_feats = char_vec.transform(docs)
    return hstack([word_feats, char_feats], format="csr")

def predict_topk(texts, K=5):
    """Return the K highest-probability labels for each input document.

    Args:
        texts: A single string (treated as one document) or an iterable of
            documents. Each document is vectorized with ``_to_X``.
        K: Number of labels to keep per document.

    Returns:
        One list per document, in input order, of ``(code, probability)``
        tuples sorted by descending probability. Codes come from
        ``mlb.classes_``. An empty input yields ``[]``.
    """
    # A bare string is one document, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators work and emptiness can be checked.
    texts = list(texts)
    if not texts:
        # Nothing to score; skip the vectorizer/classifier call entirely.
        return []
    P = clf.predict_proba(_to_X(texts))
    codes = mlb.classes_
    out = []
    # Iterate over the rows actually returned instead of trusting len(texts).
    for probs in P:
        idx = np.argsort(-probs)[:K]
        out.append([(codes[j], float(probs[j])) for j in idx])
    return out

# 2) Quick demo
samples = [
    "Service: MEDICINE\nHistory: chest pain, HTN, DM, hyperlipidemia...",
    "Service: SURGERY\nPost-op day #2, fever, wound infection, antibiotics..."
]
# Score both notes in one call, then print each note's top-5 codes.
demo_preds = predict_topk(samples, K=5)
for case_no, case_preds in enumerate(demo_preds, start=1):
    print(f"\nCase {case_no}:")
    for code, prob in case_preds:
        print(f"  {code}: {prob:.3f}")


In [None]:
# Example: score the first 50 rows of the training set and save their top-K predictions.
df = pd.read_parquet("../data/proc/train_unified.parquet").head(50)
K = 5
# One batched call instead of one vectorize + predict round trip per row.
topk_per_doc = predict_topk(df["text_clean"].tolist(), K=K)
rows = [
    {
        "hadm_id": hadm_id,
        # Encode each document's predictions as "code:prob;code:prob;...".
        "pred_topK": ";".join(f"{c}:{p:.3f}" for c, p in topk),
    }
    for hadm_id, topk in zip(df["hadm_id"], topk_per_doc)
]
# Every other path in this notebook is relative to the parent directory
# (the model itself is loaded from ../models/), so write the CSV there too
# rather than to a models/ folder beside the notebook.
OUT_CSV = "../models/preds_sample_local.csv"
pd.DataFrame(rows).to_csv(OUT_CSV, index=False)
print(f"Saved {OUT_CSV}")


In [None]:
import joblib, numpy as np, pandas as pd
from pathlib import Path

# ========= Load model =========
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load bundle files that come from a trusted source.
bundle = joblib.load("../models/ovr_sgd_tfidf.joblib")
# Pull the five bundle entries used below into module-level names.
clf, word_vec, char_vec, mlb, cfg = (
    bundle[key] for key in ("clf", "word_vec", "char_vec", "mlb", "cfg")
)
# Per-document token cap; 8000 is the fallback when cfg has no such entry.
MAX_TOKENS = cfg.get("MAX_TOKENS_PER_DOC", 8000)

def _truncate(s, mx=MAX_TOKENS):
    """Return ``s`` as a string limited to its first ``mx`` whitespace-separated tokens.

    The input is converted with ``str()`` first, and the kept tokens are
    re-joined with single spaces.
    """
    tokens = str(s).split()
    return " ".join(tokens[:mx])
def _to_X(texts):
    """Turn raw documents into the feature matrix passed to the classifier.

    Every document is truncated with ``_truncate`` and transformed by
    ``word_vec``. When ``char_vec`` is set, its features are appended
    column-wise and the combined matrix is returned in CSR format;
    otherwise the ``word_vec`` output is returned as-is.
    """
    docs = pd.Series(texts).map(_truncate)
    word_feats = word_vec.transform(docs)
    if char_vec is None:
        return word_feats
    from scipy.sparse import hstack
    char_feats = char_vec.transform(docs)
    return hstack([word_feats, char_feats], format="csr")

def predict_topk(texts, K=5):
    """Return the K highest-probability labels for each input document.

    Args:
        texts: A single string (treated as one document) or an iterable of
            documents. Each document is vectorized with ``_to_X``.
        K: Number of labels to keep per document.

    Returns:
        One list per document, in input order, of ``(code, probability)``
        tuples sorted by descending probability. Codes come from
        ``mlb.classes_``. An empty input yields ``[]``.
    """
    # A bare string is one document, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators work and emptiness can be checked.
    texts = list(texts)
    if not texts:
        # Nothing to score; skip the vectorizer/classifier call entirely.
        return []
    P = clf.predict_proba(_to_X(texts))
    codes = mlb.classes_
    out = []
    # Iterate over the rows actually returned instead of trusting len(texts).
    for probs in P:
        idx = np.argsort(-probs)[:K]
        out.append([(codes[j], float(probs[j])) for j in idx])
    return out

# ========= Load the ICD title table =========
PATH_D_ICD = "../data/mimiciv/3.1/hosp/d_icd_diagnoses.csv.gz"  # lives in the hosp folder
d = pd.read_csv(
    PATH_D_ICD,
    compression="gzip",
    usecols=["icd_code", "icd_version", "long_title"],
    # Read codes as text. With dtype inference, all-digit codes such as "4019"
    # can be parsed as numbers, which drops any leading zeros and makes
    # the .strip() call below fail.
    dtype={"icd_code": str},
)
# Map (version, stripped code) -> long title, e.g. (9, "4019") or (10, "I10").
title_map = {
    (int(v), c.strip()): lt
    for c, v, lt in zip(d["icd_code"], d["icd_version"], d["long_title"])
}

def icd_name_from_prefixed(code_with_prefix: str) -> str:
    """Look up the long title for a version-prefixed ICD code.

    Args:
        code_with_prefix: Label of the form ``"<version>-<code>"``, e.g.
            ``"9-4019"`` -> ``(9, "4019")`` or ``"10-I10"`` -> ``(10, "I10")``.

    Returns:
        The matching title from ``title_map``, or ``"(unknown title)"`` when
        the label is not a string, is malformed, or has no entry.
    """
    unknown = "(unknown title)"
    if not isinstance(code_with_prefix, str):
        return unknown
    # Split on the first "-" only; the code part may itself contain dashes.
    ver_str, sep, code = code_with_prefix.partition("-")
    if not sep:
        return unknown
    # Only a non-numeric version is treated as "unknown". Other errors (for
    # example title_map not being defined yet) are allowed to surface.
    try:
        version = int(ver_str)
    except ValueError:
        return unknown
    # title_map keys are built from stripped codes, so strip here as well.
    return title_map.get((version, code.strip()), unknown)

# ========= Take 10 samples and print the predictions (with disease names) =========
df = pd.read_parquet("../data/proc/train_unified.parquet").sample(10, random_state=42)
# One batched call instead of one vectorize + predict round trip per row.
sample_preds = predict_topk(df["text_clean"].tolist(), K=5)
for hadm_id, subject_id, preds in zip(df["hadm_id"], df["subject_id"], sample_preds):
    print(f"\nHADM {hadm_id} | subj {subject_id}")
    for c, p in preds:
        print(f"  {c}: {p:.3f}  —  {icd_name_from_prefixed(c)}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



HADM 25535697 | subj 12188716
  9-4019: 0.329
  9-V5861: 0.099
  9-2724: 0.096
  9-V1582: 0.064
  9-7840: 0.058

HADM 20296734 | subj 14826102
  9-2859: 0.129
  9-78701: 0.111
  9-78903: 0.106
  9-78900: 0.104
  9-49390: 0.084

HADM 26826685 | subj 19319976
  9-V5867: 0.619
  9-4019: 0.457
  9-2761: 0.438
  9-29680: 0.386
  9-25000: 0.338

HADM 26002726 | subj 12139397
  10-D62: 0.190
  10-Z87891: 0.148
  10-I10: 0.112
  10-Y92239: 0.085
  10-N179: 0.084

HADM 28782684 | subj 17193228
  9-4019: 0.518
  9-53081: 0.403
  9-2724: 0.258
  9-99592: 0.217
  9-V5867: 0.184

HADM 26588327 | subj 17355488
  9-V5811: 0.818
  9-20500: 0.686
  9-30000: 0.595
  9-53081: 0.560
  9-V160: 0.399

HADM 29494996 | subj 13555204
  10-I10: 0.614
  10-Z87891: 0.276
  10-E785: 0.186
  10-K219: 0.181
  10-F17210: 0.111

HADM 24605015 | subj 14312872
  10-Z5111: 0.863
  10-Z888: 0.329
  10-T451X5A: 0.276
  10-F419: 0.159
  10-Z87891: 0.154

HADM 21787279 | subj 12107161
  9-3051: 0.277
  9-4019: 0.166
  9-E84