In [1]:
from google.colab import drive

# If the drive was mounted before, or you hit a "mountpoint not empty" error,
# unmount first by uncommenting the next line:
# drive.flush_and_unmount()

drive.mount('/content/drive', force_remount=True)  # pick the correct Google account when the popup appears


Mounted at /content/drive


In [2]:
from pathlib import Path
# ==== CONFIG ====
# --- input / output locations ---
BASE = Path("/content/drive/MyDrive")   # change if your Drive layout differs
HOSP = BASE/"mimiciv/3.1/hosp"
NOTE = BASE/"mimic-iv-note/2.2/note"
OUT_DIR = BASE/"proc"
OUT_DIR.mkdir(parents=True, exist_ok=True)      # create the output folder on first run
OUT_PARQUET = OUT_DIR/"train_unified.parquet"   # final exported file
OUT_CSV     = OUT_DIR/"train_unified.csv"       # CSV counterpart of the export

# --- tunable parameters ---
MIN_FREQ = 50          # keep only ICD codes with frequency >= MIN_FREQ
LIMIT_HADM = None      # e.g. 20000 to keep only 20k admissions (None = keep all)
SEED = 42              # seed used when random sampling is needed
TEXT_FROM_SERVICE_ONLY = True   # ONLY trim text to start at 'Service:' (no other section is cut)
MAX_CHARS = 200_000    # soft cap on text length in characters (keeps RAM/IO safe)


In [3]:
import pandas as pd, numpy as np, re, random
import pyarrow as pa, pyarrow.parquet as pq

# Seed the stdlib RNG so the optional LIMIT_HADM subsample (random.sample) is reproducible.
random.seed(SEED)

# 1) Demographics + age at admission
adm = pd.read_csv(HOSP/"admissions.csv.gz",
                  usecols=["subject_id","hadm_id","admittime"], parse_dates=["admittime"])

# Header names are resolved case-insensitively, then renamed to the canonical
# lowercase names that the merge and the column selection below rely on.
hdr_pat = pd.read_csv(HOSP/"patients.csv.gz", nrows=0).columns
lkp = {h.lower(): h for h in hdr_pat}
pat_cols = {lkp[c]: c for c in ("subject_id", "anchor_age", "anchor_year")}
gender_src = lkp.get("gender") or lkp.get("sex")   # accept either spelling of the column
if gender_src is not None:
    pat_cols[gender_src] = "gender"
# Only request columns that actually exist: passing None inside usecols makes
# read_csv fail, which previously made the "U" fallback unreachable.
pat = pd.read_csv(HOSP/"patients.csv.gz", usecols=list(pat_cols)).rename(columns=pat_cols)
if "gender" not in pat.columns:
    pat["gender"] = "U"   # no gender/sex column in the file -> mark as unknown

adm_pat = adm.merge(pat, on="subject_id", how="left")
# Age at admission = anchor_age shifted by the number of years between the
# admission year and anchor_year, clamped to the range [0, 120].
adm_pat["age_at_admit"] = (
    adm_pat["anchor_age"] + (adm_pat["admittime"].dt.year - adm_pat["anchor_year"])
).clip(lower=0, upper=120)
dem = adm_pat[["subject_id","hadm_id","gender","age_at_admit"]]

# 2) Select the labels to keep, based on how many admissions carry each code.
#    Reads the per-code admission counts from proc/icd_hadm_freq.csv.
icd_counts = pd.read_csv(OUT_DIR/"icd_hadm_freq.csv")
frequent_rows = icd_counts[icd_counts["hadm_freq"].ge(MIN_FREQ)]
keep_codes = set(frequent_rows["icd_full"].astype(str))

# 3) hadm_id -> list of icd_full codes (streamed in chunks to bound memory)
hadm2codes = {}
for ch in pd.read_csv(HOSP/"diagnoses_icd.csv.gz",
                      usecols=["hadm_id","icd_code","icd_version"],
                      # Read codes as text: with inferred dtypes an all-digit run of
                      # codes can be parsed as integers, dropping leading zeros
                      # ("0389" -> 389) so the code no longer matches keep_codes.
                      dtype={"icd_code": str},
                      chunksize=200_000):
    ch = ch.drop_duplicates(["hadm_id","icd_code","icd_version"])
    # Same "<version>-<code>" format as the icd_full values in keep_codes.
    ch["icd_full"] = ch["icd_version"].astype(str)+"-"+ch["icd_code"].astype(str)
    ch = ch[ch["icd_full"].isin(keep_codes)]
    # An admission can span two chunks, so extend rather than overwrite its list.
    for h, codes in ch.groupby("hadm_id")["icd_full"]:
        hadm2codes.setdefault(int(h), []).extend(codes.tolist())

# Optional reproducible subsample of admissions (uses the RNG seeded with SEED).
all_hadm = list(hadm2codes.keys())
if LIMIT_HADM is not None and len(all_hadm) > LIMIT_HADM:
    all_hadm = random.sample(all_hadm, LIMIT_HADM)
keep_hadm = set(all_hadm)

# 4) Locate the discharge-notes file and resolve its text column
plain_notes = NOTE/"discharge.csv"
# Prefer the uncompressed file when it is present, otherwise use the .gz one.
note_path = plain_notes if plain_notes.exists() else NOTE/"discharge.csv.gz"

# Read only the header row and look the text column up case-insensitively.
hdr = pd.read_csv(note_path, nrows=0).columns
lk = {h.lower(): h for h in hdr}
text_col = lk.get("text") or lk.get("note_text")
assert text_col, f"Không thấy cột text/note_text trong {note_path}"

def keep_from_service(t:str)->str:
    """Return the note text starting at its first 'Service:' marker.

    The marker is matched case-insensitively as the whole word ``service``,
    optionally followed by whitespace, then a colon. If no marker is found the
    text is kept from its beginning. In both cases the result is truncated to
    MAX_CHARS characters. Any non-string input yields an empty string.
    """
    if isinstance(t, str):
        hit = re.search(r'\bservice\s*:', t, flags=re.I)
        start = 0 if hit is None else hit.start()
        return t[start:][:MAX_CHARS]
    return ""

# Output state shared with write_batch: Parquet is tried first, CSV is the fallback.
writer = None
use_parquet = True
# Remove outputs of a previous run so this run starts from empty files
# (the CSV fallback writes in append mode).
for stale_output in (OUT_PARQUET, OUT_CSV):
    if stale_output.exists():
        stale_output.unlink()

def write_batch(df):
    """Append one batch of rows to the output file.

    Parquet (OUT_PARQUET, snappy) is the primary target. The schema is fixed by
    the first batch and every later batch is converted to that schema, so a
    dtype drift between batches does not break the writer.

    CSV (OUT_CSV, append mode) is used only when Parquet cannot be started at
    all, i.e. when the very first Parquet write fails. Once Parquet rows have
    been written, any later failure is re-raised instead of silently switching
    to CSV, which would leave the dataset split across two files.

    Uses and updates the module-level ``writer`` and ``use_parquet``.
    Empty frames are ignored.
    """
    global writer, use_parquet
    if df.empty:
        return
    if use_parquet:
        try:
            if writer is None:
                tbl = pa.Table.from_pandas(df, preserve_index=False)
                first_writer = pq.ParquetWriter(OUT_PARQUET, tbl.schema, compression="snappy")
                try:
                    first_writer.write_table(tbl)
                except Exception:
                    first_writer.close()
                    raise
                # Publish the writer only after the first write succeeded, so
                # "writer is not None" means the Parquet file holds real rows.
                writer = first_writer
            else:
                tbl = pa.Table.from_pandas(df, schema=writer.schema, preserve_index=False)
                writer.write_table(tbl)
            return
        except Exception as exc:
            if writer is not None:
                # Part of the data is already in the Parquet file: fail loudly.
                raise
            print(f"Parquet output unavailable ({exc!r}); falling back to CSV.")
            use_parquet = False
            # Do not leave an unusable Parquet file behind for later readers.
            if OUT_PARQUET.exists():
                OUT_PARQUET.unlink()
    df.to_csv(OUT_CSV, mode="a", header=not OUT_CSV.exists(), index=False)

# 5) Stream notes -> join demographics -> trim to 'Service:' -> write
batch, BATCH_ROWS = [], 10_000
pending_rows = 0   # rows buffered in `batch` and not yet written
try:
    for chunk in pd.read_csv(note_path, usecols=["subject_id","hadm_id", text_col],
                             chunksize=100_000, low_memory=True):
        # Vectorised membership test; unlike int(x) it does not crash on a missing hadm_id.
        chunk = chunk.loc[chunk["hadm_id"].isin(keep_hadm)]
        if chunk.empty: continue
        # No missing ids remain after the filter, so an integer dtype is safe and
        # keeps the column type identical from batch to batch.
        chunk = chunk.assign(hadm_id=chunk["hadm_id"].astype("int64"))

        chunk = chunk.merge(dem, on=["subject_id","hadm_id"], how="left")
        raw_text = chunk[text_col]
        txt = raw_text.astype(str)
        # Test for missing values BEFORE the string cast (afterwards NaN is the
        # string "nan"), and also drop whitespace-only notes.
        mask = raw_text.notna() & txt.str.strip().ne("")
        if not mask.any(): continue

        chunk = chunk.loc[mask, ["subject_id","hadm_id","gender","age_at_admit", text_col]].copy()
        if TEXT_FROM_SERVICE_ONLY:
            chunk["text_clean"] = txt.loc[mask].map(keep_from_service)
        else:
            chunk["text_clean"] = txt.loc[mask].str.slice(0, MAX_CHARS)

        # Attach labels: unique kept codes of the admission, sorted and ';'-joined.
        chunk["icd_codes"] = chunk["hadm_id"].map(lambda h: ";".join(sorted(set(hadm2codes.get(int(h), [])))))
        out = chunk[(chunk["text_clean"].str.len()>0) & (chunk["icd_codes"].str.len()>0)]
        out = out[["subject_id","hadm_id","gender","age_at_admit","text_clean","icd_codes"]]

        batch.append(out)
        pending_rows += len(out)
        if pending_rows >= BATCH_ROWS:
            write_batch(pd.concat(batch, ignore_index=True))
            batch.clear()
            pending_rows = 0

    if batch: write_batch(pd.concat(batch, ignore_index=True))
finally:
    # Always close the writer, even if the loop raised part-way through.
    if writer is not None: writer.close()

print("DONE →", OUT_PARQUET if use_parquet and OUT_PARQUET.exists() else OUT_CSV)


DONE → /content/drive/MyDrive/proc/train_unified.parquet


In [1]:
import pandas as pd
from pathlib import Path

# Adjust if needed: reuse BASE when it is already defined in this kernel,
# otherwise fall back to the default Drive location.
try:
    BASE
except NameError:
    BASE = Path("/content/drive/MyDrive")
PROC = BASE/"proc"
PQT = PROC/"train_unified.parquet"
CSV = PROC/"train_unified.csv"

# Prefer the Parquet export; use the CSV export when Parquet is absent.
if PQT.exists():
    df = pd.read_parquet(PQT)
elif CSV.exists():
    df = pd.read_csv(CSV)
else:
    raise FileNotFoundError("Chưa thấy train_unified.{parquet|csv} trong proc/")

print("Shape:", df.shape)
print(df.columns.tolist())
# Cap the preview at the table size: sample(5) raises ValueError when the
# export holds fewer than 5 rows.
n_preview = min(5, len(df))
display(df.sample(n_preview, random_state=42)[["subject_id","hadm_id","gender","age_at_admit","icd_codes","text_clean"]])


Shape: (331062, 6)
['subject_id', 'hadm_id', 'gender', 'age_at_admit', 'text_clean', 'icd_codes']


Unnamed: 0,subject_id,hadm_id,gender,age_at_admit,icd_codes,text_clean
71599,12188716,25535697,F,47,9-2410;9-44321;9-V1582;9-V5861,Service: NEUROLOGY\n \nAllergies: \nNo Known A...
159543,14826102,20296734,F,20,9-6262;9-7840;9-78701;9-78903,Service: MEDICINE\n \nAllergies: \nPenicillins...
308536,19319976,26826685,F,60,9-25083;9-2724;9-2761;9-3310;9-3331;9-4019;9-4...,Service: MEDICINE\n \nAllergies: \nSulfa (Sulf...
69959,12139397,26002726,F,85,10-C221;10-C7889;10-D62;10-D72829;10-E869;10-F...,Service: SURGERY\n \nAllergies: \nNo Known All...
237602,17193228,28782684,M,46,9-0389;9-25051;9-25081;9-2720;9-4019;9-5180;9-...,Service: UROLOGY\n \nAllergies: \nNo Known All...
