# 07 · Appendix · 数据字典与缺失情况 📚

自动汇总 `columns_overview.csv` 与 `missingness_overview.csv`，形成附录表格。

In [1]:

# %% [bootstrap]
import sys, subprocess, importlib
for imp, spec in {"yaml":"pyyaml==6.0.2","openpyxl":"openpyxl==3.1.5"}.items():
    try: importlib.import_module(imp)
    except Exception:
        subprocess.run([sys.executable,"-m","pip","install",spec,"-q","--disable-pip-version-check","--no-input"], check=True)
import yaml, pandas as pd
from pathlib import Path
from IPython.display import display


In [2]:

# %% [load]
PROJECT_ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
CONF = yaml.safe_load((PROJECT_ROOT/"conf/config.yaml").read_text(encoding="utf-8"))

DATA_DIR = PROJECT_ROOT / "data_processed"
OUT_DIR  = PROJECT_ROOT / "outputs" / "tables"
OUT_DIR.mkdir(parents=True, exist_ok=True)

p_cols = DATA_DIR / "columns_overview.csv"
p_miss = DATA_DIR / "missingness_overview.csv"


In [3]:

# %% [merge & export]
df_cols = pd.read_csv(p_cols) if p_cols.exists() else None
df_miss = pd.read_csv(p_miss) if p_miss.exists() else None

if df_cols is not None:
    display(df_cols.head(10))
else:
    print("[warn] columns_overview.csv not found")

if df_miss is not None:
    display(df_miss.head(10))
else:
    print("[warn] missingness_overview.csv not found")

if df_cols is not None and df_miss is not None:
    key = "column" if "column" in df_miss.columns else "feature"
    merged = df_cols.merge(df_miss, left_on="column", right_on=key, how="left")
    merged.to_csv(OUT_DIR / "data_dictionary.csv", index=False)
    print("[ok] exported:", OUT_DIR / "data_dictionary.csv")
    display(merged.head(20))
else:
    print("[skip] not enough inputs to merge.")


Unnamed: 0,column,dtype,n_unique,n_missing,pct_missing,example
0,aptt,float64,330,176,11.062225,25.6
1,inr,float64,39,158,9.930861,1.5
2,pt,float64,204,158,9.930861,16.7
3,calcium,float64,64,68,4.274041,10.1
4,temperature,float64,107,35,2.199874,36.72
5,glucose,float64,215,20,1.257071,124.0
6,wbc,float64,282,19,1.194217,23.7
7,sodium,float64,43,18,1.131364,135.0
8,aniongap,float64,27,17,1.06851,12.0
9,bicarbonate,float64,38,17,1.06851,20.0


Unnamed: 0,column,dtype,n_missing,pct_missing
0,aptt,float64,176,11.062225
1,inr,float64,158,9.930861
2,pt,float64,158,9.930861
3,calcium,float64,68,4.274041
4,temperature,float64,35,2.199874
5,glucose,float64,20,1.257071
6,wbc,float64,19,1.194217
7,sodium,float64,18,1.131364
8,chloride,float64,17,1.06851
9,aniongap,float64,17,1.06851


[ok] exported: /public/home/aojiang/海南医科大学/icu-lymphoma-ml-repro/outputs/tables/data_dictionary.csv


Unnamed: 0,column,dtype_x,n_unique,n_missing_x,pct_missing_x,example,dtype_y,n_missing_y,pct_missing_y
0,aptt,float64,330,176,11.062225,25.6,float64,176.0,11.062225
1,inr,float64,39,158,9.930861,1.5,float64,158.0,9.930861
2,pt,float64,204,158,9.930861,16.7,float64,158.0,9.930861
3,calcium,float64,64,68,4.274041,10.1,float64,68.0,4.274041
4,temperature,float64,107,35,2.199874,36.72,float64,35.0,2.199874
5,glucose,float64,215,20,1.257071,124.0,float64,20.0,1.257071
6,wbc,float64,282,19,1.194217,23.7,float64,19.0,1.194217
7,sodium,float64,43,18,1.131364,135.0,float64,18.0,1.131364
8,aniongap,float64,27,17,1.06851,12.0,float64,17.0,1.06851
9,bicarbonate,float64,38,17,1.06851,20.0,float64,17.0,1.06851
