# 01. LLM Output Preprocessing

Converts LLM chunk-level predictions to patient-level binary features (16 categories),
then merges with EHR data.

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
from pathlib import Path
from src.config import PROJECT_ROOT
from src.variables import CATEGORY_MAP

## 1. LLM chunk → patient-level binary

In [None]:
# ── File paths ──
# Input: chunk-level LLM predictions. The output is written next to the input file.
in_path = PROJECT_ROOT / "LLM/LLM_FE/data/LLM_output/specific_feature_chunks_predictions_checkpoint500.csv"
out_path = in_path.with_name("LLM_patient_level_16cat_binary.csv")

df = pd.read_csv(in_path)
df = df.rename(columns={"id": "환자번호"})

# Korean category → English mapping.
# Categories that are not keys of CATEGORY_MAP come out as NaN here.
df["category_en"] = df["category"].map(CATEGORY_MAP)

# groupby() below drops NaN group keys by default, so rows with an unmapped
# category would disappear without a trace. Report them explicitly; the
# aggregated output itself is unchanged.
unmapped_mask = df["category_en"].isna()
if unmapped_mask.any():
    unmapped_categories = sorted(
        df.loc[unmapped_mask, "category"].dropna().astype(str).unique().tolist()
    )
    print(
        "Warning:", int(unmapped_mask.sum()),
        "rows have a category missing from CATEGORY_MAP and are excluded:",
        unmapped_categories,
    )

# 있음/없음 → binary. Any other value (including null) is treated as 0.
df["binary"] = df["label"].map({"있음": 1, "없음": 0}).fillna(0).astype(int)

# Patient × category aggregation: a single positive chunk makes the pair 1.
patient_cat = df.groupby(["환자번호", "category_en"])["binary"].max().reset_index()

# Wide format: one row per patient, one column per English category.
# Patient/category pairs that never occurred are filled with 0.
wide = (
    patient_cat
    .pivot(index="환자번호", columns="category_en", values="binary")
    .fillna(0).astype(int).reset_index()
)

# utf-8-sig writes a BOM at the start of the file.
wide.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved:", out_path)
print("Shape:", wide.shape)

## 2. Merge LLM features with EHR data

In [None]:
# ── File paths ──
ehr_path = PROJECT_ROOT / "data/raw/ADER_windowday_dataset_number.csv"
llm_path = PROJECT_ROOT / "LLM/LLM_FE/data/LLM_output/LLM_patient_level_16cat_binary.csv"
out_path = PROJECT_ROOT / "data/raw/ADER_windowday_dataset_number_v2.csv"

ehr_df = pd.read_csv(ehr_path)
llm_df = pd.read_csv(llm_path)

# Validate the join key with explicit raises rather than `assert`:
# assert statements are removed when Python runs with -O.
if "환자번호" not in ehr_df.columns:
    raise ValueError("'환자번호' not found in EHR dataset")
if "환자번호" not in llm_df.columns:
    raise ValueError("'환자번호' not found in LLM dataset")

# Left join: every EHR row is kept. EHR patients with no row in the LLM file
# get NaN in the LLM feature columns.
# validate="many_to_one" raises a MergeError if 환자번호 is duplicated in the
# LLM file, instead of silently multiplying EHR rows.
merged_df = ehr_df.merge(llm_df, on="환자번호", how="left", validate="many_to_one")

merged_df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Merge completed")
print("Saved:", out_path)
print("Shape:", merged_df.shape)