In [1]:
import pandas as pd

# Pipeline: for every patient pick one representative value per lab test
# (the most recent measurement lying on the "adverse" side of that patient's
# own median), attach the latest complete set of vitals, and write the merged
# table to Excel.

# Substring (regex, case-insensitive) found in the raw test name -> short
# column name used in the output. When a raw name matches several keys, the
# key that comes last in this dict wins.
test_map = {
    "Total Cholesterol": "tc",
    "LDL": "ldl",
    "HDL": "hdl",
    "Triglyceride": "tg",
    "Neutrophil": "neut",
    "Lymphocyte": "lymph",
    "eGFR": "egfr",
    "CRP": "crp",
}

# Tests for which values ABOVE the patient's median are selected.
HIGH_SIDE_TESTS = ("tc", "ldl", "neut", "hdl", "crp", "tg")
# Tests for which values BELOW the patient's median are selected.
LOW_SIDE_TESTS = ("lymph", "egfr")

VITAL_COLUMNS = ["BMI", "기록당시나이", "수축기혈압", "이완기혈압"]


def label_tests(labs: pd.DataFrame, mapping: dict) -> pd.DataFrame:
    """Keep only rows whose test name matches a key of *mapping* and label them.

    Args:
        labs: Lab rows with a "검사명" (raw test name) column.
        mapping: Regex fragment -> normalised short name.

    Returns:
        A copy of the matching rows with an added "검사명정리" column holding
        the normalised short name.
    """
    wanted = "|".join(mapping)
    kept = labs[labs["검사명"].str.contains(wanted, case=False, na=False)].copy()
    kept["검사명정리"] = None
    for fragment, short_name in mapping.items():
        hit = kept["검사명"].str.contains(fragment, case=False, na=False)
        kept.loc[hit, "검사명정리"] = short_name
    return kept


def select_latest_beyond_median(labs: pd.DataFrame) -> pd.DataFrame:
    """Return one row per patient with one column per normalised test.

    For each (patient, test) pair the patient's own median of "검사결과" is
    computed; only results strictly above it (HIGH_SIDE_TESTS) or strictly
    below it (LOW_SIDE_TESTS) are eligible, and of those the one with the
    latest "검사시행일" is kept. Rows with an unparseable date are only used
    when no dated row is eligible; same-date ties resolve to the row that
    appears first in *labs*.

    NOTE(review): because the comparison is strict, a patient with a single
    measurement (or all-identical measurements) for a test gets no value for
    that test. This matches the previous behaviour - confirm it is intended.

    Args:
        labs: Frame with "환자번호", "검사명정리", "검사시행일" (datetime) and
            "검사결과" (numeric) columns.

    Returns:
        Frame with "환자번호" followed by one column per test that produced
        at least one value, in order of first appearance in *labs*. If nothing
        is eligible, an empty frame with only "환자번호" is returned so that a
        later merge is a no-op instead of an error.
    """
    keys = ["환자번호", "검사명정리"]
    # Rows without an id, label or numeric result can neither influence the
    # median nor be selected, so drop them before grouping.
    usable = labs.dropna(subset=keys + ["검사결과"])
    empty_result = labs[["환자번호"]].iloc[:0].copy()
    if usable.empty:
        return empty_result

    median = usable.groupby(keys)["검사결과"].transform("median")
    above = usable["검사명정리"].isin(HIGH_SIDE_TESTS) & (usable["검사결과"] > median)
    below = usable["검사명정리"].isin(LOW_SIDE_TESTS) & (usable["검사결과"] < median)
    eligible = usable[above | below]
    if eligible.empty:
        return empty_result

    # Stable descending sort (NaT last) + keep-first == most recent eligible
    # row per (patient, test), with deterministic tie-breaking.
    latest = eligible.sort_values(
        "검사시행일", ascending=False, kind="mergesort"
    ).drop_duplicates(keys, keep="first")

    wide = latest.pivot(index="환자번호", columns="검사명정리", values="검사결과")
    column_order = [t for t in labs["검사명정리"].unique() if t in wide.columns]
    wide = wide[column_order]
    wide.columns.name = None
    return wide.reset_index()


def latest_vitals(labs: pd.DataFrame) -> pd.DataFrame:
    """Return each patient's most recent row that has all vital signs filled.

    Rows whose date could not be parsed sort first, so they are only chosen
    when the patient has no dated row with complete vitals. Rows without a
    patient id are discarded (a NaN id would otherwise match NaN ids on merge).

    Args:
        labs: Frame with "환자번호", "검사시행일" and the VITAL_COLUMNS.

    Returns:
        Frame with "환자번호" plus VITAL_COLUMNS, one row per patient.
    """
    complete = labs.dropna(subset=["환자번호", *VITAL_COLUMNS])
    ordered = complete.sort_values(["환자번호", "검사시행일"], na_position="first")
    newest = ordered.drop_duplicates("환자번호", keep="last")
    return newest[["환자번호", *VITAL_COLUMNS]]


# `__name__` is "__main__" both in a notebook cell and when run as a script,
# so the pipeline executes there and leaves its frames as module-level names;
# importing this file only defines the helpers above.
if __name__ == "__main__":
    # 1. Load patient list
    patient_df = pd.read_excel("CRE_4399list.xlsx")

    # 2. Load lab data (the first spreadsheet row is skipped)
    lab_df = pd.read_excel("CRE_812311labonlydata.xlsx", skiprows=1)

    # 3. Rename columns (positional: the sheet must have exactly 11 columns)
    lab_df.columns = ["환자번호", "환자명", "성별", "생년월일", "검사시행일", "검사명", "검사결과",
                      "BMI", "기록당시나이", "수축기혈압", "이완기혈압"]

    # 4-5. Keep the tests of interest and assign normalised test names
    lab_df = label_tests(lab_df, test_map)

    # 6. Convert dates / numbers; unparseable entries become NaT / NaN
    lab_df["검사시행일"] = pd.to_datetime(lab_df["검사시행일"], errors="coerce")
    lab_df["검사결과"] = pd.to_numeric(lab_df["검사결과"], errors="coerce")

    # 7-8. Median-based filtering + most recent value, one column per test
    pivot_df = select_latest_beyond_median(lab_df)

    # 9. Vitals (BMI, age, blood pressure)
    # NOTE(review): vitals are taken from the rows that survived the test-name
    # filter above, so patients without any of the selected tests get no
    # vitals - confirm this is intended.
    vitals = latest_vitals(lab_df)

    # 10. Merge all
    merged = patient_df.merge(vitals, on="환자번호", how="left")
    merged = merged.merge(pivot_df, on="환자번호", how="left")

    # 11. Save
    merged.to_excel("merged_patient_data_selected_by_median_updated.xlsx", index=False)
    print("✅ 병합 완료: 'merged_patient_data_selected_by_median_updated.xlsx' 저장됨")


✅ 병합 완료: 'merged_patient_data_selected_by_median_updated.xlsx' 저장됨
