In [1]:
# -*- coding: utf-8 -*-
# =========================================================
# 강서구 펌프장 내수위 예측 - 머신러닝 베이스라인 (한글패치 포함, 원클릭 실행)
# - Files:
#   /mnt/data/1.csv
#   /mnt/data/hydro_raw_2017_2025.csv
#   /mnt/data/rain_2017_2024_merged.csv
#   /mnt/data/sewer_2017_2024_merged.csv
# - Output:
#   figs/perm_importance.png
#   figs/model_importance.png (지원 시)
#   figs/shap_summary_dot.png, figs/shap_summary_bar.png (설치 시)
# =========================================================
import os, re, platform, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# ------------------------------
# 0) 한글 폰트 자동 패치
# ------------------------------
def patch_korean_font():
    sysname = platform.system().lower()
    font_candidates = []
    if "windows" in sysname:
        font_candidates = ["Malgun Gothic", "맑은 고딕"]
    elif "darwin" in sysname:  # macOS
        font_candidates = ["AppleGothic"]
    else:
        font_candidates = ["NanumGothic", "NanumBarunGothic", "DejaVu Sans"]
    for f in font_candidates:
        try:
            matplotlib.rcParams["font.family"] = f
            # 테스트 렌더 (폰트 미설치면 오류 발생 가능)
            _ = plt.figure(); plt.plot([0,1],[0,1]); plt.title("폰트테스트: 한글 OK?"); plt.close()
            print(f"[INFO] 폰트 적용: {f}")
            break
        except Exception:
            continue
    # 마이너스 깨짐 방지
    matplotlib.rcParams["axes.unicode_minus"] = False

os.makedirs("figs", exist_ok=True)
patch_korean_font()

# ------------------------------
# 1) 파일 경로 & 파라미터
# ------------------------------
PUMP_FILE   = "1.csv"
HYDRO_FILE  = "hydro_raw_2017_2025.csv"
RAIN_FILE   = "rain_2017_2024_merged.csv"
SEWER_FILE  = "sewer_2017_2024_merged.csv"

TARGET = "내수위"            # ← '외수위'로 교체 가능
USE_SEWER = True             # 하수관로도 asof로 결합
RAIN_TOL   = "1H"            # 강우 asof 허용
HYDRO_TOL  = "1H"            # 수문 asof 허용
SEWER_TOL  = "30min"         # 하수관로 asof 허용

# ------------------------------
# 2) 데이터 로드
# ------------------------------
pump = pd.read_csv(PUMP_FILE)
# timestamp 확보
if "timestamp" in pump.columns:
    pump["timestamp"] = pd.to_datetime(pump["timestamp"], errors="coerce")
elif {"일자","시각"} <= set(pump.columns):
    # 보수적 병합: 일자/시각 중 하나라도 파싱되면 사용
    ts_a = pd.to_datetime(pump["일자"], errors="coerce")
    ts_b = pd.to_datetime(pump["시각"], errors="coerce")
    pump["timestamp"] = ts_a.combine_first(ts_b)
else:
    raise KeyError("펌프 데이터에 'timestamp' 또는 ['일자','시각']가 필요합니다.")

pump = pump.sort_values("timestamp").reset_index(drop=True)

# 강우
rain = pd.read_csv(RAIN_FILE)
if "timestamp" not in rain.columns:
    if "일시" in rain.columns:
        rain["timestamp"] = pd.to_datetime(rain["일시"], errors="coerce")
    else:
        raise KeyError("강우 데이터에 'timestamp' 또는 '일시'가 필요합니다.")
rain["timestamp"] = pd.to_datetime(rain["timestamp"], errors="coerce")
if "강수량(mm)" not in rain.columns:
    raise KeyError("강우 데이터에 '강수량(mm)' 컬럼이 필요합니다.")
rain = rain[["timestamp","강수량(mm)"]].dropna(subset=["timestamp"]).sort_values("timestamp")
rain.rename(columns={"강수량(mm)":"rain_mm"}, inplace=True)
rain["rain_mm"] = pd.to_numeric(rain["rain_mm"], errors="coerce")

# 수문 (wide→long)
hydro = pd.read_csv(HYDRO_FILE)
if "관측 일시" not in hydro.columns:
    raise KeyError("수문 데이터에 '관측 일시' 컬럼이 필요합니다.")
date_col = "관측 일시"
hour_cols = [c for c in hydro.columns if re.fullmatch(r"\s*\d{1,2}시\s*", str(c))]
if not hour_cols:
    raise KeyError("수문 데이터에서 시간 컬럼(예: '01시'~'24시')을 찾지 못했습니다.")
id_vars = [date_col]
for c in ["station", "year"]:
    if c in hydro.columns:
        id_vars.append(c)
hydro_long = hydro.melt(id_vars=id_vars, value_vars=hour_cols,
                        var_name="hour_k", value_name="hydro_value")
hydro_long[date_col] = pd.to_datetime(hydro_long[date_col], errors="coerce").dt.floor("D")
hydro_long["hour_num"] = hydro_long["hour_k"].astype(str).str.extract(r"(\d{1,2})").astype(float)
hydro_long["hour_mod"] = (hydro_long["hour_num"] % 24).astype("Int64")
hydro_long["timestamp"] = hydro_long[date_col] + pd.to_timedelta(hydro_long["hour_mod"].astype(float), unit="h")
hydro_long["hydro_value"] = pd.to_numeric(hydro_long["hydro_value"], errors="coerce")
hydro_agg = (hydro_long.dropna(subset=["timestamp"])
                      .groupby("timestamp", as_index=False)["hydro_value"]
                      .mean().sort_values("timestamp"))

# 하수관로(옵션)
sewer_agg = None
if USE_SEWER:
    sewer = pd.read_csv(SEWER_FILE)
    if {"timestamp","water_level"} <= set(sewer.columns):
        sewer["timestamp"] = pd.to_datetime(sewer["timestamp"], errors="coerce")
        sewer = sewer.sort_values("timestamp")
        sewer_agg = sewer.groupby("timestamp", as_index=False)["water_level"].mean()
    else:
        print("[WARN] sewer 스키마가 예상과 다릅니다. sewer 결합을 건너뜁니다.")
        USE_SEWER = False

# ------------------------------
# 3) 안전한 asof merge 유틸
# ------------------------------
def merge_asof_(left, right, on_left="timestamp", on_right="timestamp",
                tol="1H", direction="backward"):
    if right is None or len(right)==0:
        return left
    L = left.sort_values(on_left).copy()
    R = right.sort_values(on_right).copy()
    L = pd.merge_asof(L, R, left_on=on_left, right_on=on_right,
                      direction=direction, tolerance=pd.Timedelta(tol))
    if on_right in L.columns and on_right != on_left:
        L = L.drop(columns=[on_right])
    return L

# ------------------------------
# 4) 시계열 병합
# ------------------------------
df = pump.copy().sort_values("timestamp")
df = merge_asof_(df, rain.rename(columns={"timestamp":"ts_rain"}),
                 on_left="timestamp", on_right="ts_rain", tol=RAIN_TOL)
df = merge_asof_(df, hydro_agg, tol=HYDRO_TOL)
if USE_SEWER:
    df = merge_asof_(df, sewer_agg.rename(columns={"timestamp":"ts_sewer"}),
                     on_left="timestamp", on_right="ts_sewer", tol=SEWER_TOL)

# ------------------------------
# 5) 결측 보정(간단하고 튼튼하게)
# ------------------------------
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df[num_cols] = df[num_cols].ffill().bfill()
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

# ------------------------------
# 6) 그룹별 시계열 파생 (펌프장명 단위; 누설 방지)
# ------------------------------
def add_time_features(df, group_col="펌프장명",
                      base_cols=("rain_mm","hydro_value","water_level","내수위","외수위")):
    df = df.copy()
    if group_col in df.columns:
        df = df.sort_values([group_col, "timestamp"])
        g = df.groupby(group_col, group_keys=False)
        for col in base_cols:
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                df[f"{col}_lag1"]  = g[col].shift(1)
                df[f"{col}_lag2"]  = g[col].shift(2)
                df[f"{col}_diff1"] = g[col].diff()
                rm3 = g[col].rolling(3).mean().reset_index(level=0, drop=True)
                rm6 = g[col].rolling(6).mean().reset_index(level=0, drop=True)
                df[f"{col}_rm3"] = rm3
                df[f"{col}_rm6"] = rm6
    else:
        df = df.sort_values("timestamp")
        for col in base_cols:
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                df[f"{col}_lag1"]  = df[col].shift(1)
                df[f"{col}_lag2"]  = df[col].shift(2)
                df[f"{col}_diff1"] = df[col].diff()
                df[f"{col}_rm3"]   = df[col].rolling(3).mean()
                df[f"{col}_rm6"]   = df[col].rolling(6).mean()
    return df

df = add_time_features(df)

# 파생으로 생긴 NaN 보정
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

# ------------------------------
# 7) 학습 데이터 구성
# ------------------------------
drop_cols = ["timestamp","일자","시각","펌프장 주소","위도","경도","데이터기준일자", TARGET]
feature_cols = [c for c in df.columns if c not in drop_cols and df[c].dtype != "O"]
X = df[feature_cols].copy()
y = df[TARGET].copy()

mask = X.notna().all(axis=1) & y.notna()
X = X[mask]; y = y[mask]
print(f"[INFO] 학습 데이터: X={X.shape}, y={y.shape}, features={len(feature_cols)}")

# ------------------------------
# 8) 모델 학습/검증 (HGB, 5Fold OOF)
# ------------------------------
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_pred, oof_true = [], []

for tr_idx, va_idx in kf.split(X):
    est = HistGradientBoostingRegressor(
        max_depth=6, learning_rate=0.08, max_iter=500,
        l2_regularization=0.0, random_state=42
    )
    est.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    pred = est.predict(X.iloc[va_idx])
    oof_pred.append(pred); oof_true.append(y.iloc[va_idx].values)

oof_pred = np.concatenate(oof_pred)
oof_true = np.concatenate(oof_true)

# sklearn 구버전 호환: RMSE = sqrt(MSE)
mse = mean_squared_error(oof_true, oof_pred)
rmse = np.sqrt(mse)
r2   = r2_score(oof_true, oof_pred)
print(f"[OOF] RMSE={rmse:.4f}  R²={r2:.4f}")

# 전체 적합 모델(해석용)
est_final = HistGradientBoostingRegressor(
    max_depth=6, learning_rate=0.08, max_iter=500,
    l2_regularization=0.0, random_state=42
).fit(X, y)

# ------------------------------
# 9) 중요도: Permutation / Model-based
# ------------------------------
from sklearn.inspection import permutation_importance

perm = permutation_importance(est_final, X, y, n_repeats=8, random_state=42, n_jobs=-1)
imp_perm = (pd.DataFrame({"feature": feature_cols,
                          "importance": perm.importances_mean,
                          "std": perm.importances_std})
            .sort_values("importance", ascending=False).reset_index(drop=True))
print("\n[Permutation Importance - Top 20]")
print(imp_perm.head(20).to_string(index=False))

# Plot 저장
topk = 20
imp_plot = imp_perm.head(topk).iloc[::-1]
plt.figure(figsize=(9, max(4, topk*0.4)))
plt.barh(imp_plot["feature"], imp_plot["importance"])
plt.xlabel("Permutation Importance (mean decrease)")
plt.ylabel("Feature"); plt.title("Top-20 Permutation Importance (HGB)")
plt.tight_layout(); plt.savefig("figs/perm_importance.png", dpi=200); plt.close()

# Model-based (HGB 지원 시)
if hasattr(est_final, "feature_importances_"):
    imp_model = (pd.DataFrame({"feature": feature_cols,
                               "importance": est_final.feature_importances_})
                 .sort_values("importance", ascending=False))
    print("\n[Model-based Importance - Top 20]")
    print(imp_model.head(20).to_string(index=False))

    imp_plot2 = imp_model.head(topk).iloc[::-1]
    plt.figure(figsize=(9, max(4, topk*0.4)))
    plt.barh(imp_plot2["feature"], imp_plot2["importance"])
    plt.xlabel("Model-based Importance"); plt.ylabel("Feature")
    plt.title("Top-20 Model-based Importance (HGB)")
    plt.tight_layout(); plt.savefig("figs/model_importance.png", dpi=200); plt.close()

# ------------------------------
# 10) (옵션) SHAP 요약 플롯
# ------------------------------
try:
    import shap
    # 일부 버전 경고 회피 플래그(필요 시)
    try:
        shap.utils._legacy = True
    except Exception:
        pass

    nsample = min(2000, len(X))
    rng = np.random.default_rng(42)
    idx = rng.choice(len(X), size=nsample, replace=False)
    Xs = X.iloc[idx].copy()

    explainer = shap.Explainer(est_final, Xs)
    sh = explainer(Xs)

    # summary dot
    plt.figure()
    shap.summary_plot(sh, Xs, plot_type="dot", show=False, max_display=20)
    plt.title("SHAP Summary (dot)")
    plt.tight_layout(); plt.savefig("figs/shap_summary_dot.png", dpi=200); plt.close()

    # summary bar
    plt.figure()
    shap.summary_plot(sh, Xs, plot_type="bar", show=False, max_display=20)
    plt.title("SHAP Summary (bar)")
    plt.tight_layout(); plt.savefig("figs/shap_summary_bar.png", dpi=200); plt.close()

    print("\n[INFO] SHAP 요약 플롯 저장 완료: figs/shap_summary_*.png")

except Exception as e:
    print("\n[INFO] SHAP 해석은 건너뜁니다 → 사유:", repr(e))
    print("      필요 시: pip install shap  후 재실행하세요.")

print("\n[DONE] 모든 작업 완료. 플롯은 figs/ 폴더를 확인하세요.")


[INFO] 폰트 적용: Malgun Gothic
[INFO] 학습 데이터: X=(64367, 28), y=(64367,), features=28
[OOF] RMSE=0.0605  R²=0.9979

[Permutation Importance - Top 20]
         feature  importance      std
         내수위_rm3    2.474746 0.013235
        내수위_lag2    0.185599 0.000524
        내수위_lag1    0.067800 0.000339
       내수위_diff1    0.009666 0.000071
        유역면적(ha)    0.001857 0.000026
             외수위    0.001679 0.000071
         내수위_rm6    0.001145 0.000009
     hydro_value    0.001119 0.000019
     rain_mm_rm3    0.000980 0.000007
         rain_mm    0.000946 0.000008
       외수위_diff1    0.000287 0.000005
         외수위_rm3    0.000196 0.000007
 hydro_value_rm3    0.000178 0.000004
     rain_mm_rm6    0.000105 0.000005
 hydro_value_rm6    0.000102 0.000004
hydro_value_lag1    0.000097 0.000002
         외수위_rm6    0.000085 0.000002
        외수위_lag2    0.000084 0.000004
        외수위_lag1    0.000084 0.000006
hydro_value_lag2    0.000080 0.000001

[INFO] SHAP 해석은 건너뜁니다 → 사유: ExplainerError('Additivity 

In [3]:
# -*- coding: utf-8 -*-
# =========================================================
# 강서구 펌프장 내수위 예측 - 머신러닝 베이스라인 (한글패치 포함, 원클릭 실행)
# - Files:
#   /mnt/data/1.csv
#   /mnt/data/hydro_raw_2017_2025.csv
#   /mnt/data/rain_2017_2024_merged.csv
#   /mnt/data/sewer_2017_2024_merged.csv
# - Output:
#   figs/perm_importance.png
#   figs/model_importance.png (지원 시)
#   figs/shap_summary_dot.png, figs/shap_summary_bar.png (설치 시)
# =========================================================
import os, re, platform, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# ------------------------------
# 0) 한글 폰트 자동 패치
# ------------------------------
def patch_korean_font():
    sysname = platform.system().lower()
    font_candidates = []
    if "windows" in sysname:
        font_candidates = ["Malgun Gothic", "맑은 고딕"]
    elif "darwin" in sysname:  # macOS
        font_candidates = ["AppleGothic"]
    else:
        font_candidates = ["NanumGothic", "NanumBarunGothic", "DejaVu Sans"]
    for f in font_candidates:
        try:
            matplotlib.rcParams["font.family"] = f
            # 테스트 렌더 (폰트 미설치면 오류 발생 가능)
            _ = plt.figure(); plt.plot([0,1],[0,1]); plt.title("폰트테스트: 한글 OK?"); plt.close()
            print(f"[INFO] 폰트 적용: {f}")
            break
        except Exception:
            continue
    # 마이너스 깨짐 방지
    matplotlib.rcParams["axes.unicode_minus"] = False

os.makedirs("figs", exist_ok=True)
patch_korean_font()

# ------------------------------
# 1) 파일 경로 & 파라미터
# ------------------------------
PUMP_FILE   = "1.csv"
HYDRO_FILE  = "hydro_raw_2017_2025.csv"
RAIN_FILE   = "rain_2017_2024_merged.csv"
SEWER_FILE  = "sewer_2017_2024_merged.csv"

TARGET = "내수위"            # ← '외수위'로 교체 가능
USE_SEWER = True             # 하수관로도 asof로 결합
RAIN_TOL   = "1H"            # 강우 asof 허용
HYDRO_TOL  = "1H"            # 수문 asof 허용
SEWER_TOL  = "30min"         # 하수관로 asof 허용

# ------------------------------
# 2) 데이터 로드
# ------------------------------
pump = pd.read_csv(PUMP_FILE)
# timestamp 확보
if "timestamp" in pump.columns:
    pump["timestamp"] = pd.to_datetime(pump["timestamp"], errors="coerce")
elif {"일자","시각"} <= set(pump.columns):
    # 보수적 병합: 일자/시각 중 하나라도 파싱되면 사용
    ts_a = pd.to_datetime(pump["일자"], errors="coerce")
    ts_b = pd.to_datetime(pump["시각"], errors="coerce")
    pump["timestamp"] = ts_a.combine_first(ts_b)
else:
    raise KeyError("펌프 데이터에 'timestamp' 또는 ['일자','시각']가 필요합니다.")

pump = pump.sort_values("timestamp").reset_index(drop=True)

# 강우
rain = pd.read_csv(RAIN_FILE)
if "timestamp" not in rain.columns:
    if "일시" in rain.columns:
        rain["timestamp"] = pd.to_datetime(rain["일시"], errors="coerce")
    else:
        raise KeyError("강우 데이터에 'timestamp' 또는 '일시'가 필요합니다.")
rain["timestamp"] = pd.to_datetime(rain["timestamp"], errors="coerce")
if "강수량(mm)" not in rain.columns:
    raise KeyError("강우 데이터에 '강수량(mm)' 컬럼이 필요합니다.")
rain = rain[["timestamp","강수량(mm)"]].dropna(subset=["timestamp"]).sort_values("timestamp")
rain.rename(columns={"강수량(mm)":"rain_mm"}, inplace=True)
rain["rain_mm"] = pd.to_numeric(rain["rain_mm"], errors="coerce")

# 수문 (wide→long)
hydro = pd.read_csv(HYDRO_FILE)
if "관측 일시" not in hydro.columns:
    raise KeyError("수문 데이터에 '관측 일시' 컬럼이 필요합니다.")
date_col = "관측 일시"
hour_cols = [c for c in hydro.columns if re.fullmatch(r"\s*\d{1,2}시\s*", str(c))]
if not hour_cols:
    raise KeyError("수문 데이터에서 시간 컬럼(예: '01시'~'24시')을 찾지 못했습니다.")
id_vars = [date_col]
for c in ["station", "year"]:
    if c in hydro.columns:
        id_vars.append(c)
hydro_long = hydro.melt(id_vars=id_vars, value_vars=hour_cols,
                        var_name="hour_k", value_name="hydro_value")
hydro_long[date_col] = pd.to_datetime(hydro_long[date_col], errors="coerce").dt.floor("D")
hydro_long["hour_num"] = hydro_long["hour_k"].astype(str).str.extract(r"(\d{1,2})").astype(float)
hydro_long["hour_mod"] = (hydro_long["hour_num"] % 24).astype("Int64")
hydro_long["timestamp"] = hydro_long[date_col] + pd.to_timedelta(hydro_long["hour_mod"].astype(float), unit="h")
hydro_long["hydro_value"] = pd.to_numeric(hydro_long["hydro_value"], errors="coerce")
hydro_agg = (hydro_long.dropna(subset=["timestamp"])
                      .groupby("timestamp", as_index=False)["hydro_value"]
                      .mean().sort_values("timestamp"))

# 하수관로(옵션)
sewer_agg = None
if USE_SEWER:
    sewer = pd.read_csv(SEWER_FILE)
    if {"timestamp","water_level"} <= set(sewer.columns):
        sewer["timestamp"] = pd.to_datetime(sewer["timestamp"], errors="coerce")
        sewer = sewer.sort_values("timestamp")
        sewer_agg = sewer.groupby("timestamp", as_index=False)["water_level"].mean()
    else:
        print("[WARN] sewer 스키마가 예상과 다릅니다. sewer 결합을 건너뜁니다.")
        USE_SEWER = False

# ------------------------------
# 2) 후보 모델 정의
# ------------------------------
models = {}
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
models["HGB"] = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.08, max_iter=500, random_state=42)
models["GBR"] = GradientBoostingRegressor(random_state=42)

try:
    from xgboost import XGBRegressor
    models["XGB"] = XGBRegressor(
        n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
        reg_lambda=1.0, random_state=42, n_jobs=-1, tree_method="hist"
    )
except: pass

try:
    import lightgbm as lgb
    models["LGBM"] = lgb.LGBMRegressor(
        n_estimators=1200, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1
    )
except: pass

try:
    from catboost import CatBoostRegressor
    models["CAT"] = CatBoostRegressor(
        iterations=1200, depth=6, learning_rate=0.05,
        loss_function="RMSE", random_seed=42, verbose=False
    )
except: pass

# ------------------------------
# 3) KFold OOF + 개별 시각화
# ------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
leaderboard = []
oof_preds_by_model = {}

for name, est in models.items():
    oof_pred, oof_true = [], []
    for tr_idx, va_idx in kf.split(X):
        Xt, Xv, yt, yv = X.iloc[tr_idx], X.iloc[va_idx], y.iloc[tr_idx], y.iloc[va_idx]
        est_ = est.__class__(**getattr(est, "get_params", lambda: {})())
        est_.fit(Xt, yt)
        pv = est_.predict(Xv)
        oof_pred.append(pv); oof_true.append(yv.values)
    oof_pred, oof_true = np.concatenate(oof_pred), np.concatenate(oof_true)

    mse = mean_squared_error(oof_true, oof_pred)
    rmse, r2 = math.sqrt(mse), r2_score(oof_true, oof_pred)
    leaderboard.append({"model": name, "RMSE": rmse, "R2": r2})
    oof_preds_by_model[name] = (oof_true, oof_pred)

    # 산점도 + 회귀선
    plt.figure(figsize=(6,6))
    plt.scatter(oof_true, oof_pred, s=8, alpha=0.6)
    lims_min, lims_max = min(oof_true.min(), oof_pred.min()), max(oof_true.max(), oof_pred.max())
    plt.plot([lims_min, lims_max],[lims_min, lims_max])
    X_mat = np.c_[np.ones_like(oof_true), oof_true]
    beta, *_ = np.linalg.lstsq(X_mat, oof_pred, rcond=None)
    line_x = np.linspace(lims_min, lims_max, 100)
    plt.plot(line_x, beta[0]+beta[1]*line_x)
    plt.xlabel("실측값"); plt.ylabel("예측값")
    plt.title(f"{name} 회귀선\nRMSE={rmse:.3f}, R²={r2:.3f}")
    plt.tight_layout(); plt.savefig(f"figs/scatter_regline_{name}.png", dpi=200); plt.close()

leader_df = pd.DataFrame(leaderboard).sort_values("RMSE").reset_index(drop=True)
print("\n[리더보드]")
print(leader_df)

# ------------------------------
# 4) 상위 2개 모델 앙상블
# ------------------------------
if len(leader_df) >= 2:
    top2 = leader_df.head(2)["model"].tolist()
    print(f"\n[INFO] 상위 2개 모델 앙상블: {top2}")
    oof_true = oof_preds_by_model[top2[0]][0]
    pred1 = oof_preds_by_model[top2[0]][1]
    pred2 = oof_preds_by_model[top2[1]][1]

    rmse1 = leader_df.iloc[0]["RMSE"]
    rmse2 = leader_df.iloc[1]["RMSE"]
    w1, w2 = 1/rmse1, 1/rmse2
    ens_pred = (w1*pred1 + w2*pred2)/(w1+w2)

    mse = mean_squared_error(oof_true, ens_pred)
    rmse, r2 = math.sqrt(mse), r2_score(oof_true, ens_pred)
    print(f"[앙상블] RMSE={rmse:.4f}, R²={r2:.4f}")

    plt.figure(figsize=(6,6))
    plt.scatter(oof_true, ens_pred, s=8, alpha=0.6)
    lims_min, lims_max = min(oof_true.min(), ens_pred.min()), max(oof_true.max(), ens_pred.max())
    plt.plot([lims_min, lims_max],[lims_min, lims_max])
    X_mat = np.c_[np.ones_like(oof_true), oof_true]
    beta, *_ = np.linalg.lstsq(X_mat, ens_pred, rcond=None)
    line_x = np.linspace(lims_min, lims_max, 100)
    plt.plot(line_x, beta[0]+beta[1]*line_x)
    plt.xlabel("실측값"); plt.ylabel("예측값")
    plt.title(f"앙상블({'+'.join(top2)}) 회귀선\nRMSE={rmse:.3f}, R²={r2:.3f}")
    plt.tight_layout(); plt.savefig("figs/scatter_regline_Ensemble.png", dpi=200); plt.close()


[INFO] 폰트 적용: Malgun Gothic
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5061
[LightGBM] [Info] Number of data points in the train set: 51493, number of used features: 28
[LightGBM] [Info] Start training from score 2.373905
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5063
[LightGBM] [Info] Number of data points in the train set: 51493, number of used features: 28
[LightGBM] [Info] Start training from score 2.374053
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5064
[LightGBM] [Info] Number of data points in the train set: 51494, number of used features: 28