In [3]:

# LightGBM


# -*- coding: utf-8 -*-
# LightGBM 전용: Cheonan EMD-level population modeling (완전 실행형)
# - shp(.shx 누락 자동복구), POI(학교/버스/병원), 주민등록 인구(읍면동, 2020~2024) 연도 프레임
# - YouthIndex/SeniorCareIndex 합성지표 포함
# - GroupKFold(연도 홀드아웃) 평가, 중요도/OOF/최종모델 저장

from pathlib import Path
import warnings, re, os
warnings.filterwarnings("ignore")

import pandas as pd, numpy as np, geopandas as gpd
from shapely.geometry import Point
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import permutation_importance
from lightgbm import LGBMRegressor
import joblib

# ================== 경로/상수 ==================
BASE = Path(r"C:\공모전\2025천안")

OUT  = BASE / "outputs" / "LightGBM"; OUT.mkdir(parents=True, exist_ok=True)
SHP_PATH = BASE / "천안시경계(24).shp"   # .shx 없어도 자동 복구 시도
POP_DIR  = BASE / "주민등록인구"
POP_FILE = BASE / "시군구별 이동자" / "202001_202412_주민등록인구및세대현황_월간.csv"
POP_INPUT_IS_FILE = True# 보유 중인 읍면동 인구 표 폴더
SCHOOL_CSV   = BASE / "초중등학교 데이터" / "전국초중등학교위치표준데이터.csv"
BUS_CSV      = BASE / "버스정류장 위치정보" / "국토교통부_전국 버스정류장 위치정보.csv"
HOSPITAL_CSV = BASE / "병원정보서비스" / "병원정보서비스.csv"

YEARS = [2020, 2021, 2022, 2023, 2024]
DEFAULT_PROJ = "EPSG:5179"  # Korea 2000 / Unified CS

TARGET_CANDIDATES = ["고령화율","aging_rate","65세이상비율","총인구","population","total_pop"]

# shp 읽을 때 .shx 자동 복구 허용
os.environ["SHAPE_RESTORE_SHX"] = "YES"

# 이름/코드 컬럼 후보(경계/인구 모두 대응)
EMD_NAME_CANDS = [
    "EMD_NM","EMD_KOR_NM","EMD_NAME","EMD_NM_KR","EMD_NM_KOR","EMD_NM_KO",
    "읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","adm_dr_nm","NAME","name","동리명"
]
EMD_CODE_CANDS = [
    "EMD_CD","EMDCD","ADM_CD","adm_cd","법정동코드","행정동코드","법정리코드","법정동코드(10자리)","HCODE","CODE"
]
POP_NAME_CANDS = ["읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","EMD_NM","동리명","NAME","name"]
POP_CODE_CANDS = ["행정동코드","법정동코드","EMD_CD","adm_cd","CODE","HCODE"]

# ================== 유틸 함수 ==================
def pick_col(cols, cands):
    """정확일치 → 공백/대소문자 무시 → 부분포함 순으로 첫 컬럼 반환"""
    for c in cands:
        if c in cols: return c
    norm = {re.sub(r"\s+","",c).lower(): c for c in cols}
    for cand in cands:
        key = re.sub(r"\s+","",cand).lower()
        if key in norm: return norm[key]
    for c in cols:
        for cand in cands:
            if re.sub(r"\s+","",cand).lower() in re.sub(r"\s+","",c).lower():
                return c
    return None

def standardize_emd_name(s):
    if pd.isna(s): return ""
    s = str(s).strip()
    s = re.sub(r"\s+","", s)
    # 필요하면 접두부(천안시동남구/서북구) 제거 등 커스텀 룰 추가 가능
    return s

def normalize_adm_code(series, zfill=10):
    """숫자만 남겨 10자리(법정동코드 관용) zero-fill"""
    return series.astype(str).str.replace(r"\D","", regex=True).str.zfill(zfill)

def guess_emd_name_col(gdf: gpd.GeoDataFrame):
    """한글+동/읍/면/리로 끝나는 비율 높은 문자열 컬럼 자동 후보"""
    cand=[]
    for c in gdf.columns:
        if gdf[c].dtype == "object":
            s = gdf[c].astype(str).head(200)
            ratio = s.str.contains(r"(동|읍|면|리)$").mean()
            if ratio >= 0.2:
                cand.append((ratio, c))
    if cand:
        cand.sort(reverse=True)
        return cand[0][1]
    return None

def read_shp_with_restore(shp_path: str, default_crs=DEFAULT_PROJ):
    import fiona
    with fiona.Env(SHAPE_RESTORE_SHX="YES"):
        g = gpd.read_file(shp_path)
    if g.crs is None:
        g = g.set_crs(default_crs, allow_override=True)
    return g

def resolve_emd_keys_from_shp(shp_path: str) -> gpd.GeoDataFrame:
    """경계에서 이름/코드 표준 키 생성 + 면적 km² 추가"""
    g = read_shp_with_restore(shp_path)
    code_col = pick_col(g.columns, EMD_CODE_CANDS)
    name_col = pick_col(g.columns, EMD_NAME_CANDS) or guess_emd_name_col(g)
    if code_col is not None:
        g["ADM_CODE_STD"] = normalize_adm_code(g[code_col])
    if name_col is not None:
        g["EMD_NAME_STD"] = g[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in g.columns) and ("EMD_NAME_STD" not in g.columns):
        raise RuntimeError(f"읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼={list(g.columns)}")
    g["area_km2"] = g.geometry.area / 1_000_000.0
    return g

def detect_latlon(df: pd.DataFrame):
    lat_cands = ["위도","lat","latitude","Y","y","WGS84위도","Y좌표","Y좌표(WGS84)"]
    lon_cands = ["경도","lon","longitude","X","x","WGS84경도","X좌표","X좌표(WGS84)"]
    lat = pick_col(df.columns, lat_cands)
    lon = pick_col(df.columns, lon_cands)
    return lat, lon

def to_points(df: pd.DataFrame, lat: str, lon: str, crs="EPSG:4326") -> gpd.GeoDataFrame:
    d = df.dropna(subset=[lat, lon]).copy()
    d[lat] = pd.to_numeric(d[lat], errors="coerce")
    d[lon] = pd.to_numeric(d[lon], errors="coerce")
    d = d.dropna(subset=[lat, lon])
    return gpd.GeoDataFrame(d, geometry=gpd.points_from_xy(d[lon], d[lat]), crs=crs)

def project_like(g: gpd.GeoDataFrame, like: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    if g.crs is None: g = g.set_crs("EPSG:4326", allow_override=True)
    if like.crs is None: like = like.set_crs(DEFAULT_PROJ, allow_override=True)
    if g.crs != like.crs: g = g.to_crs(like.crs)
    return g

def sjoin_counts_and_nearest(points_gdf, emd, prefix):
    """EMD 내 포함 카운트, km²당 밀도, 최근접거리(km) 및 로그보조"""
    emd_cent = emd.copy(); emd_cent["centroid"] = emd.geometry.centroid
    emd_cent = emd_cent.set_geometry("centroid")
    # 최근접 거리
    if len(points_gdf) > 0:
        nn = gpd.sjoin_nearest(emd_cent, points_gdf[["geometry"]], how="left", distance_col=f"{prefix}_nearest_m")
        near = nn[[f"{prefix}_nearest_m"]].reindex(emd.index)
    else:
        near = pd.DataFrame({f"{prefix}_nearest_m":[np.nan]*len(emd)}, index=emd.index)
    # 포함 카운트
    if len(points_gdf) > 0:
        jj = gpd.sjoin(emd, points_gdf[["geometry"]], how="left", predicate="contains")
        counts = jj.groupby(jj.index).size().reindex(emd.index).fillna(0).astype(int)
    else:
        counts = pd.Series(0, index=emd.index, dtype=int)
    out = pd.DataFrame(index=emd.index)
    out[f"{prefix}_count"] = counts
    area = (emd.geometry.area/1_000_000.0).replace(0, np.nan)
    out[f"{prefix}_dens_km2"] = out[f"{prefix}_count"] / area
    out[f"{prefix}_dens_log1p"] = np.log1p(out[f"{prefix}_dens_km2"])
    out = pd.concat([out, near], axis=1)
    out[f"{prefix}_nearest_km"] = out[f"{prefix}_nearest_m"] / 1000.0
    out[f"{prefix}_nearest_log1p"] = np.log1p(out[f"{prefix}_nearest_km"])
    return out

def zscore(s: pd.Series):
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

def read_any_table(p: Path) -> pd.DataFrame:
    if p.suffix.lower() == ".csv":
        for enc in ("utf-8-sig","cp949","euc-kr"):
            try: return pd.read_csv(p, encoding=enc)
            except UnicodeDecodeError: continue
        return pd.read_csv(p)
    else:
        return pd.read_excel(p)

def find_pop_files(pop_dir: Path):
    pats=[r".*인구.*\.(csv|xlsx)$", r".*주민.*\.(csv|xlsx)$", r".*세대.*\.(csv|xlsx)$"]
    files=[]
    for fp in pop_dir.glob("**/*"):
        if fp.is_file() and any(re.match(p, fp.name, flags=re.I) for p in pats):
            files.append(fp)
    return files

def coerce_ym(s):
    if pd.isna(s): return (None,None)
    t=str(s).strip().replace(".","").replace("-","").replace("/","")
    if len(t)>=6 and t[:6].isdigit(): return (int(t[:4]), int(t[4:6]))
    if len(t)==4 and t.isdigit(): return (int(t),12)
    return (None,None)

def resolve_keys_from_pop(df_in: pd.DataFrame):
    """인구 표에서 이름/코드 표준 키 생성(없으면 패턴으로 이름 유추)"""
    df = df_in.copy()
    code_col = pick_col(df.columns, POP_CODE_CANDS)
    name_col = pick_col(df.columns, POP_NAME_CANDS)
    if code_col is not None:
        df["ADM_CODE_STD"] = normalize_adm_code(df[code_col])
    if name_col is not None:
        df["EMD_NAME_STD"] = df[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in df.columns) and ("EMD_NAME_STD" not in df.columns):
        # 값 패턴으로 이름 추정
        for c in df.columns:
            if df[c].dtype == "object":
                s = df[c].astype(str).head(200)
                if s.str.contains(r"(동|읍|면|리)$").mean() >= 0.2:
                    df["EMD_NAME_STD"] = df[c].astype(str).map(standardize_emd_name)
                    break
    return df

# ================== 데이터 로드/피처 구성 ==================
print("[INFO] 1) EMD 경계 읽기 & 키 생성")
emd = resolve_emd_keys_from_shp(str(SHP_PATH))
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in emd.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in emd.columns
idx_col = "EMD_NAME_STD" if BOUNDARY_HAS_NAME else "ADM_CODE_STD"
emd = emd.set_index(idx_col, drop=False)

print("[DEBUG] EMD columns:", list(emd.columns))
print("[DEBUG] has code:", BOUNDARY_HAS_CODE, "has name:", BOUNDARY_HAS_NAME)

print("[INFO] 2) POI 읽기 → 포인트 변환 → 경계 좌표계로 투영")
def read_poi(csv_path: Path, emd):
    if not csv_path.exists():
        print(f"[WARN] POI 파일 없음: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    try: df = pd.read_csv(csv_path, encoding="utf-8-sig")
    except UnicodeDecodeError: df = pd.read_csv(csv_path, encoding="cp949")
    lat, lon = detect_latlon(df)
    if not (lat and lon):
        print(f"[WARN] 위경도 컬럼 자동탐색 실패 → 빈 처리: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    g = to_points(df, lat, lon, crs="EPSG:4326")
    return project_like(g, emd)

g_school = read_poi(SCHOOL_CSV, emd)
g_bus    = read_poi(BUS_CSV, emd)
g_hosp   = read_poi(HOSPITAL_CSV, emd)

print("[INFO] 3) EMD별 정적 피처(카운트/밀도/최근접거리) + 합성지표")
f_school = sjoin_counts_and_nearest(g_school, emd, "school")
f_bus    = sjoin_counts_and_nearest(g_bus, emd, "bus")
f_hosp   = sjoin_counts_and_nearest(g_hosp, emd, "hosp")
feat_static = pd.concat([f_school, f_bus, f_hosp], axis=1)

feat_static["school_z"] = zscore(feat_static["school_dens_km2"].fillna(0))
feat_static["bus_z"]    = zscore(feat_static["bus_dens_km2"].fillna(0))
feat_static["hosp_z"]   = zscore(feat_static["hosp_dens_km2"].fillna(0))

# 합성지표(도메인 priors): YouthIndex & SeniorCareIndex
# - YouthIndex = 0.5*학교 + 0.3*버스 + 0.2*병원
# - SeniorCareIndex = 0.6*병원 + 0.3*버스 + 0.1*학교
feat_static["YouthIndex"]      = 0.5*feat_static["school_z"] + 0.3*feat_static["bus_z"] + 0.2*feat_static["hosp_z"]
feat_static["SeniorCareIndex"] = 0.6*feat_static["hosp_z"]  + 0.3*feat_static["bus_z"] + 0.1*feat_static["school_z"]
feat_static["area_km2"] = emd["area_km2"]

print("[INFO] 4) 인구 표 읽기/연도화 (월 자료면 최신월, 없으면 연 스냅샷)")
def build_population(pop_dir: Path, years: list):
    files = find_pop_files(pop_dir)
    if not files:
        print(f"[WARN] 인구 파일이 없습니다: {pop_dir}")
        return pd.DataFrame()
    frames=[]
    for p in files:
        df = read_any_table(p)
        df = resolve_keys_from_pop(df)
        # 연/월 추출
        ym_col = pick_col(df.columns, ["기준년월","년월","연월","yyyymm","기준일자","date","집계년월"])
        year_col = pick_col(df.columns, ["연도","년도","year","Year"])
        month_col= pick_col(df.columns, ["월","month","Month"])
        if ym_col:
            ym=df[ym_col].map(coerce_ym); df["__year__"]=[y for y,_ in ym]; df["__month__"]=[m for _,m in ym]
        elif year_col:
            df["__year__"]=pd.to_numeric(df[year_col], errors="coerce")
            df["__month__"]=pd.to_numeric(df[month_col], errors="coerce") if month_col else 12
        else:
            m=re.search(r"(20\d{2})(0[1-9]|1[0-2])", p.stem)
            if m: df["__year__"]=int(m.group(1)); df["__month__"]=int(m.group(2))
            else:
                m2=re.search(r"(20\d{2})", p.stem)
                df["__year__"]=int(m2.group(1)) if m2 else np.nan; df["__month__"]=12

        # 타깃 후보 준비
        total = pick_col(df.columns, ["총인구","인구","총인구수","population","total_pop"])
        aged  = pick_col(df.columns, ["65세이상","65세이상인구","고령인구","over65"])
        aging = pick_col(df.columns, ["고령화율","aging_rate","65세이상비율","ratio_65"])
        keep = ["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"] + [c for c in [total, aged, aging] if c]
        df2 = df[keep].copy()
        frames.append(df2)

    pop = pd.concat(frames, ignore_index=True)
    pop = pop.dropna(subset=["__year__"])
    pop = pop[pop["__year__"].isin(years)]
    # 월 자료면 최신월(연말/가장 최근) 선택
    pop = pop.sort_values(["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"])
    pop = pop.groupby(["ADM_CODE_STD","EMD_NAME_STD","__year__"], as_index=False).tail(1)

    # 고령화율 계산(없으면)
    if "aging_rate" not in pop.columns:
        tcol = pick_col(pop.columns, ["총인구","인구","총인구수","population","total_pop"])
        acol = pick_col(pop.columns, ["65세이상","65세이상인구","고령인구","over65"])
        if tcol and acol:
            pop["aging_rate"] = pd.to_numeric(pop[acol], errors="coerce")/pd.to_numeric(pop[tcol], errors="coerce")

    pop = pop.rename(columns={"__year__":"year"})
    return pop

pop_yearly = build_population(POP_DIR, YEARS)
if pop_yearly.empty:
    raise SystemExit("[ERROR] 인구 데이터가 비었습니다. POP_DIR에 파일을 넣고 다시 실행하세요.")

print("[INFO] 5) 연패널 구성(정적 피처 연도별 복제) + 병합")
panel=[]
for y in YEARS:
    tmp = feat_static.copy()
    tmp["year"] = y
    # 병합 키 컬럼(이름/코드) 동시 보유
    if "EMD_NAME_STD" in emd.columns: tmp["EMD_NAME_STD"] = emd["EMD_NAME_STD"]
    if "ADM_CODE_STD" in emd.columns: tmp["ADM_CODE_STD"] = emd["ADM_CODE_STD"]
    panel.append(tmp.reset_index(drop=True))
panel = pd.concat(panel, axis=0)

# 병합 키 결정 (코드 우선, 그다음 이름)
POP_HAS_CODE = "ADM_CODE_STD" in pop_yearly.columns
POP_HAS_NAME = "EMD_NAME_STD" in pop_yearly.columns
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in panel.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in panel.columns

if BOUNDARY_HAS_CODE and POP_HAS_CODE:
    key = "ADM_CODE_STD"
elif BOUNDARY_HAS_NAME and POP_HAS_NAME:
    key = "EMD_NAME_STD"
else:
    key = "EMD_NAME_STD" if POP_HAS_NAME else "ADM_CODE_STD"

data = pop_yearly.merge(panel, on=[key,"year"], how="left")

# ================== 타깃/피처 준비 (X, y, groups) ==================
tgt=None
for c in TARGET_CANDIDATES:
    if c in data.columns:
        tgt=c; break
if tgt is None:
    raise SystemExit("[ERROR] 타깃 컬럼(고령화율/총인구 등)이 데이터에 없습니다.")

feature_cols = [
    # 밀도/거리형
    "school_dens_km2","bus_dens_km2","hosp_dens_km2",
    "school_dens_log1p","bus_dens_log1p","hosp_dens_log1p",
    "school_nearest_km","bus_nearest_km","hosp_nearest_km",
    "school_nearest_log1p","bus_nearest_log1p","hosp_nearest_log1p",
    # 합성지표
    "YouthIndex","SeniorCareIndex",
    # 면적
    "area_km2",
]
feature_cols = [c for c in feature_cols if c in data.columns]
for c in feature_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce").fillna(0)

y = pd.to_numeric(data[tgt], errors="coerce")
mask = ~y.isna()
data = data.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)
X = data[feature_cols].copy()
groups = data["year"].astype(int)

print(f"[INFO] Target={tgt} | X shape={X.shape} | years={sorted(groups.unique())}")
print("[DEBUG] Features:", feature_cols)

# ================== 학습/평가 ==================
def run_cv(model_name, model, X, y, groups):
    gkf=GroupKFold(n_splits=len(YEARS))
    oof=np.full(len(y), np.nan); rows=[]
    for i,(tr,va) in enumerate(gkf.split(X,y,groups),1):
        model.fit(X.iloc[tr], y.iloc[tr])
        pred=model.predict(X.iloc[va])
        oof[va]=pred
        r2=r2_score(y.iloc[va],pred); mae=mean_absolute_error(y.iloc[va],pred); rmse=mean_squared_error(y.iloc[va],pred,squared=False)
        yr=int(groups.iloc[va].iloc[0]); rows.append({"fold":i,"valid_year":yr,"R2":r2,"MAE":mae,"RMSE":rmse})
        print(f"[{model_name}] Fold{i} (valid {yr})  R2={r2:.3f}  MAE={mae:.3f}  RMSE={rmse:.3f}")
    pd.DataFrame(rows).to_csv(OUT/f"cv_{model_name}.csv", index=False, encoding="utf-8-sig")

    # 중요도 저장 (Permutation + 내장 중요도)
    try:
        imp=permutation_importance(model, X, y, n_repeats=10, random_state=42)
        pd.DataFrame({"feature":X.columns,
                      "perm_importance_mean":imp.importances_mean,
                      "perm_importance_std":imp.importances_std})\
          .sort_values("perm_importance_mean", ascending=False)\
          .to_csv(OUT/f"feat_importance_perm_{model_name}.csv", index=False, encoding="utf-8-sig")
    except Exception as e:
        print("[WARN] permutation importance 실패:", e)
    try:
        fi = getattr(model, "feature_importances_", None)
        if fi is not None:
            pd.DataFrame({"feature":X.columns, "lgbm_importance":fi})\
              .sort_values("lgbm_importance", ascending=False)\
              .to_csv(OUT/f"feat_importance_{model_name}.csv", index=False, encoding="utf-8-sig")
    except Exception as e:
        print("[WARN] 내장 중요도 저장 실패:", e)

    # OOF 저장
    oof_df=data[[("EMD_NAME_STD" if "EMD_NAME_STD" in data.columns else None), ("ADM_CODE_STD" if "ADM_CODE_STD" in data.columns else None), "year"]].copy()
    # 위 줄에서 None 컬럼이 섞이지 않도록 정리
    ocols=[c for c in ["EMD_NAME_STD","ADM_CODE_STD","year"] if c in oof_df.columns]
    oof_df=oof_df[ocols]
    oof_df["y_true"]=y.values; oof_df["y_pred"]=oof
    oof_df.to_csv(OUT/f"oof_{model_name}.csv", index=False, encoding="utf-8-sig")

# LightGBM 하이퍼파라미터(합리적 기본값)
lgbm = LGBMRegressor(
    n_estimators=1600,
    learning_rate=0.03,
    num_leaves=63,
    max_depth=-1,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=1e-2,
    reg_lambda=1e-1,
    random_state=42,
    n_jobs=-1
)

run_cv("LightGBM", lgbm, X, y, groups)

# ================== 전체 학습 & 저장 ==================
lgbm.fit(X,y)
joblib.dump(lgbm, OUT/"final_lgbm.pkl")

data_out = data[[c for c in ["EMD_NAME_STD","ADM_CODE_STD","year",tgt]+feature_cols if c in data.columns]].copy()
data_out.to_csv(OUT/"model_input_panel.csv", index=False, encoding="utf-8-sig")

print("[DONE] LightGBM outputs saved →", OUT)



[INFO] 1) EMD 경계 읽기 & 키 생성


RuntimeError: 읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼=['geometry']

In [26]:
# === XGBoost 실행 전에 꼭 넣는 '드롭인 가드' ===
# (전처리에서 'data'를 만든 뒤 실행해야 합니다)
import os, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

# 출력 폴더 기본값
if 'OUT' not in globals():
    OUT = Path(r"C:\공모전\2025천안\outputs\XGBoost")
    OUT.mkdir(parents=True, exist_ok=True)

# 전처리 산출물 'data'가 없으면 중단
if 'data' not in globals():
    raise SystemExit("data DataFrame이 없습니다. (경계/POI/인구) 전처리 블록을 먼저 실행해 'data'를 만든 뒤 다시 실행하세요.")

# 교차검증용 연도 리스트
if 'YEARS' not in globals():
    YEARS = sorted(pd.unique(data['year']).astype(int).tolist())

# 타깃 자동 선택
TARGET_CANDIDATES = ["고령화율","aging_rate","65세이상비율","총인구","population","total_pop"]
if 'tgt' not in globals():
    tgt = next((c for c in TARGET_CANDIDATES if c in data.columns), None)
    if tgt is None:
        raise SystemExit("타깃 컬럼(고령화율/총인구 등)을 data에서 찾지 못했습니다.")

# 피처 목록(없으면 자동 축소)
if 'feature_cols' not in globals():
    base_feats = [
        "school_dens_km2","bus_dens_km2","hosp_dens_km2",
        "school_dens_log1p","bus_dens_log1p","hosp_dens_log1p",
        "school_nearest_km","bus_nearest_km","hosp_nearest_km",
        "school_nearest_log1p","bus_nearest_log1p","hosp_nearest_log1p",
        "YouthIndex","SeniorCareIndex","area_km2"
    ]
    feature_cols = [c for c in base_feats if c in data.columns]

# 결측/형 변환
for c in feature_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce").fillna(0)

y = pd.to_numeric(data[tgt], errors="coerce")
mask = ~y.isna()
data = data.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)


X = data[feature_cols].copy()
groups = data["year"].astype(int)
# === 드롭인 가드 끝 ===


SystemExit: data DataFrame이 없습니다. (경계/POI/인구) 전처리 블록을 먼저 실행해 'data'를 만든 뒤 다시 실행하세요.

In [27]:
#catboost

from pathlib import Path
import os, re, warnings
warnings.filterwarnings("ignore")

# ===== install check =====
try:
    from catboost import CatBoostRegressor
except ImportError:
    raise SystemExit(
        "CatBoost가 설치되어 있지 않습니다. 터미널에서:\n"
        "  pip install catboost\n"
        "설치 후 다시 실행하세요."
    )

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import permutation_importance
import joblib

# -------------------------------------------------
# 경로/옵션
# -------------------------------------------------
BASE = Path(r"C:\공모전\2025천안")
OUT  = BASE / "outputs" / "CatBoost"
OUT.mkdir(parents=True, exist_ok=True)

SHP_PATH = BASE / "천안시경계(24).shp"   # 또는 .gpkg (아래 read_shp_with_restore가 .shx 복구 시도)
POP_DIR  = BASE / "주민등록인구"          # 보유한 읍면동 인구 파일(월/연/스냅샷) 폴더
SCHOOL_CSV   = BASE / "초중등학교 데이터" / "전국초중등학교위치표준데이터.csv"
BUS_CSV      = BASE / "버스정류장 위치정보" / "국토교통부_전국 버스정류장 위치정보.csv"
HOSPITAL_CSV = BASE / "병원정보서비스" / "병원정보서비스.csv"

YEARS = [2020, 2021, 2022, 2023, 2024]
DEFAULT_PROJ = "EPSG:5179"  # Korea 2000 / Unified CS

# 타깃 후보(우선순위)
TARGET_CANDIDATES = ["고령화율","aging_rate","65세이상비율","총인구","population","total_pop"]

# -------------------------------------------------
# 유틸(컬럼 추정/정규화)
# -------------------------------------------------
os.environ["SHAPE_RESTORE_SHX"] = "YES"  # .shx 누락 자동복구 허용

EMD_NAME_CANDS = [
    "EMD_NM","EMD_KOR_NM","EMD_NAME","EMD_NM_KR","EMD_NM_KOR","EMD_NM_KO",
    "읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","adm_dr_nm","NAME","name","동리명"
]
EMD_CODE_CANDS = [
    "EMD_CD","EMDCD","ADM_CD","adm_cd","법정동코드","행정동코드","법정리코드","법정동코드(10자리)","HCODE","CODE"
]
POP_NAME_CANDS = ["읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","EMD_NM","동리명","NAME","name"]
POP_CODE_CANDS = ["행정동코드","법정동코드","EMD_CD","adm_cd","CODE","HCODE"]

def pick_col(cols, cands):
    # 정확 일치
    for c in cands:
        if c in cols: return c
    # 공백/대소문자 무시
    norm = {re.sub(r"\s+","",c).lower(): c for c in cols}
    for cand in cands:
        key = re.sub(r"\s+","",cand).lower()
        if key in norm: return norm[key]
    # 부분 포함
    for c in cols:
        for cand in cands:
            if re.sub(r"\s+","",cand).lower() in re.sub(r"\s+","",c).lower():
                return c
    return None

def standardize_emd_name(s):
    if pd.isna(s): return ""
    s = str(s).strip()
    s = re.sub(r"\s+", "", s)
    return s

def normalize_adm_code(series, zfill=10):
    return series.astype(str).str.replace(r"\D", "", regex=True).str.zfill(zfill)

def guess_emd_name_col(gdf: gpd.GeoDataFrame):
    cand = []
    for c in gdf.columns:
        if gdf[c].dtype == "object":
            s = gdf[c].astype(str).head(200)
            ratio = s.str.contains(r"(동|읍|면|리)$").mean()
            if ratio >= 0.2: cand.append((ratio, c))
    if cand:
        cand.sort(reverse=True)
        return cand[0][1]
    return None

def read_shp_with_restore(shp_path: str, default_crs=DEFAULT_PROJ):
    import fiona
    with fiona.Env(SHAPE_RESTORE_SHX="YES"):
        g = gpd.read_file(shp_path)
    if g.crs is None:
        g = g.set_crs(default_crs, allow_override=True)
    return g

def resolve_emd_keys_from_shp(shp_path: str) -> gpd.GeoDataFrame:
    g = read_shp_with_restore(shp_path)
    code_col = pick_col(g.columns, EMD_CODE_CANDS)
    name_col = pick_col(g.columns, EMD_NAME_CANDS) or guess_emd_name_col(g)
    if code_col is not None:
        g["ADM_CODE_STD"] = normalize_adm_code(g[code_col])
    if name_col is not None:
        g["EMD_NAME_STD"] = g[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in g.columns) and ("EMD_NAME_STD" not in g.columns):
        raise RuntimeError(f"읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼={list(g.columns)}")
    g["area_km2"] = g.geometry.area / 1_000_000.0
    return g

def detect_latlon(df: pd.DataFrame):
    lat_cands = ["위도","lat","latitude","Y","y","WGS84위도","Y좌표","Y좌표(WGS84)"]
    lon_cands = ["경도","lon","longitude","X","x","WGS84경도","X좌표","X좌표(WGS84)"]
    lat = pick_col(df.columns, lat_cands)
    lon = pick_col(df.columns, lon_cands)
    return lat, lon

def to_points(df: pd.DataFrame, lat: str, lon: str, crs="EPSG:4326") -> gpd.GeoDataFrame:
    d = df.dropna(subset=[lat, lon]).copy()
    d[lat] = pd.to_numeric(d[lat], errors="coerce")
    d[lon] = pd.to_numeric(d[lon], errors="coerce")
    d = d.dropna(subset=[lat, lon])
    return gpd.GeoDataFrame(d, geometry=gpd.points_from_xy(d[lon], d[lat]), crs=crs)

def project_like(g: gpd.GeoDataFrame, like: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    if g.crs is None: g = g.set_crs("EPSG:4326", allow_override=True)
    if like.crs is None: like = like.set_crs(DEFAULT_PROJ, allow_override=True)
    if g.crs != like.crs: g = g.to_crs(like.crs)
    return g

def sjoin_counts_and_nearest(points_gdf, emd, prefix):
    emd_cent = emd.copy(); emd_cent["centroid"] = emd.geometry.centroid
    emd_cent = emd_cent.set_geometry("centroid")
    # 최근접 거리
    if len(points_gdf) > 0:
        nn = gpd.sjoin_nearest(emd_cent, points_gdf[["geometry"]], how="left", distance_col=f"{prefix}_nearest_m")
        near = nn[[f"{prefix}_nearest_m"]].reindex(emd.index)
    else:
        near = pd.DataFrame({f"{prefix}_nearest_m":[np.nan]*len(emd)}, index=emd.index)
    # 포함 카운트
    if len(points_gdf) > 0:
        jj = gpd.sjoin(emd, points_gdf[["geometry"]], how="left", predicate="contains")
        counts = jj.groupby(jj.index).size().reindex(emd.index).fillna(0).astype(int)
    else:
        counts = pd.Series(0, index=emd.index, dtype=int)
    out = pd.DataFrame(index=emd.index)
    out[f"{prefix}_count"] = counts
    area = (emd.geometry.area/1_000_000.0).replace(0, np.nan)
    out[f"{prefix}_dens_km2"] = out[f"{prefix}_count"] / area
    out[f"{prefix}_dens_log1p"] = np.log1p(out[f"{prefix}_dens_km2"])
    out = pd.concat([out, near], axis=1)
    out[f"{prefix}_nearest_km"] = out[f"{prefix}_nearest_m"] / 1000.0
    out[f"{prefix}_nearest_log1p"] = np.log1p(out[f"{prefix}_nearest_km"])
    return out

def zscore(s: pd.Series):
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

def read_any_table(p: Path) -> pd.DataFrame:
    if p.suffix.lower() == ".csv":
        for enc in ("utf-8-sig","cp949","euc-kr"):
            try: return pd.read_csv(p, encoding=enc)
            except UnicodeDecodeError: continue
        return pd.read_csv(p)
    else:
        return pd.read_excel(p)

def find_pop_files(pop_dir: Path):
    pats=[r".*인구.*\.(csv|xlsx)$", r".*주민.*\.(csv|xlsx)$", r".*세대.*\.(csv|xlsx)$"]
    files=[]
    for fp in pop_dir.glob("**/*"):
        if fp.is_file() and any(re.match(p, fp.name, flags=re.I) for p in pats):
            files.append(fp)
    return files

def coerce_ym(s):
    if pd.isna(s): return (None,None)
    t=str(s).strip().replace(".","").replace("-","").replace("/","")
    if len(t)>=6 and t[:6].isdigit(): return (int(t[:4]), int(t[4:6]))
    if len(t)==4 and t.isdigit(): return (int(t),12)
    return (None,None)

def resolve_keys_from_pop(df_in: pd.DataFrame):
    df = df_in.copy()
    code_col = pick_col(df.columns, POP_CODE_CANDS)
    name_col = pick_col(df.columns, POP_NAME_CANDS)
    if code_col is not None:
        df["ADM_CODE_STD"] = normalize_adm_code(df[code_col])
    if name_col is not None:
        df["EMD_NAME_STD"] = df[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in df.columns) and ("EMD_NAME_STD" not in df.columns):
        # 값 패턴으로 이름 추정
        for c in df.columns:
            if df[c].dtype == "object":
                s = df[c].astype(str).head(200)
                if s.str.contains(r"(동|읍|면|리)$").mean() >= 0.2:
                    df["EMD_NAME_STD"] = df[c].astype(str).map(standardize_emd_name)
                    break
    return df

# -------------------------------------------------
# 데이터 로드 & 피처 생성
# -------------------------------------------------
print("[INFO] EMD 경계 읽기")
emd = resolve_emd_keys_from_shp(str(SHP_PATH))
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in emd.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in emd.columns
idx_col = "EMD_NAME_STD" if BOUNDARY_HAS_NAME else "ADM_CODE_STD"
emd = emd.set_index(idx_col, drop=False)

print("[INFO] POI 읽기")
def read_poi(csv_path: Path, emd):
    if not csv_path.exists():
        print(f"[WARN] POI 파일 없음: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    try: df = pd.read_csv(csv_path, encoding="utf-8-sig")
    except UnicodeDecodeError: df = pd.read_csv(csv_path, encoding="cp949")
    lat,lon = detect_latlon(df)
    if not (lat and lon):
        print(f"[WARN] 위경도 컬럼 자동탐색 실패 → 빈 처리: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    g = to_points(df, lat, lon, crs="EPSG:4326")
    return project_like(g, emd)

g_school = read_poi(SCHOOL_CSV, emd)
g_bus    = read_poi(BUS_CSV, emd)
g_hosp   = read_poi(HOSPITAL_CSV, emd)

print("[INFO] 정적 피처 생성")
f_school = sjoin_counts_and_nearest(g_school, emd, "school")
f_bus    = sjoin_counts_and_nearest(g_bus, emd, "bus")
f_hosp   = sjoin_counts_and_nearest(g_hosp, emd, "hosp")
feat_static = pd.concat([f_school, f_bus, f_hosp], axis=1)
feat_static["school_z"] = zscore(feat_static["school_dens_km2"].fillna(0))
feat_static["bus_z"]    = zscore(feat_static["bus_dens_km2"].fillna(0))
feat_static["hosp_z"]   = zscore(feat_static["hosp_dens_km2"].fillna(0))
# 합성 가중치 (도메인 priors)
feat_static["YouthIndex"]      = 0.5*feat_static["school_z"] + 0.3*feat_static["bus_z"] + 0.2*feat_static["hosp_z"]
feat_static["SeniorCareIndex"] = 0.6*feat_static["hosp_z"]  + 0.3*feat_static["bus_z"] + 0.1*feat_static["school_z"]
feat_static["area_km2"] = emd["area_km2"]

print("[INFO] 인구 표 읽기/연도화")
def build_population(pop_dir: Path, years: list):
    files = find_pop_files(pop_dir)
    if not files:
        print(f"[WARN] 인구 파일이 없습니다: {pop_dir}")
        return pd.DataFrame()
    frames=[]
    for p in files:
        df = read_any_table(p)
        df = resolve_keys_from_pop(df)
        # 연/월 추출
        ym_col = pick_col(df.columns, ["기준년월","년월","연월","yyyymm","기준일자","date","집계년월"])
        year_col = pick_col(df.columns, ["연도","년도","year","Year"])
        month_col= pick_col(df.columns, ["월","month","Month"])
        if ym_col:
            ym=df[ym_col].map(coerce_ym); df["__year__"]=[y for y,_ in ym]; df["__month__"]=[m for _,m in ym]
        elif year_col:
            df["__year__"]=pd.to_numeric(df[year_col], errors="coerce")
            df["__month__"]=pd.to_numeric(df[month_col], errors="coerce") if month_col else 12
        else:
            m=re.search(r"(20\d{2})(0[1-9]|1[0-2])", p.stem)
            if m: df["__year__"]=int(m.group(1)); df["__month__"]=int(m.group(2))
            else:
                m2=re.search(r"(20\d{2})", p.stem)
                df["__year__"]=int(m2.group(1)) if m2 else np.nan; df["__month__"]=12

        # 타깃 후보 준비
        total = pick_col(df.columns, ["총인구","인구","총인구수","population","total_pop"])
        aged  = pick_col(df.columns, ["65세이상","65세이상인구","고령인구","over65"])
        aging = pick_col(df.columns, ["고령화율","aging_rate","65세이상비율","ratio_65"])
        keep = ["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"] + [c for c in [total, aged, aging] if c]
        df2 = df[keep].copy()
        frames.append(df2)

    pop = pd.concat(frames, ignore_index=True)
    pop = pop.dropna(subset=["__year__"])
    pop = pop[pop["__year__"].isin(years)]
    # 월 자료면 최신월(연말/가장 최근) 선택
    pop = pop.sort_values(["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"])
    pop = pop.groupby(["ADM_CODE_STD","EMD_NAME_STD","__year__"], as_index=False).tail(1)

    # 고령화율 계산(없으면)
    if "aging_rate" not in pop.columns:
        tcol = pick_col(pop.columns, ["총인구","인구","총인구수","population","total_pop"])
        acol = pick_col(pop.columns, ["65세이상","65세이상인구","고령인구","over65"])
        if tcol and acol:
            pop["aging_rate"] = pd.to_numeric(pop[acol], errors="coerce")/pd.to_numeric(pop[tcol], errors="coerce")
    pop = pop.rename(columns={"__year__":"year"})
    return pop

pop_yearly = build_population(POP_DIR, YEARS)
if pop_yearly.empty:
    raise SystemExit("[ERROR] 인구 데이터가 비었습니다. POP_DIR에 파일을 넣고 다시 실행하세요.")

# 연패널(정적 피처는 연도별 복제)
panel = []
for y in YEARS:
    tmp = feat_static.copy()
    tmp["year"] = y
    # 병합 키 컬럼 추가
    if "EMD_NAME_STD" in emd.columns:
        tmp["EMD_NAME_STD"] = emd["EMD_NAME_STD"]
    if "ADM_CODE_STD" in emd.columns:
        tmp["ADM_CODE_STD"] = emd["ADM_CODE_STD"]
    panel.append(tmp.reset_index(drop=True))
panel = pd.concat(panel, axis=0)

# 병합 키 결정(코드 우선)
POP_HAS_CODE = "ADM_CODE_STD" in pop_yearly.columns
POP_HAS_NAME = "EMD_NAME_STD" in pop_yearly.columns
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in panel.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in panel.columns

if BOUNDARY_HAS_CODE and POP_HAS_CODE:
    key = "ADM_CODE_STD"
elif BOUNDARY_HAS_NAME and POP_HAS_NAME:
    key = "EMD_NAME_STD"
else:
    # 둘 다 있지 않으면 이름 기준 시도
    key = "EMD_NAME_STD" if POP_HAS_NAME else "ADM_CODE_STD"

data = pop_yearly.merge(panel, on=[key,"year"], how="left")

# -----------------------------
# 타깃 및 피처 준비  (여기가 X, y, groups 생성부!)
# -----------------------------
tgt = None
for cand in TARGET_CANDIDATES:
    if cand in data.columns:
        tgt = cand; break
if tgt is None:
    raise SystemExit("[ERROR] 타깃 컬럼(고령화율/총인구 등)이 데이터에 없습니다.")

feature_cols = [
    # 밀도/거리형
    "school_dens_km2","bus_dens_km2","hosp_dens_km2",
    "school_dens_log1p","bus_dens_log1p","hosp_dens_log1p",
    "school_nearest_km","bus_nearest_km","hosp_nearest_km",
    "school_nearest_log1p","bus_nearest_log1p","hosp_nearest_log1p",
    # 합성지표
    "YouthIndex","SeniorCareIndex",
    # 면적
    "area_km2",
]
feature_cols = [c for c in feature_cols if c in data.columns]
for c in feature_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce").fillna(0)

y = pd.to_numeric(data[tgt], errors="coerce")
mask = ~y.isna()
data = data.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)
X = data[feature_cols].copy()        # ←← 바로 이 줄이 없으면 'X is not defined'가 납니다!
groups = data["year"].astype(int)

print(f"[INFO] Target={tgt}, X shape={X.shape}, years={sorted(groups.unique())}")

# -------------------------------------------------
# 학습/검증
# -------------------------------------------------
def run_cv_catboost(X, y, groups):
    gkf = GroupKFold(n_splits=len(YEARS))
    oof = np.full(len(y), np.nan)
    rows = []

    for fold, (tr, va) in enumerate(gkf.split(X, y, groups=groups), 1):
        model = CatBoostRegressor(
            iterations=2200,
            learning_rate=0.03,
            depth=8,
            loss_function="RMSE",
            random_state=42,
            verbose=False
        )
        model.fit(X.iloc[tr], y.iloc[tr], verbose=False)
        pred = model.predict(X.iloc[va])
        oof[va] = pred

        r2   = r2_score(y.iloc[va], pred)
        mae  = mean_absolute_error(y.iloc[va], pred)
        rmse = mean_squared_error(y.iloc[va], pred, squared=False)
        yr   = int(groups.iloc[va].iloc[0])
        rows.append({"fold":fold,"valid_year":yr,"R2":r2,"MAE":mae,"RMSE":rmse})
        print(f"[CatBoost] Fold{fold} (valid {yr}) R2={r2:.3f} MAE={mae:.3f} RMSE={rmse:.3f}")

    cv_df = pd.DataFrame(rows)
    cv_df.to_csv(OUT/"cv_CatBoost.csv", index=False, encoding="utf-8-sig")

    # 중요도 저장 (Permutation + 내장 중요도)
    try:
        # 마지막 학습 모델로 대체 측정 (간단화)
        model_final = CatBoostRegressor(
            iterations=2200, learning_rate=0.03, depth=8, loss_function="RMSE",
            random_state=42, verbose=False
        )
        model_final.fit(X, y, verbose=False)
        imp_perm = permutation_importance(model_final, X, y, n_repeats=10, random_state=42)
        pd.DataFrame({
            "feature": X.columns,
            "perm_importance_mean": imp_perm.importances_mean,
            "perm_importance_std": imp_perm.importances_std
        }).sort_values("perm_importance_mean", ascending=False)\
          .to_csv(OUT/"feat_importance_perm_CatBoost.csv", index=False, encoding="utf-8-sig")

        imp_cb = pd.Series(model_final.get_feature_importance(), index=X.columns).sort_values(ascending=False)
        imp_cb.reset_index().rename(columns={"index":"feature",0:"catboost_importance"})\
              .to_csv(OUT/"feat_importance_cb_CatBoost.csv", index=False, encoding="utf-8-sig")

        # 최종 모델 저장
        joblib.dump(model_final, OUT/"final_catboost.pkl")
    except Exception as e:
        print("[WARN] 중요도/최종모델 저장 중 오류:", e)

    # OOF 저장
    oof_df = data[["EMD_NAME_STD","ADM_CODE_STD","year"]].copy()
    oof_df["y_true"] = y.values
    oof_df["y_pred"] = oof
    oof_df.to_csv(OUT/"oof_CatBoost.csv", index=False, encoding="utf-8-sig")

    # 입력 패널 저장(재현성)
    data_out = data[["EMD_NAME_STD","ADM_CODE_STD","year",tgt] + feature_cols].copy()
    data_out.to_csv(OUT/"model_input_panel.csv", index=False, encoding="utf-8-sig")

run_cv_catboost(X, y, groups)
print("[DONE] 결과 저장 폴더:", OUT)


[INFO] EMD 경계 읽기


RuntimeError: 읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼=['geometry']

In [20]:
#RandomBoost
from pathlib import Path
import warnings, re, math
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import permutation_importance
import joblib

# ========= 경로/옵션 =========
BASE = Path(r"C:\공모전\2025천안")
OUT  = BASE / "outputs" / "RandomForest"
OUT.mkdir(parents=True, exist_ok=True)

SHP_PATH = BASE / "천안시경계(24).shp"
POP_DIR  = BASE / "주민등록인구"  # 사용자 보유 데이터 폴더
SCHOOL_CSV   = BASE / "초중등학교 데이터" / "전국초중등학교위치표준데이터.csv"
BUS_CSV      = BASE / "버스정류장 위치정보" / "국토교통부_전국 버스정류장 위치정보.csv"
HOSPITAL_CSV = BASE / "병원정보서비스" / "병원정보서비스.csv"
FRESHMAN_DIR = BASE / "충원률"
YEARS = [2020, 2021, 2022, 2023, 2024]
DEFAULT_PROJ = "EPSG:5179"

TARGET_CANDIDATES = ["고령화율","aging_rate","65세이상비율","총인구","population","total_pop"]
EMD_NAME_CANDS = ["EMD_KOR_NM","EMD_NM","읍면동","EMD","adm_nm","EMD_NM_KR"]

# ========= 공통 유틸 =========
def pick_col(cols, cands):
    for c in cands:
        if c in cols: return c
    norm = {re.sub(r"\s+","",c).lower(): c for c in cols}
    for cand in cands:
        key = re.sub(r"\s+","",cand).lower()
        if key in norm: return norm[key]
    for c in cols:
        for cand in cands:
            if re.sub(r"\s+","",cand).lower() in re.sub(r"\s+","",c).lower():
                return c
    return None

def read_shp(path: Path) -> gpd.GeoDataFrame:
    g = gpd.read_file(path)
    if g.crs is None: g = g.set_crs(DEFAULT_PROJ, allow_override=True)
    return g

def detect_latlon(df):
    lat_cands = ["위도","lat","latitude","Y","y","WGS84위도","Y좌표","Y좌표(WGS84)"]
    lon_cands = ["경도","lon","longitude","X","x","WGS84경도","X좌표","X좌표(WGS84)"]
    lat = pick_col(df.columns, lat_cands); lon = pick_col(df.columns, lon_cands)
    return lat, lon

def to_points(df, lat, lon, crs="EPSG:4326"):
    d = df.dropna(subset=[lat,lon]).copy()
    d[lat]=pd.to_numeric(d[lat], errors="coerce"); d[lon]=pd.to_numeric(d[lon], errors="coerce")
    d = d.dropna(subset=[lat,lon])
    return gpd.GeoDataFrame(d, geometry=gpd.points_from_xy(d[lon], d[lat]), crs=crs)

def project_like(g, like):
    if g.crs is None: g = g.set_crs("EPSG:4326", allow_override=True)
    if like.crs is None: like = like.set_crs(DEFAULT_PROJ, allow_override=True)
    if g.crs != like.crs: g = g.to_crs(like.crs)
    return g

def zscore(s): return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

def standardize_emd_name(s):
    if pd.isna(s): return ""
    return re.sub(r"\s+","",str(s).strip())

def sjoin_counts_and_nearest(points_gdf, emd, prefix):
    emd_cent = emd.copy(); emd_cent["centroid"] = emd.geometry.centroid
    emd_cent = emd_cent.set_geometry("centroid")
    # 최근접 거리
    if len(points_gdf)>0:
        nn = gpd.sjoin_nearest(emd_cent, points_gdf[["geometry"]], how="left", distance_col=f"{prefix}_nearest_m")
        near = nn[[f"{prefix}_nearest_m"]].reindex(emd.index)
    else:
        near = pd.DataFrame({f"{prefix}_nearest_m":[np.nan]*len(emd)}, index=emd.index)
    # 포함 카운트
    if len(points_gdf)>0:
        jj = gpd.sjoin(emd, points_gdf[["geometry"]], how="left", predicate="contains")
        counts = jj.groupby(jj.index).size().reindex(emd.index).fillna(0).astype(int)
    else:
        counts = pd.Series(0, index=emd.index, dtype=int)
    out = pd.DataFrame(index=emd.index)
    out[f"{prefix}_count"]=counts
    area = (emd.geometry.area/1_000_000.0).replace(0,np.nan)
    out[f"{prefix}_dens_km2"] = out[f"{prefix}_count"]/area
    out[f"{prefix}_dens_log1p"]=np.log1p(out[f"{prefix}_dens_km2"])
    out = pd.concat([out, near], axis=1)
    out[f"{prefix}_nearest_km"]=out[f"{prefix}_nearest_m"]/1000.0
    out[f"{prefix}_nearest_log1p"]=np.log1p(out[f"{prefix}_nearest_km"])
    return out

def read_poi(csv_path, emd):
    if not csv_path.exists():
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    try: df = pd.read_csv(csv_path, encoding="utf-8-sig")
    except UnicodeDecodeError: df = pd.read_csv(csv_path, encoding="cp949")
    lat,lon = detect_latlon(df)
    if not (lat and lon): return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    g = to_points(df, lat, lon, crs="EPSG:4326")
    return project_like(g, emd)

def coerce_ym(s):
    if pd.isna(s): return (None,None)
    t=str(s).strip().replace(".","").replace("-","").replace("/","")
    if len(t)>=6 and t[:6].isdigit(): return (int(t[:4]), int(t[4:6]))
    if len(t)==4 and t.isdigit(): return (int(t),12)
    return (None,None)

def read_any_table(p: Path):
    if p.suffix.lower()==".csv":
        for enc in ("utf-8-sig","cp949","euc-kr"):
            try: return pd.read_csv(p, encoding=enc)
            except UnicodeDecodeError: continue
        return pd.read_csv(p)
    else:
        return pd.read_excel(p)

def find_pop_files(pop_dir: Path):
    pats=[r".*인구.*\.(csv|xlsx)$", r".*주민.*\.(csv|xlsx)$", r".*세대.*\.(csv|xlsx)$"]
    files=[]
    for fp in pop_dir.glob("**/*"):
        if fp.is_file() and any(re.match(p, fp.name, flags=re.I) for p in pats): files.append(fp)
    return files

def build_population(pop_dir, years, emd):
    files=find_pop_files(pop_dir); frames=[]
    for p in files:
        df=read_any_table(p)
        emd_col = pick_col(df.columns, ["읍면동","행정동","법정동","adm_nm","EMD","EMD_NM","동리명","법정동명"])
        if not emd_col: continue
        ym_col = pick_col(df.columns, ["기준년월","년월","연월","yyyymm","기준일자","date","집계년월"])
        year_col = pick_col(df.columns, ["연도","년도","year"])
        month_col= pick_col(df.columns, ["월","month"])
        if ym_col:
            ym=df[ym_col].map(coerce_ym); df["__year__"]=[y for y,_ in ym]; df["__month__"]=[m for _,m in ym]
        elif year_col:
            df["__year__"]=pd.to_numeric(df[year_col], errors="coerce")
            df["__month__"]=pd.to_numeric(df[month_col], errors="coerce") if month_col else 12
        else:
            m=re.search(r"(20\d{2})(0[1-9]|1[0-2])", p.stem)
            if m: df["__year__"]=int(m.group(1)); df["__month__"]=int(m.group(2))
            else:
                m2=re.search(r"(20\d{2})", p.stem)
                df["__year__"]=int(m2.group(1)) if m2 else np.nan; df["__month__"]=12
        total = pick_col(df.columns, ["총인구","인구","총인구수","population","total_pop"])
        aged  = pick_col(df.columns, ["65세이상","65세이상인구","고령인구","over65"])
        aging = pick_col(df.columns, ["고령화율","aging_rate","65세이상비율","ratio_65"])
        keep=[c for c in [emd_col,"__year__","__month__",total,aged,aging] if c]
        part=df[keep].copy()
        part["EMD_NAME_STD"]=part[emd_col].astype(str).map(standardize_emd_name)
        frames.append(part)
    if not frames: return pd.DataFrame()
    pop=pd.concat(frames, ignore_index=True)
    pop=pop.dropna(subset=["__year__"]).query("__year__ in @YEARS")
    # 월별이 있으면 최신월, 아니면 단일행
    pop=pop.sort_values(["EMD_NAME_STD","__year__","__month__"]).groupby(["EMD_NAME_STD","__year__"]).tail(1)
    # 고령화율 계산
    if pick_col(pop.columns, ["고령화율","aging_rate","65세이상비율","ratio_65"]) is None:
        tcol = pick_col(pop.columns, ["총인구","인구","총인구수","population","total_pop"])
        acol = pick_col(pop.columns, ["65세이상","65세이상인구","고령인구","over65"])
        if tcol and acol:
            pop["aging_rate"]=pd.to_numeric(pop[acol], errors="coerce")/pd.to_numeric(pop[tcol], errors="coerce")
    return pop

# ========= 데이터 준비 =========
print("[INFO] Load EMD boundary")
emd = read_shp(SHP_PATH)
emd_name = pick_col(emd.columns, EMD_NAME_CANDS) or EMD_NAME_CANDS[0]
emd["EMD_NAME_STD"]=emd[emd_name].astype(str).map(standardize_emd_name)
emd = emd.set_index("EMD_NAME_STD", drop=False)
emd["area_km2"]=(emd.geometry.area/1_000_000.0)

print("[INFO] Load POIs")
g_school = read_poi(SCHOOL_CSV, emd)
g_bus    = read_poi(BUS_CSV, emd)
g_hosp   = read_poi(HOSPITAL_CSV, emd)

print("[INFO] Build static features")
f_school = sjoin_counts_and_nearest(g_school, emd, "school")
f_bus    = sjoin_counts_and_nearest(g_bus, emd, "bus")
f_hosp   = sjoin_counts_and_nearest(g_hosp, emd, "hosp")
feat = pd.concat([f_school, f_bus, f_hosp], axis=1)
feat["school_z"]=zscore(feat["school_dens_km2"].fillna(0))
feat["bus_z"]   =zscore(feat["bus_dens_km2"].fillna(0))
feat["hosp_z"]  =zscore(feat["hosp_dens_km2"].fillna(0))
# 합성지표(가중치: Youth/SeniorCare)
feat["YouthIndex"]      = 0.5*feat["school_z"] + 0.3*feat["bus_z"] + 0.2*feat["hosp_z"]
feat["SeniorCareIndex"] = 0.6*feat["hosp_z"]  + 0.3*feat["bus_z"] + 0.1*feat["school_z"]
feat["area_km2"]=emd["area_km2"]

print("[INFO] Load population (annual)")
pop = build_population(POP_DIR, YEARS, emd)
if pop.empty: raise SystemExit("[ERROR] 인구 데이터가 비어있습니다. POP_DIR 내용을 확인하세요.")

# 연패널 구성(정적 피처는 연도별 복제)
panel=[]
for y in YEARS:
    tmp = feat.copy(); tmp["year"]=y; tmp["EMD_NAME_STD"]=tmp.index
    panel.append(tmp.reset_index(drop=True))
panel = pd.concat(panel, axis=0)

data = pop.merge(panel, on=["EMD_NAME_STD","year"], how="left")

# 타깃 선택
tgt=None
for c in TARGET_CANDIDATES:
    if c in data.columns: tgt=c; break
if tgt is None:
    raise SystemExit("[ERROR] 타깃 컬럼(고령화율/총인구 등)을 찾을 수 없습니다.")

feature_cols = [
    "school_dens_km2","bus_dens_km2","hosp_dens_km2",
    "school_dens_log1p","bus_dens_log1p","hosp_dens_log1p",
    "school_nearest_km","bus_nearest_km","hosp_nearest_km",
    "school_nearest_log1p","bus_nearest_log1p","hosp_nearest_log1p",
    "YouthIndex","SeniorCareIndex","area_km2"
]
feature_cols=[c for c in feature_cols if c in data.columns]
for c in feature_cols: data[c]=data[c].fillna(0)

X=data[feature_cols].copy()
y=pd.to_numeric(data[tgt], errors="coerce")
groups=data["year"]

# ========= 학습/평가 =========
def run_cv(model_name, model, X, y, groups):
    gkf=GroupKFold(n_splits=len(YEARS))
    oof=np.full(len(y), np.nan)
    rows=[]
    for i,(tr,va) in enumerate(gkf.split(X,y,groups),1):
        model.fit(X.iloc[tr], y.iloc[tr])
        pred=model.predict(X.iloc[va])
        oof[va]=pred
        r2=r2_score(y.iloc[va],pred)
        mae=mean_absolute_error(y.iloc[va],pred)
        rmse=mean_squared_error(y.iloc[va],pred,squared=False)
        yr=int(groups.iloc[va].iloc[0])
        rows.append({"fold":i,"valid_year":yr,"R2":r2,"MAE":mae,"RMSE":rmse})
        print(f"[{model_name}] Fold{i} (valid {yr})  R2={r2:.3f}  MAE={mae:.3f}  RMSE={rmse:.3f}")
    pd.DataFrame(rows).to_csv(OUT/f"cv_{model_name}.csv", index=False, encoding="utf-8-sig")
    # Permutation importance
    try:
        imp=permutation_importance(model, X, y, n_repeats=10, random_state=42)
        pd.DataFrame({"feature":X.columns,
                      "perm_importance_mean":imp.importances_mean,
                      "perm_importance_std":imp.importances_std})\
          .sort_values("perm_importance_mean", ascending=False)\
          .to_csv(OUT/f"feat_importance_{model_name}.csv", index=False, encoding="utf-8-sig")
    except Exception as e:
        print("[WARN] permutation importance 실패:", e)
    out = data[["EMD_NAME_STD","year"]].copy()
    out["y_true"]=y.values; out["y_pred"]=oof
    out.to_csv(OUT/f"oof_{model_name}.csv", index=False, encoding="utf-8-sig")

rf = RandomForestRegressor(
    n_estimators=700, max_depth=None, max_features="sqrt",
    min_samples_split=2, min_samples_leaf=1,
    random_state=42, n_jobs=-1
)
run_cv("RandomForest", rf, X, y, groups)

# ========= 전체학습 & 저장 =========
rf.fit(X,y)
joblib.dump(rf, OUT/"final_rf.pkl")
data_out = data[["EMD_NAME_STD","year",tgt]+feature_cols]
data_out.to_csv(OUT/"model_input_panel.csv", index=False, encoding="utf-8-sig")
print("[DONE] RandomForest outputs saved →", OUT)


[INFO] Load EMD boundary


KeyError: 'EMD_KOR_NM'

In [None]:

# LightGBM

from pathlib import Path
import warnings, re, os
warnings.filterwarnings("ignore")

import pandas as pd, numpy as np, geopandas as gpd
from shapely.geometry import Point
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import permutation_importance
from lightgbm import LGBMRegressor
import joblib

# ================== 경로/상수 ==================
BASE = Path(r"C:\공모전\2025천안")
OUT  = BASE / "outputs" / "LightGBM_3_1_1"; OUT.mkdir(parents=True, exist_ok=True)

# ▶ POP 입력(둘 중 택1): (A) 폴더 스캔 or (B) 단일 CSV 직접 지정
# (A) 폴더 스캔 버전
POP_DIR  = BASE / "주민등록인구"

# (B) 단일 CSV 버전: 아래 두 줄 주석 해제하고 사용 (파일 경로 정확히!)
# POP_FILE = BASE / "시군구별 이동자" / "202001_202412_주민등록인구및세대현황_월간.csv"
# POP_INPUT_IS_FILE = True
POP_INPUT_IS_FILE = False  # (A) 폴더 스캔 사용 시 False

SHP_PATH = BASE / "천안시경계(24).shp"   # .shx 없어도 자동 복구 시도
SCHOOL_CSV   = BASE / "초중등학교 데이터" / "전국초중등학교위치표준데이터.csv"
BUS_CSV      = BASE / "버스정류장 위치정보" / "국토교통부_전국 버스정류장 위치정보.csv"
HOSPITAL_CSV = BASE / "병원정보서비스" / "병원정보서비스.csv"

# 기본 연도 세트 (필요 시 2025 추가 가능)
YEARS = [2020, 2021, 2022, 2023, 2024]
DEFAULT_PROJ = "EPSG:5179"  # Korea 2000 / Unified CS
RANDOM_STATE = 42

TARGET_CANDIDATES = ["고령화율","aging_rate","65세이상비율","총인구","population","total_pop"]

# shp 읽을 때 .shx 자동 복구 허용
os.environ["SHAPE_RESTORE_SHX"] = "YES"

# 이름/코드 컬럼 후보(경계/인구 모두 대응)
EMD_NAME_CANDS = [
    "EMD_NM","EMD_KOR_NM","EMD_NAME","EMD_NM_KR","EMD_NM_KOR","EMD_NM_KO",
    "읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","adm_dr_nm","NAME","name","동리명"
]
EMD_CODE_CANDS = [
    "EMD_CD","EMDCD","ADM_CD","adm_cd","법정동코드","행정동코드","법정리코드","법정동코드(10자리)","HCODE","CODE"
]
POP_NAME_CANDS = ["읍면동","읍면동명","행정동","행정동명","법정동","법정동명","adm_nm","EMD_NM","동리명","NAME","name"]
POP_CODE_CANDS = ["행정동코드","법정동코드","EMD_CD","adm_cd","CODE","HCODE"]

SIDO_CANDS = ["시도","시도명","광역시도","행정구역(시도)"]
SIG_CANDS  = ["시군구","시군구명","자치구","구","군","행정구역(시군구)"]

# ================== 유틸 함수 ==================
def pick_col(cols, cands):
    """정확일치 → 공백/대소문자 무시 → 부분포함 순으로 첫 컬럼 반환"""
    for c in cands:
        if c in cols: return c
    norm = {re.sub(r"\s+","",c).lower(): c for c in cols}
    for cand in cands:
        key = re.sub(r"\s+","",cand).lower()
        if key in norm: return norm[key]
    for c in cols:
        for cand in cands:
            if re.sub(r"\s+","",cand).lower() in re.sub(r"\s+","",c).lower():
                return c
    return None

def standardize_emd_name(s):
    if pd.isna(s): return ""
    s = str(s).strip()
    s = re.sub(r"\s+","", s)
    # 필요하면 접두부(천안시동남구/서북구) 제거 룰을 추가 가능
    return s

def normalize_adm_code(series, zfill=10):
    """숫자만 남겨 10자리(법정동코드 관용) zero-fill"""
    return series.astype(str).str.replace(r"\D","", regex=True).str.zfill(zfill)

def guess_emd_name_col(gdf: gpd.GeoDataFrame):
    """한글+동/읍/면/리로 끝나는 비율 높은 문자열 컬럼 자동 후보"""
    cand=[]
    for c in gdf.columns:
        if gdf[c].dtype == "object":
            s = gdf[c].astype(str).head(200)
            ratio = s.str.contains(r"(동|읍|면|리)$").mean()
            if ratio >= 0.2:
                cand.append((ratio, c))
    if cand:
        cand.sort(reverse=True)
        return cand[0][1]
    return None

def read_shp_with_restore(shp_path: str, default_crs=DEFAULT_PROJ):
    import fiona
    with fiona.Env(SHAPE_RESTORE_SHX="YES"):
        g = gpd.read_file(shp_path)
    if g.crs is None:
        g = g.set_crs(default_crs, allow_override=True)
    return g

def resolve_emd_keys_from_shp(shp_path: str) -> gpd.GeoDataFrame:
    """경계에서 이름/코드 표준 키 생성 + 면적 km² 추가"""
    g = read_shp_with_restore(shp_path)
    code_col = pick_col(g.columns, EMD_CODE_CANDS)
    name_col = pick_col(g.columns, EMD_NAME_CANDS) or guess_emd_name_col(g)
    if code_col is not None:
        g["ADM_CODE_STD"] = normalize_adm_code(g[code_col])
    if name_col is not None:
        g["EMD_NAME_STD"] = g[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in g.columns) and ("EMD_NAME_STD" not in g.columns):
        raise RuntimeError(f"읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼={list(g.columns)}")
    g["area_km2"] = g.geometry.area / 1_000_000.0
    return g

def detect_latlon(df: pd.DataFrame):
    lat_cands = ["위도","lat","latitude","Y","y","WGS84위도","Y좌표","Y좌표(WGS84)"]
    lon_cands = ["경도","lon","longitude","X","x","WGS84경도","X좌표","X좌표(WGS84)"]
    lat = pick_col(df.columns, lat_cands)
    lon = pick_col(df.columns, lon_cands)
    return lat, lon

def to_points(df: pd.DataFrame, lat: str, lon: str, crs="EPSG:4326") -> gpd.GeoDataFrame:
    d = df.dropna(subset=[lat, lon]).copy()
    d[lat] = pd.to_numeric(d[lat], errors="coerce")
    d[lon] = pd.to_numeric(d[lon], errors="coerce")
    d = d.dropna(subset=[lat, lon])
    return gpd.GeoDataFrame(d, geometry=gpd.points_from_xy(d[lon], d[lat]), crs=crs)

def project_like(g: gpd.GeoDataFrame, like: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    if g.crs is None: g = g.set_crs("EPSG:4326", allow_override=True)
    if like.crs is None: like = like.set_crs(DEFAULT_PROJ, allow_override=True)
    if g.crs != like.crs: g = g.to_crs(like.crs)
    return g

def sjoin_counts_and_nearest(points_gdf, emd, prefix):
    """EMD 내 포함 카운트, km²당 밀도, 최근접거리(km) 및 로그보조"""
    emd_cent = emd.copy(); emd_cent["centroid"] = emd.geometry.centroid
    emd_cent = emd_cent.set_geometry("centroid")
    # 최근접 거리
    if len(points_gdf) > 0:
        nn = gpd.sjoin_nearest(emd_cent, points_gdf[["geometry"]], how="left", distance_col=f"{prefix}_nearest_m")
        near = nn[[f"{prefix}_nearest_m"]].reindex(emd.index)
    else:
        near = pd.DataFrame({f"{prefix}_nearest_m":[np.nan]*len(emd)}, index=emd.index)
    # 포함 카운트
    if len(points_gdf) > 0:
        jj = gpd.sjoin(emd, points_gdf[["geometry"]], how="left", predicate="contains")
        counts = jj.groupby(jj.index).size().reindex(emd.index).fillna(0).astype(int)
    else:
        counts = pd.Series(0, index=emd.index, dtype=int)
    out = pd.DataFrame(index=emd.index)
    out[f"{prefix}_count"] = counts
    area = (emd.geometry.area/1_000_000.0).replace(0, np.nan)
    out[f"{prefix}_dens_km2"] = out[f"{prefix}_count"] / area
    out[f"{prefix}_dens_log1p"] = np.log1p(out[f"{prefix}_dens_km2"])
    out = pd.concat([out, near], axis=1)
    out[f"{prefix}_nearest_km"] = out[f"{prefix}_nearest_m"] / 1000.0
    out[f"{prefix}_nearest_log1p"] = np.log1p(out[f"{prefix}_nearest_km"])
    return out

def zscore(s: pd.Series):
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

def read_any_table(p: Path) -> pd.DataFrame:
    if p.suffix.lower() == ".csv":
        for enc in ("utf-8-sig","cp949","euc-kr"):
            try: return pd.read_csv(p, encoding=enc)
            except UnicodeDecodeError: continue
        return pd.read_csv(p)
    else:
        return pd.read_excel(p)

def find_pop_files(pop_dir: Path):
    pats=[r".*인구.*\.(csv|xlsx)$", r".*주민.*\.(csv|xlsx)$", r".*세대.*\.(csv|xlsx)$"]
    files=[]
    for fp in pop_dir.glob("**/*"):
        if fp.is_file() and any(re.match(p, fp.name, flags=re.I) for p in pats):
            files.append(fp)
    return files

def coerce_ym(s):
    if pd.isna(s): return (None,None)
    t=str(s).strip().replace(".","").replace("-","").replace("/","")
    if len(t)>=6 and t[:6].isdigit(): return (int(t[:4]), int(t[4:6]))
    if len(t)==4 and t.isdigit(): return (int(t),12)
    return (None,None)

def resolve_keys_from_pop(df_in: pd.DataFrame):
    """인구 표에서 이름/코드 표준 키 생성(없으면 패턴으로 이름 유추)"""
    df = df_in.copy()
    code_col = pick_col(df.columns, POP_CODE_CANDS)
    name_col = pick_col(df.columns, POP_NAME_CANDS)
    if code_col is not None:
        df["ADM_CODE_STD"] = normalize_adm_code(df[code_col])
    if name_col is not None:
        df["EMD_NAME_STD"] = df[name_col].astype(str).map(standardize_emd_name)
    if ("ADM_CODE_STD" not in df.columns) and ("EMD_NAME_STD" not in df.columns):
        # 값 패턴으로 이름 추정
        for c in df.columns:
            if df[c].dtype == "object":
                s = df[c].astype(str).head(200)
                if s.str.contains(r"(동|읍|면|리)$").mean() >= 0.2:
                    df["EMD_NAME_STD"] = df[c].astype(str).map(standardize_emd_name)
                    break
    return df

def pick_pop_value_cols(df: pd.DataFrame):
    total = pick_col(df.columns, ["총인구","인구","총인구수","population","total_pop"])
    aged  = pick_col(df.columns, ["65세이상","65세이상인구","고령인구","over65"])
    aging = pick_col(df.columns, ["고령화율","aging_rate","65세이상비율","ratio_65"])
    return total, aged, aging

# ===== 인구 로더 (A) 폴더 스캔 버전 =====
def build_population_from_dir(pop_dir: Path, years: list):
    files = find_pop_files(pop_dir)
    if not files:
        print(f"[WARN] 인구 파일이 없습니다: {pop_dir}")
        return pd.DataFrame()
    frames=[]
    for p in files:
        df = read_any_table(p)
        df = resolve_keys_from_pop(df)
        ym_col = pick_col(df.columns, ["기준년월","년월","연월","yyyymm","기준일자","date","집계년월"])
        year_col = pick_col(df.columns, ["연도","년도","year","Year"])
        month_col= pick_col(df.columns, ["월","month","Month"])
        if ym_col:
            ym=df[ym_col].map(coerce_ym); df["__year__"]=[y for y,_ in ym]; df["__month__"]=[m for _,m in ym]
        elif year_col:
            df["__year__"]=pd.to_numeric(df[year_col], errors="coerce")
            df["__month__"]=pd.to_numeric(df[month_col], errors="coerce") if month_col else 12
        else:
            m=re.search(r"(20\d{2})(0[1-9]|1[0-2])", p.stem)
            if m: df["__year__"]=int(m.group(1)); df["__month__"]=int(m.group(2))
            else:
                m2=re.search(r"(20\d{2})", p.stem)
                df["__year__"]=int(m2.group(1)) if m2 else np.nan; df["__month__"]=12
        total, aged, aging = pick_pop_value_cols(df)
        keep = ["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"] + [c for c in [total, aged, aging] if c]
        frames.append(df[keep].copy())

    pop = pd.concat(frames, ignore_index=True)
    pop = pop.dropna(subset=["__year__"])
    pop = pop[pop["__year__"].isin(years)]
    pop = pop.sort_values(["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"])
    pop = pop.groupby(["ADM_CODE_STD","EMD_NAME_STD","__year__"], as_index=False).tail(1)

    if "aging_rate" not in pop.columns:
        tcol, acol = pick_col(pop.columns, ["총인구","인구","총인구수","population","total_pop"]), pick_col(pop.columns, ["65세이상","65세이상인구","고령인구","over65"])
        if tcol and acol:
            pop["aging_rate"] = pd.to_numeric(pop[acol], errors="coerce")/pd.to_numeric(pop[tcol], errors="coerce")
    pop = pop.rename(columns={"__year__":"year"})
    return pop

# ===== 인구 로더 (B) 단일 CSV 버전 =====
def build_population_from_single_csv(pop_file: Path, years: list) -> pd.DataFrame:
    df = read_any_table(pop_file)
    df = resolve_keys_from_pop(df)

    # (선택) 지역 필터: 충청남도 & 천안시 동남구/서북구에 한정
    sido_col = pick_col(df.columns, SIDO_CANDS)
    sig_col  = pick_col(df.columns, SIG_CANDS)
    if sido_col is not None:
        df = df[df[sido_col].astype(str).str.contains("충청남도", na=False)]
    if sig_col is not None:
        df = df[df[sig_col].astype(str).isin(["천안시 동남구","천안시 서북구"])]

    ym_col    = pick_col(df.columns, ["기준년월","년월","연월","yyyymm","기준일자","date","집계년월"])
    year_col  = pick_col(df.columns, ["연도","년도","year","Year"])
    month_col = pick_col(df.columns, ["월","month","Month"])
    if ym_col:
        ym=df[ym_col].map(coerce_ym); df["__year__"]=[y for y,_ in ym]; df["__month__"]=[m for _,m in ym]
    elif year_col:
        df["__year__"]=pd.to_numeric(df[year_col], errors="coerce")
        df["__month__"]=pd.to_numeric(df[month_col], errors="coerce") if month_col else 12
    else:
        m=re.search(r"(20\d{2})(0[1-9]|1[0-2])", pop_file.stem)
        if m: df["__year__"]=int(m.group(1)); df["__month__"]=int(m.group(2))
        else:
            m2=re.search(r"(20\d{2})", pop_file.stem)
            df["__year__"]=int(m2.group(1)) if m2 else np.nan; df["__month__"]=12

    total, aged, aging = pick_pop_value_cols(df)
    has_emd = ("EMD_NAME_STD" in df.columns and df["EMD_NAME_STD"].notna().any()) or \
              ("ADM_CODE_STD" in df.columns and df["ADM_CODE_STD"].str.len().fillna(0).astype(int).ge(8).any())

    df = df.dropna(subset=["__year__"])
    df = df[df["__year__"].isin(years)]
    df = df.sort_values(["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__"])

    if has_emd:
        keep = [c for c in ["ADM_CODE_STD","EMD_NAME_STD","__year__","__month__", total, aged, aging] if c]
        part = df[keep].copy()
        part = part.groupby(["ADM_CODE_STD","EMD_NAME_STD","__year__"], as_index=False).tail(1)
        if (aging is None) and total and aged:
            part["aging_rate"] = pd.to_numeric(part[aged], errors="coerce") / pd.to_numeric(part[total], errors="coerce")
        part = part.rename(columns={"__year__":"year"})
        return part
    else:
        raise SystemExit("단일 CSV가 시군구 수준으로 보입니다. EMD 레벨 키가 없어 EMD 모델에 곧바로 쓸 수 없습니다.")

# ================== 데이터 로드/피처 구성 ==================
print("[INFO] 1) EMD 경계 읽기 & 키 생성")
emd = resolve_emd_keys_from_shp(str(SHP_PATH))
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in emd.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in emd.columns
idx_col = "EMD_NAME_STD" if BOUNDARY_HAS_NAME else "ADM_CODE_STD"
emd = emd.set_index(idx_col, drop=False)

print("[INFO] 2) POI 읽기 → 포인트 변환 → 경계 좌표계로 투영")
def read_poi(csv_path: Path, emd):
    if not csv_path.exists():
        print(f"[WARN] POI 파일 없음: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    try: df = pd.read_csv(csv_path, encoding="utf-8-sig")
    except UnicodeDecodeError: df = pd.read_csv(csv_path, encoding="cp949")
    lat, lon = detect_latlon(df)
    if not (lat and lon):
        print(f"[WARN] 위경도 컬럼 자동탐색 실패 → 빈 처리: {csv_path.name}")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    g = to_points(df, lat, lon, crs="EPSG:4326")
    return project_like(g, emd)

g_school = read_poi(SCHOOL_CSV, emd)
g_bus    = read_poi(BUS_CSV, emd)
g_hosp   = read_poi(HOSPITAL_CSV, emd)

print("[INFO] 3) EMD별 정적 피처(카운트/밀도/최근접거리) + 합성지표")
f_school = sjoin_counts_and_nearest(g_school, emd, "school")
f_bus    = sjoin_counts_and_nearest(g_bus, emd, "bus")
f_hosp   = sjoin_counts_and_nearest(g_hosp, emd, "hosp")
feat_static = pd.concat([f_school, f_bus, f_hosp], axis=1)

feat_static["school_z"] = zscore(feat_static["school_dens_km2"].fillna(0))
feat_static["bus_z"]    = zscore(feat_static["bus_dens_km2"].fillna(0))
feat_static["hosp_z"]   = zscore(feat_static["hosp_dens_km2"].fillna(0))

# 합성지표(도메인 priors)
feat_static["YouthIndex"]      = 0.5*feat_static["school_z"] + 0.3*feat_static["bus_z"] + 0.2*feat_static["hosp_z"]
feat_static["SeniorCareIndex"] = 0.6*feat_static["hosp_z"]  + 0.3*feat_static["bus_z"] + 0.1*feat_static["school_z"]
feat_static["area_km2"] = emd["area_km2"]

print("[INFO] 4) 인구 표 읽기/연도화")
if POP_INPUT_IS_FILE:
    pop_yearly = build_population_from_single_csv(POP_FILE, YEARS)
else:
    pop_yearly = build_population_from_dir(POP_DIR, YEARS)

if pop_yearly.empty:
    raise SystemExit("[ERROR] 인구 데이터가 비었습니다. POP 입력을 확인하세요.")

print("[INFO] 5) 연패널 구성(정적 피처 연도별 복제) + 병합")
panel=[]
for y in YEARS:
    tmp = feat_static.copy()
    tmp["year"] = y
    if "EMD_NAME_STD" in emd.columns: tmp["EMD_NAME_STD"] = emd["EMD_NAME_STD"]
    if "ADM_CODE_STD" in emd.columns: tmp["ADM_CODE_STD"] = emd["ADM_CODE_STD"]
    panel.append(tmp.reset_index(drop=True))
panel = pd.concat(panel, axis=0)

# 병합 키 결정 (코드 우선, 그다음 이름)
POP_HAS_CODE = "ADM_CODE_STD" in pop_yearly.columns
POP_HAS_NAME = "EMD_NAME_STD" in pop_yearly.columns
BOUNDARY_HAS_CODE = "ADM_CODE_STD" in panel.columns
BOUNDARY_HAS_NAME = "EMD_NAME_STD" in panel.columns

if BOUNDARY_HAS_CODE and POP_HAS_CODE:
    key = "ADM_CODE_STD"
elif BOUNDARY_HAS_NAME and POP_HAS_NAME:
    key = "EMD_NAME_STD"
else:
    key = "EMD_NAME_STD" if POP_HAS_NAME else "ADM_CODE_STD"

data = pop_yearly.merge(panel, on=[key,"year"], how="left")

# ================== 타깃/피처(X, y, groups) ==================
tgt=None
for c in TARGET_CANDIDATES:
    if c in data.columns:
        tgt=c; break
if tgt is None:
    raise SystemExit("[ERROR] 타깃 컬럼(고령화율/총인구 등)이 데이터에 없습니다.")

feature_cols = [
    "school_dens_km2","bus_dens_km2","hosp_dens_km2",
    "school_dens_log1p","bus_dens_log1p","hosp_dens_log1p",
    "school_nearest_km","bus_nearest_km","hosp_nearest_km",
    "school_nearest_log1p","bus_nearest_log1p","hosp_nearest_log1p",
    "YouthIndex","SeniorCareIndex","area_km2",
]
feature_cols = [c for c in feature_cols if c in data.columns]
for c in feature_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce").fillna(0)

y_all = pd.to_numeric(data[tgt], errors="coerce")
mask = ~y_all.isna()
data = data.loc[mask].reset_index(drop=True)
y_all = y_all.loc[mask].reset_index(drop=True)
X_all = data[feature_cols].copy()
years_col = data["year"].astype(int)

print(f"[INFO] Target={tgt} | X={X_all.shape} | Years={sorted(years_col.unique())}")
print("[DEBUG] Features:", feature_cols)

# ================== 3:1:1 연도 분리 함수 ==================
def split_years_3_1_1(years_sorted):
    """
    years_sorted: 정렬된 유니크 연도 리스트
    3:1:1 비율로 (Train, Valid, Test) 연도 집합을 반환.
    예) [2020,2021,2022,2023,2024] → (2020~2022, 2023, 2024)
    """
    k = len(years_sorted)
    if k < 3:
        # 연도가 3개 미만이면 보수적 분리
        if k == 2:
            return years_sorted[:1], [years_sorted[1]], []   # Train=첫해, Valid=둘째해, Test=없음(경고)
        elif k == 1:
            return [years_sorted[0]], [], []                 # 전부 Train
        else:
            return [], [], []
    # 60/20/20 할당 (반올림 대신 고정 규칙: 앞 60%, 다음 20%, 끝 20%)
    n_train = max(1, int(round(k * 0.6)))
    n_valid = max(1, int(round(k * 0.2)))
    # 합이 k를 초과/미만할 수 있어 보정
    if n_train + n_valid > k - 1:         # 최소 1년은 test 확보
        n_valid = max(1, k - n_train - 1)
    n_test = k - n_train - n_valid
    if n_test == 0:                        # 그래도 0이면 train에서 1년 빼서 test로
        n_train = max(1, n_train - 1); n_test = 1
    train_years = years_sorted[:n_train]
    valid_years = years_sorted[n_train:n_train+n_valid]
    test_years  = years_sorted[n_train+n_valid:]
    return train_years, valid_years, test_years

years_sorted = sorted(years_col.unique().tolist())
train_years, valid_years, test_years = split_years_3_1_1(years_sorted)

print(f"[INFO] Split years → Train={train_years} | Valid={valid_years} | Test={test_years}")

tr_mask = years_col.isin(train_years)
va_mask = years_col.isin(valid_years)
te_mask = years_col.isin(test_years)

X_tr, y_tr = X_all[tr_mask], y_all[tr_mask]
X_va, y_va = X_all[va_mask], y_all[va_mask]
X_te, y_te = X_all[te_mask], y_all[te_mask]

# ================== LightGBM 설정 ==================
lgbm_val = LGBMRegressor(
    n_estimators=1600,
    learning_rate=0.03,
    num_leaves=63,
    max_depth=-1,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=1e-2,
    reg_lambda=1e-1,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# ---------- 1) Train → Valid 성능 ----------
print("[INFO] Fit on TRAIN → evaluate on VALID")
lgbm_val.fit(X_tr, y_tr)
pred_va = lgbm_val.predict(X_va)

va_r2   = r2_score(y_va, pred_va) if len(y_va)>0 else np.nan
va_mae  = mean_absolute_error(y_va, pred_va) if len(y_va)>0 else np.nan
va_rmse = mean_squared_error(y_va, pred_va, squared=False) if len(y_va)>0 else np.nan
print(f"[VALID] R2={va_r2:.4f}  MAE={va_mae:.4f}  RMSE={va_rmse:.4f}")

# ---------- 2) Train+Valid → Test 성능 ----------
print("[INFO] Fit on TRAIN+VALID → evaluate on TEST")
X_trva = pd.concat([X_tr, X_va], axis=0)
y_trva = pd.concat([y_tr, y_va], axis=0)

lgbm_test = LGBMRegressor(
    n_estimators=1800,
    learning_rate=0.03,
    num_leaves=63,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1e-2,
    reg_lambda=1e-1,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
lgbm_test.fit(X_trva, y_trva)
pred_te = lgbm_test.predict(X_te)

te_r2   = r2_score(y_te, pred_te) if len(y_te)>0 else np.nan
te_mae  = mean_absolute_error(y_te, pred_te) if len(y_te)>0 else np.nan
te_rmse = mean_squared_error(y_te, pred_te, squared=False) if len(y_te)>0 else np.nan
print(f"[TEST]  R2={te_r2:.4f}  MAE={te_mae:.4f}  RMSE={te_rmse:.4f}")

# ================== 결과 저장 ==================
# 1) 메트릭 저장
metrics = pd.DataFrame([
    {"split":"valid","R2":va_r2,"MAE":va_mae,"RMSE":va_rmse,"years":",".join(map(str,valid_years))},
    {"split":"test","R2":te_r2,"MAE":te_mae,"RMSE":te_rmse,"years":",".join(map(str,test_years))}
])
metrics.to_csv(OUT/"metrics_lgbm_3_1_1.csv", index=False, encoding="utf-8-sig")

# 2) 중요도 (Permutation: VALID 기준, 내장 중요도: Train+Valid 모델)
try:
    if len(y_va)>0:
        imp_perm = permutation_importance(lgbm_val, X_va, y_va, n_repeats=10, random_state=RANDOM_STATE)
        pd.DataFrame({"feature":X_va.columns,
                      "perm_importance_mean":imp_perm.importances_mean,
                      "perm_importance_std":imp_perm.importances_std})\
          .sort_values("perm_importance_mean", ascending=False)\
          .to_csv(OUT/"feat_importance_perm_VALID.csv", index=False, encoding="utf-8-sig")
except Exception as e:
    print("[WARN] VALID permutation importance 실패:", e)

try:
    fi = getattr(lgbm_test, "feature_importances_", None)
    if fi is not None:
        pd.DataFrame({"feature":X_trva.columns, "lgbm_importance":fi})\
          .sort_values("lgbm_importance", ascending=False)\
          .to_csv(OUT/"feat_importance_lgbm_train_valid.csv", index=False, encoding="utf-8-sig")
except Exception as e:
    print("[WARN] 내장 중요도 저장 실패:", e)

# 3) 예측 저장
pred_df = pd.DataFrame({
    "EMD_NAME_STD": data["EMD_NAME_STD"] if "EMD_NAME_STD" in data.columns else np.nan,
    "ADM_CODE_STD": data["ADM_CODE_STD"] if "ADM_CODE_STD" in data.columns else np.nan,
    "year": years_col,
    "split": np.where(tr_mask, "train", np.where(va_mask, "valid", np.where(te_mask, "test", "unused"))),
    "y_true": y_all
})

# 분할별 예측
pred_df.loc[va_mask,  "y_pred"] = pred_va
pred_df.loc[te_mask,  "y_pred"] = pred_te
# (train 예측이 필요하면 아래 두 줄 주석 해제)
# pred_df.loc[tr_mask,  "y_pred"] = lgbm_val.predict(X_tr)

pred_df.to_csv(OUT/"predictions_by_split.csv", index=False, encoding="utf-8-sig")

# 4) 최종 모델 저장 (Train+Valid 전체로 학습된 모델)
joblib.dump(lgbm_test, OUT/"final_lgbm_train_valid.pkl")

# 5) 재현성용 입력 패널 저장
panel_cols = [c for c in ["EMD_NAME_STD","ADM_CODE_STD","year",tgt]+feature_cols if c in data.columns]
data[panel_cols].to_csv(OUT/"model_input_panel.csv", index=False, encoding="utf-8-sig")

print("[DONE] outputs →", OUT)


[INFO] 1) EMD 경계 읽기 & 키 생성


RuntimeError: 읍면동 키(이름/코드)를 찾지 못했습니다. 컬럼=['geometry']