### 수출입통계

In [12]:
# Retry: save CSV only (parquet engine not available in this environment) and show a preview table.
import os, pandas as pd

csv_path = "/Users/kjh/Documents/geostat3_e9/trade_panel_2008_2025.csv"
# 'combined' may not exist if the previous cell errored before creation; rebuild quickly.
import re, numpy as np
from glob import glob

def read_any_encoding(path, **kwargs):
    try:
        return pd.read_csv(path, **kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="cp949", **kwargs)

base = "/Users/kjh/Documents/geostat3_e9/data/"
paths = sorted(glob(os.path.join(base, "export_*.csv")))

frames = []
for p in paths:
    m = re.search(r"export_(\d{4})\.csv$", os.path.basename(p))
    year_from_name = int(m.group(1)) if m else None
    df = read_any_encoding(p)
    df.columns = [c.strip() for c in df.columns]
    if "기간" in df.columns:
        df = df[df["기간"].astype(str).str.strip() != "총계"].copy()
        df["time"] = pd.to_datetime(df["기간"], format="%Y-%m", errors="coerce")
    else:
        df["time"] = pd.NaT
    if "지역" in df.columns:
        df = df.rename(columns={"지역":"region"})
    else:
        df["region"] = np.nan
    ren = {"수출 건수":"export_cnt","수출 금액":"export_val","수입 건수":"import_cnt","수입 금액":"import_val","무역수지":"trade_balance"}
    for k,v in ren.items():
        if k in df.columns:
            df = df.rename(columns={k:v})
    keep = ["time","region","export_cnt","export_val","import_cnt","import_val","trade_balance"]
    for k in keep:
        if k not in df.columns:
            df[k] = np.nan
    out = df[keep].copy()
    out["year"] = out["time"].dt.year
    out["month"] = out["time"].dt.month
    out["file_year"] = year_from_name
    frames.append(out)

combined = pd.concat(frames, ignore_index=True).drop_duplicates().sort_values(["time"]).reset_index(drop=True)
combined.to_csv(csv_path, index=False, encoding="utf-8-sig")

print("Saved combined CSV to:", csv_path)
csv_path


Saved combined CSV to: /Users/kjh/Documents/geostat3_e9/trade_panel_2008_2025.csv


'/Users/kjh/Documents/geostat3_e9/trade_panel_2008_2025.csv'

### 제조업생산지수

In [42]:
# Parse the specific header style (e.g., 'M200801 2008.01') and reshape to long.
import pandas as pd, numpy as np, re

path = "/Users/kjh/Documents/geostat3_e9/data/시도_재별_제조업생산지수_2020100__20251103235729.xlsx"
sheet = "데이터"

df = pd.read_excel(path, sheet_name=sheet, header=0, dtype=str)
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
df.columns = [str(c).strip() for c in df.columns]

# Identify columns
region_col = next((c for c in df.columns if "시도" in c), df.columns[0])
class_col  = next((c for c in df.columns if "재별" in c or "분류" in c), None)

# Period columns like 'M200801 2008.01'
period_cols = [c for c in df.columns if re.match(r'^M\d{6}\s+\d{4}\.\d{2}$', c)]

# Create a mapping from col -> datetime
def col_to_time(col):
    m = re.match(r'^M(\d{6})\s+(\d{4})\.(\d{2})$', col)
    if m:
        yyyymm = m.group(1)
        yyyy, mm = yyyymm[:4], yyyymm[4:]
        return pd.Timestamp(f"{yyyy}-{mm}-01")
    m2 = re.search(r'(\d{4})(\d{2})', col)  # fallback
    if m2:
        return pd.Timestamp(f"{m2.group(1)}-{m2.group(2)}-01")
    return pd.NaT

time_map = {c: col_to_time(c) for c in period_cols}

# Keep subset & rename
keep = [region_col] + ([class_col] if class_col else []) + period_cols
wide = df[keep].copy().rename(
    columns={region_col: "region", class_col: "category"} if class_col else {region_col: "region"}
)

# --- Clean region ---
# 1) region 결측(빈 문자열 포함) 위→아래 채우기 (중간재/소비재 등 하위행에 지역 상속)
wide["region"] = wide["region"].replace(r"^\s*$", np.nan, regex=True).ffill()

# 2) region 앞 숫자코드 제거 (예: "00 전국", "11_서울특별시" → "전국", "서울특별시")
def clean_leading_code(s: str) -> str:
    if s is None:
        return s
    s = str(s).strip()
    s = re.sub(r'^\s*\d+\s*[-_.]?\s*', '', s)  # 선행 숫자(+구분자) 제거
    return s.strip()

wide["region"] = wide["region"].map(clean_leading_code)

# 3) 전국 제외
wide = wide[~wide["region"].str.contains(r"^전국$", na=False)]

# --- Clean category (숫자 코드 제거: "1 중간재" → "중간재") ---
if "category" in wide.columns:
    wide["category"] = wide["category"].fillna("").astype(str).str.strip()
    # 선행 숫자(+구분자/공백) 제거
    wide["category"] = wide["category"].str.replace(r'^\s*\d+\s*[-_.]?\s*', '', regex=True).str.strip()
    # 완전 빈값은 NaN으로
    wide["category"] = wide["category"].replace({"": np.nan})

# Melt
id_vars = ["region"] + (["category"] if "category" in wide.columns else [])
long = wide.melt(id_vars=id_vars, var_name="col", value_name="manufacturing_production_index")

# Map to datetime
long["time"] = long["col"].map(time_map)
long = long.dropna(subset=["time"]).copy()

# Clean values
vals = long["manufacturing_production_index"].astype(str).str.replace(",", "", regex=False).str.strip()
vals = vals.replace({"": np.nan, "-": np.nan})
long["manufacturing_production_index"] = pd.to_numeric(vals, errors="coerce")

# Tidy
long["region"] = long["region"].astype(str).str.strip()
if "category" in long.columns:
    long["category"] = long["category"].astype("string")
long["year"] = long["time"].dt.year.astype("Int64")
long["month"] = long["time"].dt.month.astype("Int64")
long["series_name"] = "제조업생산지수(2020=100)"

# Sort
sort_keys = ["region", "time"]
if "category" in long.columns:
    sort_keys = ["region", "category", "time"]
long = long.sort_values(sort_keys).reset_index(drop=True)

out_csv = "/Users/kjh/Documents/geostat3_e9/manufacturing_production_index_long.csv"
long.to_csv(out_csv, index=False, encoding="utf-8-sig")


  warn("Workbook contains no default style, apply openpyxl's default")


In [9]:
os.getcwd()

'/Users/kjh/Documents/geostat3_e9'

In [45]:
# Build a unified labor-force panel with T1, T2, ... metrics as separate columns (wide by metric).
import pandas as pd
import numpy as np
import re
from pathlib import Path

BASE = Path("/Users/kjh/Documents/geostat3_e9/data")
paths = [BASE / f"labor_force_{i}.xlsx" for i in range(1,7)]

# Regexes for KOSIS header patterns
RX_Y = re.compile(r"^Y(?P<code>\d{4})\s+(?P<year>\d{4})(?:\.(?P<sub>\d+))?$")
RX_H = re.compile(r"^H(?P<code>\d{6})\s+(?P<year>\d{4})\.(?P<half>[12])\/2(?:\.(?P<sub>\d+))?$")
RX_Q = re.compile(r"^Q(?P<code>\d{6})\s+(?P<year>\d{4})\.(?P<q>[1-4])\/4(?:\.(?P<sub>\d+))?$")

def parse_time(col):
    s = str(col).strip()
    m = RX_Y.match(s)
    if m:
        return {"year": int(m.group("year")), "month": 12, "freq": "A", "period": 1}
    m = RX_H.match(s)
    if m:
        y = int(m.group("year")); h = int(m.group("half"))
        return {"year": y, "month": 6 if h==1 else 12, "freq":"H", "period": h}
    m = RX_Q.match(s)
    if m:
        y = int(m.group("year")); q = int(m.group("q"))
        return {"year": y, "month": {1:3,2:6,3:9,4:12}[q], "freq":"Q", "period": q}
    return None

def detect_region_col(df):
    for key in ["행정구역", "행정구역별", "시군구", "지역"]:
        hits = [c for c in df.columns if key in str(c)]
        if hits:
            return hits[0]
    return df.columns[0]

def extract_code_name(s):
    s = str(s)
    # Allow 4~5 digits prefix (예: 3101 수원시 / 11110 종로구)
    m5 = re.search(r"(\d{5})", s)
    m4 = re.search(r"(^|\s)(\d{4})(\s|$)", s)
    code = m5.group(1) if m5 else (m4.group(2) if m4 else np.nan)
    # Clean name
    name = re.sub(r"\(\d{4,5}\)", "", s)
    name = re.sub(r"\b\d{4,5}\b", "", name)
    name = name.replace("\u3000", " ")
    name = re.sub(r"\s+", " ", name).strip()
    return pd.Series({"sgg_code": code, "sgg_name": name})

def get_metric_code(label_text):
    s = str(label_text).strip()
    m = re.match(r"^(T\d+)", s)
    return m.group(1) if m else s  # fallback to full label

wide_frames = []
logs = []

for p in paths:
    # Read with header row as row 0; assume sheet "데이터"
    df = pd.read_excel(p, sheet_name="데이터", dtype=str, engine="openpyxl", header=0)
    region_col = detect_region_col(df)
    if df.empty or region_col not in df.columns:
        logs.append({"file": p.name, "status":"EMPTY_OR_NO_REGION"})
        continue
    
    # First row contains metric labels for time columns (KOSIS style)
    label_row = df.iloc[0]
    data = df.iloc[1:].copy()
    
    # Identify time columns by regex
    time_cols = [c for c in data.columns if parse_time(c) is not None]
    if not time_cols:
        logs.append({"file": p.name, "status":"NO_TIME_COLS"})
        continue
    
    # Melt to long with time meta
    long = data.melt(id_vars=[region_col], value_vars=time_cols, var_name="time_col", value_name="value")
    meta = pd.DataFrame([parse_time(c) for c in long["time_col"]])
    long = pd.concat([long, meta], axis=1)
    # Attach metric code from label row
    metric_map = {c: get_metric_code(label_row.get(c, "")) for c in time_cols}
    long["metric"] = long["time_col"].map(metric_map)
    # Clean region
    region_df = long[region_col].apply(extract_code_name)
    long = pd.concat([long, region_df], axis=1)
    # Numeric
    long["value"] = (long["value"].astype(str)
                     .str.replace(",", "", regex=False)
                     .replace({"-": np.nan, "": np.nan}))
    long["value"] = pd.to_numeric(long["value"], errors="coerce")
    # Drop missing essentials
    long = long.dropna(subset=["year","month","value"])
    # Cast
    long["year"] = long["year"].astype("Int64")
    long["month"] = long["month"].astype("Int64")
    long["source_file"] = p.name
    
    # Pivot: metrics into columns (T1, T2, ...)
    wide = (long
            .pivot_table(index=["sgg_code","sgg_name","year","month","freq","period","source_file"],
                         columns="metric", values="value", aggfunc="first")
            .reset_index())
    # Sort metric columns naturally (T1, T2, ... then others)
    metric_cols = [c for c in wide.columns if c not in ["sgg_code","sgg_name","year","month","freq","period","source_file"]]
    def metric_key(c):
        m = re.match(r"^T(\d+)$", str(c))
        return (0, int(m.group(1))) if m else (1, str(c))
    metric_cols_sorted = sorted(metric_cols, key=metric_key)
    wide = wide[["sgg_code","sgg_name","year","month","freq","period","source_file"] + metric_cols_sorted]
    wide_frames.append(wide)
    logs.append({"file": p.name, "status":"OK", "rows": len(wide), "metrics": metric_cols_sorted[:10]})

# Concatenate and unify
panel_wide = pd.concat(wide_frames, ignore_index=True) if wide_frames else pd.DataFrame()

# If the same (sgg, time) appears across files (e.g., overlapping), prefer later files by sorting by source_file then dropping duplicates
panel_wide = panel_wide.sort_values(["sgg_code","sgg_name","year","month","freq","period","source_file"]).drop_duplicates(
    subset=["sgg_code","sgg_name","year","month","freq","period"], keep="last"
)

# Save and preview
out_path = BASE / "labor_force_panel_with_T_metrics.csv"
panel_wide.to_csv(out_path, index=False, encoding="utf-8-sig")

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [46]:
import pandas as pd
from pathlib import Path

BASE = Path("/Users/kjh/Documents/geostat3_e9/data")

in_path  = BASE / "labor_force_panel_with_T_metrics.csv"
out_path = BASE / "labor_force_panel_named.csv"

# 1) 로드
df = pd.read_csv(in_path, dtype={"sgg_code": str})

# 2) T-코드 → 읽기 쉬운 변수명 매핑
t_to_en = {
    "T1": "pop15p",        # 15세 이상 인구 (천명)
    "T2": "labor_force",   # 경제활동인구 (천명)
    "T3": "employed",      # 취업자 (천명)
    "T4": "unemployed",    # 실업자 (천명)
    "T5": "nilf",          # 비경제활동인구 (천명)
    "T6": "lfpr",          # 경제활동참가율 (%)
    "T7": "emp_rate",      # 고용률 (%)
    "T8": "unemp_rate",    # 실업률 (%)
}

# 3) 존재하는 T컬럼만 매핑하여 새 컬럼 생성
present = [c for c in t_to_en if c in df.columns]
for tcol in present:
    df[t_to_en[tcol]] = df[tcol]

# 4) 컬럼 순서 정리 (키 → 새 변수 → 원본 T컬럼 → 기타)
key_cols   = ["sgg_code","sgg_name","year","month","freq","period","source_file"]
named_cols = [t_to_en[c] for c in present]
orig_cols  = [c for c in present]  # 원본 T1..T8 보존하려면 유지, 삭제하려면 아래에서 drop
other_cols = [c for c in df.columns if c not in (key_cols + named_cols + orig_cols)]

ordered = key_cols + named_cols + orig_cols + other_cols
ordered = [c for c in ordered if c in df.columns]  # 안전하게 필터

df_named = df[ordered].copy()

# (옵션) 원본 T컬럼 삭제하려면 다음 두 줄 사용
# df_named = df_named.drop(columns=present, errors="ignore")
# ordered = [c for c in ordered if c not in present]

# 5) 저장
df_named.to_csv(out_path, index=False, encoding="utf-8-sig")
print(f"Saved: {out_path}")


Saved: /Users/kjh/Documents/geostat3_e9/data/labor_force_panel_named.csv


In [37]:
os.getcwd()

'/Users/kjh/Documents/geostat3_e9'

In [7]:
# -*- coding: utf-8 -*-
"""
인구 패널(월/연 데이터 결합) → '시군구' 단위로 재집계 (코드기반 일반구-시 접기)
- '일반시 + 구' (도 소속)만 같은 연-월에서 '시'로 합산
- '특별시/광역시/특별자치시'의 '구'는 그대로 유지
- 입력(그대로 사용):
    /Users/kjh/Documents/geostat3_e9/data/101_DT_1B040A3_20251104000738.csv  (월별)
    /Users/kjh/Documents/geostat3_e9/data/행정구역_시군구_별__성별_인구수_20251104000919.xlsx (연별)
- 중간 산출: population_panel.csv
- 최종 산출: population_panel_sgg_aggregated.csv  (컬럼: sido_name, sigungu, year, month, total, male, female, yyyymm)
"""

import pandas as pd
import numpy as np
import re
from pathlib import Path

BASE = Path("/Users/kjh/Documents/geostat3_e9/data")
CSV_MONTHLY  = BASE / "101_DT_1B040A3_20251104000738.csv"
XLSX_YEARLY  = BASE / "행정구역_시군구_별__성별_인구수_20251104000919.xlsx"

OUT_PANEL    = BASE.parent / "population_panel.csv"
OUT_AGG      = BASE.parent / "population_panel_sgg_aggregated.csv"

# 시도 코드→명 (신코드 포함)
SIDO_MAP = {
    "11":"서울특별시","26":"부산광역시","27":"대구광역시","28":"인천광역시","29":"광주광역시",
    "30":"대전광역시","31":"울산광역시","36":"세종특별자치시","41":"경기도",
    "42":"강원도","43":"충청북도","44":"충청남도","45":"전라북도",
    "46":"전라남도","47":"경상북도","48":"경상남도","50":"제주특별자치도",
    "51":"강원특별자치도","52":"전북특별자치도"
}

# ────────────────────────────────────────────────────────────────
# 0) 월별 원시 CSV에서 '일반시의 구' → 부모 '시'를 코드로 찾는 매핑 구축
# ────────────────────────────────────────────────────────────────
def build_general_city_ward_code_map(csv_path: Path) -> dict:
    """
    반환: { five_digit_sgg_code(str) : parent_city_name(str) }
    예) {"47111":"포항시","47113":"포항시", "41111":"수원시", ...}
    """
    df = pd.read_csv(csv_path, dtype=str, encoding="cp949")
    code_col = "[A]행정구역(시군구)별"
    name_col = "행정구역(시군구)별"

    CITY_WARDS = {
        # 경기도(41)
        "수원시":  ( "41", ["장안구","권선구","팔달구","영통구"] ),
        "성남시":  ( "41", ["수정구","중원구","분당구"] ),
        "안양시":  ( "41", ["만안구","동안구"] ),
        "고양시":  ( "41", ["덕양구","일산동구","일산서구"] ),
        "안산시":  ( "41", ["상록구","단원구"] ),
        "용인시":  ( "41", ["처인구","기흥구","수지구"] ),
        "부천시":  ( "41", ["원미구","소사구","오정구"] ),  # 과거 구
        # 충북(43)
        "청주시":  ( "43", ["상당구","서원구","흥덕구","청원구"] ),
        # 충남(44)
        "천안시":  ( "44", ["동남구","서북구"] ),
        # 전북(45)
        "전주시":  ( "45", ["완산구","덕진구"] ),
        "전주시":  ( "52", ["완산구","덕진구"] ),
        # 경북(47) — ★ 포항시 남구/북구만 대상
        "포항시":  ( "47", ["남구","북구"] ),
        # 경남(48)
        "창원시":  ( "48", ["의창구","성산구","마산합포구","마산회원구","진해구"] ),
    }

    METRO_PREFIX = {"11","26","27","28","29","30","31","36","50"}  # 접지 않음

    mapping = {}
    for city, (sido_prefix, ward_list) in CITY_WARDS.items():
        mask = (
            df[code_col].astype(str).str.startswith(sido_prefix) &
            df[name_col].astype(str).isin(ward_list)
        )
        subset = df.loc[mask, [code_col, name_col]].drop_duplicates()
        subset = subset[~subset[code_col].astype(str).str[:2].isin(METRO_PREFIX)]
        for _, row in subset.iterrows():
            sgg_code = str(row[code_col]).strip()
            if re.fullmatch(r"\d{5}", sgg_code):
                mapping[sgg_code] = city  # 이 5자리 코드는 해당 '시'로 접는다
    return mapping

GENERAL_WARD_CODE_TO_CITY = build_general_city_ward_code_map(CSV_MONTHLY)

# ────────────────────────────────────────────────────────────────
# 1) 월별 CSV (2011~)
# ────────────────────────────────────────────────────────────────
def load_monthly(csv_path: Path) -> pd.DataFrame:
    df = pd.read_csv(csv_path, dtype=str, encoding="cp949")
    code_col = "[A]행정구역(시군구)별"
    name_col = "행정구역(시군구)별"
    itemcode_col  = "[Item]항목"
    itemlabel_col = "항목"

    time_cols = [c for c in df.columns if re.match(r"^20\d{2}\.\d{2}\s*월$", str(c))]
    time_cols = [c for c in time_cols if not str(c).lower().startswith("unnamed")]

    long_m = df.melt(
        id_vars=[code_col, name_col, itemcode_col, itemlabel_col],
        value_vars=time_cols, var_name="time_col", value_name="value"
    )

    ym = long_m["time_col"].str.extract(r"^(?P<year>20\d{2})\.(?P<month>\d{2})")
    long_m["year"]  = ym["year"].astype("Int64")
    long_m["month"] = ym["month"].astype("Int64")

    long_m["value"] = (long_m["value"].astype(str).str.replace(",", "", regex=False)
                       .replace({"-": np.nan, "": np.nan}))
    long_m["value"] = pd.to_numeric(long_m["value"], errors="coerce")

    long_m["sgg_code"] = long_m[code_col].astype(str).str.extract(r"\b(\d{5})\b", expand=False)
    long_m = long_m[long_m["sgg_code"].str.fullmatch(r"\d{5}", na=False)].copy()

    long_m["sgg_name"] = (
        long_m[name_col].astype(str)
        .str.replace(r"\u3000", " ", regex=True)
        .str.replace(r"[\s]*\(?\b\d{5}\)?", "", regex=True)
        .str.strip()
    )

    long_m["sido_code"] = long_m["sgg_code"].str[:2]
    long_m["sido_name"] = long_m["sido_code"].map(SIDO_MAP)

    def sex_from_itemcode(s):
        s = str(s)
        if s.startswith("T20"): return "total"
        if s.startswith("T21"): return "male"
        if s.startswith("T22"): return "female"
        return None

    long_m["sex"] = long_m[itemcode_col].map(sex_from_itemcode)

    monthly_wide = (
        long_m.pivot_table(
            index=["sido_code","sido_name","sgg_code","sgg_name","year","month"],
            columns="sex", values="value", aggfunc="first"
        ).reset_index()
    )
    for c in ["total","male","female"]:
        if c not in monthly_wide.columns:
            monthly_wide[c] = np.nan
    return monthly_wide

# ────────────────────────────────────────────────────────────────
# 2) 연별 XLSX (2008~2010)
# ────────────────────────────────────────────────────────────────
def load_yearly(xlsx_path: Path) -> pd.DataFrame:
    raw_y = pd.read_excel(xlsx_path, dtype=str, engine="openpyxl")
    meta = raw_y.iloc[0]
    df_y = raw_y.iloc[1:].copy()

    region_col = next(c for c in df_y.columns if "행정구역" in str(c))
    df_y[region_col] = df_y[region_col].ffill()

    records = []
    for col in df_y.columns:
        if col == region_col:
            continue
        m = re.search(r"(20\d{2})", str(col))
        if not m:
            continue
        year = int(m.group(1))
        month = 12

        label = str(meta.get(col, ""))
        if   label.startswith("T20"): sex = "total"
        elif label.startswith("T21"): sex = "male"
        elif label.startswith("T22"): sex = "female"
        else:
            if str(col).endswith(".1"):   sex = "male"
            elif str(col).endswith(".2"): sex = "female"
            else:                         sex = "total"

        for _, row in df_y[[region_col, col]].dropna(subset=[col]).iterrows():
            region_raw = str(row[region_col])
            mcode = re.search(r"\b(\d{5})\b", region_raw)
            if not mcode:
                continue
            sgg_code = mcode.group(1)
            sgg_name = re.sub(r"[\s]*\(?\b\d{5}\)?", "", region_raw.replace("\u3000", " ").strip()).strip()

            v = str(row[col]).strip()
            if v in {"", "-"}:
                continue
            try:
                val = float(v.replace(",", ""))
            except:
                continue

            records.append({
                "sido_code": sgg_code[:2],
                "sgg_code":  sgg_code,
                "sgg_name":  sgg_name,
                "year": year,
                "month": month,
                "sex": sex,
                "value": val
            })

    yearly_long = pd.DataFrame.from_records(records)
    yearly_wide = (
        yearly_long.pivot_table(
            index=["sido_code","sgg_code","sgg_name","year","month"],
            columns="sex", values="value", aggfunc="first"
        ).reset_index()
    )
    yearly_wide["sido_name"] = yearly_wide["sido_code"].map(SIDO_MAP)
    return yearly_wide

# ────────────────────────────────────────────────────────────────
# 3) 패널 결합(연+월)
# ────────────────────────────────────────────────────────────────
def build_panel() -> pd.DataFrame:
    monthly_wide = load_monthly(CSV_MONTHLY)
    yearly_wide  = load_yearly(XLSX_YEARLY)
    panel = pd.concat([yearly_wide, monthly_wide], ignore_index=True, sort=False)
    panel["year"]  = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")
    panel["month"] = pd.to_numeric(panel["month"], errors="coerce").astype("Int64")
    for c in ["total","male","female"]:
        if c not in panel.columns:
            panel[c] = np.nan
    panel.loc[panel["sido_code"]=="51", "sido_name"] = "강원특별자치도"
    panel.loc[panel["sido_code"]=="52", "sido_name"] = "전북특별자치도"
    panel = panel.sort_values(["sido_code","sgg_code","year","month"]).reset_index(drop=True)
    return panel

# ────────────────────────────────────────────────────────────────
# 4) 집계 규칙 (분리 저장용)
# ────────────────────────────────────────────────────────────────
SPECIAL_MARKERS = ("특별시","광역시","특별자치시")
CITY_GU_PAT     = re.compile(r"^(?P<city>.+?시)\s+(?P<gu>.+?구)$")

def is_metro_or_special(sido_name: str, sgg_name: str) -> bool:
    s1 = str(sido_name) if pd.notna(sido_name) else ""
    s2 = str(sgg_name)  if pd.notna(sgg_name)  else ""
    return any(mk in s1 for mk in SPECIAL_MARKERS) or any(mk in s2 for mk in SPECIAL_MARKERS)

def collapsed_sigungu_for_output(sido_name: str, sgg_name: str, sgg_code: str) -> str:
    """
    반환 sigungu:
      - 메트로(특별/광역/특별자치시): '구/군'만 (예: '종로구', '남구')
      - 일반시의 구(접기 대상 코드): '시' (예: '포항시', '용인시')
      - '시 구' 패턴이면 '시'
      - 그 외: 원본 sgg_name
    """
    sgg_name = "" if pd.isna(sgg_name) else str(sgg_name)
    if is_metro_or_special(sido_name, sgg_name):
        m = CITY_GU_PAT.match(sgg_name)
        return m.group("gu") if m else sgg_name  # 안전하게 '구'만 남김

    if sgg_code in GENERAL_WARD_CODE_TO_CITY:
        return GENERAL_WARD_CODE_TO_CITY[sgg_code]  # 부모 '시'명

    m = CITY_GU_PAT.match(sgg_name)
    if m:  # 일반시 '시 구' → '시'
        return m.group("city")

    return sgg_name  # 군/시 단독 등

# ────────────────────────────────────────────────────────────────
# 5) 실행
# ────────────────────────────────────────────────────────────────
def main():
    panel = build_panel()
    panel.to_csv(OUT_PANEL, index=False, encoding="utf-8-sig")

    # 출력용 분리 컬럼 생성
    panel["_sido_out"] = panel["sido_name"]
    panel["_sigg_out"] = [
        collapsed_sigungu_for_output(panel.at[i,"sido_name"], panel.at[i,"sgg_name"], str(panel.at[i,"sgg_code"]))
        for i in panel.index
    ]

    agg_cols = ["total","male","female"]
    out = (
        panel.groupby(["_sido_out","_sigg_out","year","month"], dropna=False, as_index=False)[agg_cols]
              .sum(min_count=1)
              .rename(columns={"_sido_out":"sido_name", "_sigg_out":"sigungu"})
              .sort_values(["sido_name","sigungu","year","month"])
              .reset_index(drop=True)
    )

    out["yyyymm"] = (out["year"].astype(int) * 100 + out["month"].astype(int)).astype(int)
    out.to_csv(OUT_AGG, index=False, encoding="utf-8-sig")

    print(f"[OK] population_panel.csv saved: {OUT_PANEL}")
    print(f"[OK] population_panel_sgg_aggregated.csv saved: {OUT_AGG}")
    print(f"rows={len(out):,}, cols={list(out.columns)}")
    # 빠른 확인: 포항 남/북구가 '경상북도-포항시'로 접혔는지
    check = panel[panel["sgg_code"].isin(["47111","47113"])]
    if not check.empty:
        lbl = out[(out["sido_name"]=="경상북도") & (out["sigungu"]=="포항시")].head(1)
        print("[DEBUG] 포항시 예시:", lbl.to_dict(orient="records"))

if __name__ == "__main__":
    main()


[OK] population_panel.csv saved: /Users/kjh/Documents/geostat3_e9/population_panel.csv
[OK] population_panel_sgg_aggregated.csv saved: /Users/kjh/Documents/geostat3_e9/population_panel_sgg_aggregated.csv
rows=41,980, cols=['sido_name', 'sigungu', 'year', 'month', 'total', 'male', 'female', 'yyyymm']
[DEBUG] 포항시 예시: [{'sido_name': '경상북도', 'sigungu': '포항시', 'year': 2008, 'month': 12, 'total': 1016238.0, 'male': 516882.0, 'female': 499356.0, 'yyyymm': 200812}]


### SIDO

In [9]:
import pandas as pd
from pathlib import Path
import unicodedata as ud
import re

# 1) CSV들이 들어 있는 폴더 경로로 수정하세요
BASE_DIR = Path("/Users/kjh/Documents/geostat3_e9/data/MDIS")

# 예: 2009_상반기_지역_20251110_33046.csv / 2010_하반기_지역_20251110_XXXXX.csv
PATTERN = "*20251110_*.csv"

# 2) 헬퍼들 --------------------------------------------------------------
def pick_first_existing(columns, candidates):
    """candidates 중 실제 존재하는 첫 컬럼명 반환"""
    for c in candidates:
        if c in columns:
            return c
    raise KeyError(f"None of {candidates} found in columns: {list(columns)}")

def parse_year_half(path: Path):
    """파일명에서 연도/반기 추출"""
    stem = path.stem
    parts = stem.split("_")
    year = int(re.match(r"\d{4}", parts[0]).group(0))
    half_raw = parts[1]                  # '상반기' or '하반기'
    half = ud.normalize("NFC", half_raw)
    return year, half

# 조사항목 코드 → 열 이름 매핑
ITEM_MAP = {
    "A": "구인인원",
    "B": "채용인원",
    "C": "미충원인원",
    "D": "현원",
    "E": "부족인원",
    "F": "채용계획인원",
}

frames = []

# 3) 파일 루프 -----------------------------------------------------------
for csv_path in sorted(BASE_DIR.glob(PATTERN)):
    year, half = parse_year_half(csv_path)

    # 2009~2025년만
    if not (2009 <= year <= 2025):
        continue

    print(f"Processing: {csv_path.name} (연도={year}, 반기={half})")

    df = pd.read_csv(csv_path, encoding="cp949")

    # 컬럼 자동 매핑
    industry_col = pick_first_existing(df.columns, ["산업대분류", "산업대분류코드"])
    region_col   = pick_first_existing(df.columns, ["지역분류", "지역구분코드"])
    weight_col   = pick_first_existing(df.columns, ["가중값", "가중치"])
    count_col    = pick_first_existing(df.columns, ["인원", "인원수"])
    item_col     = pick_first_existing(df.columns, ["조사항목", "조사항목코드"])

    # 산업대분류 C만
    df = df[df[industry_col] == "C"].copy()

    # 컬럼명 통일
    df = df.rename(columns={region_col: "지역코드", item_col: "조사항목코드"})

    # 연도/반기, 가중인원
    df["연도"] = year
    df["반기"] = half
    df["가중인원"] = df[count_col] * df[weight_col]

    frames.append(df)

if not frames:
    raise RuntimeError("조건(2009~2025, 산업 C)에 맞는 데이터가 없습니다.")

big_df = pd.concat(frames, ignore_index=True)

# 4) 연도 × 반기 × 지역 × 조사항목코드별 합계 (long)
long = (
    big_df
    .groupby(["연도", "반기", "지역코드", "조사항목코드"], as_index=False)["가중인원"]
    .sum()
)

# 코드 → 한글 항목명으로 변환
long["조사항목명"] = long["조사항목코드"].map(ITEM_MAP)

# 5) wide 형태로 pivot: 항목명이 컬럼이 되도록
wide = (
    long
    .pivot_table(
        index=["연도", "반기", "지역코드"],
        columns="조사항목명",
        values="가중인원",
        aggfunc="sum"
    )
    .reset_index()
)

# 컬럼 이름 정리
wide.columns.name = None

# 열 순서 정리 (있다면 그 순서로)
col_order = ["연도", "반기", "지역코드",
             "구인인원", "채용인원", "미충원인원",
             "현원", "부족인원", "채용계획인원"]
wide = wide[[c for c in col_order if c in wide.columns]]

# 결과 저장
out_path = "/Users/kjh/Documents/geostat3_e9/MDIS_panel.csv"
wide.to_csv(out_path, index=False, encoding="utf-8-sig")

print("완료! 결과 저장 위치:", out_path)


Processing: 2009_상반기_지역_20251110_33046.csv (연도=2009, 반기=상반기)
Processing: 2009_하반기_지역_20251110_50224.csv (연도=2009, 반기=하반기)
Processing: 2010_상반기_지역_20251110_33046.csv (연도=2010, 반기=상반기)
Processing: 2010_하반기_지역_20251110_50224.csv (연도=2010, 반기=하반기)
Processing: 2011_상반기_지역_20251110_13148.csv (연도=2011, 반기=상반기)
Processing: 2011_하반기_지역_20251110_10247.csv (연도=2011, 반기=하반기)
Processing: 2012_상반기_지역_20251110_13148.csv (연도=2012, 반기=상반기)
Processing: 2012_하반기_지역_20251110_10247.csv (연도=2012, 반기=하반기)
Processing: 2013_상반기_지역_20251110_13148.csv (연도=2013, 반기=상반기)
Processing: 2013_하반기_지역_20251110_10247.csv (연도=2013, 반기=하반기)
Processing: 2014_상반기_지역_20251110_13148.csv (연도=2014, 반기=상반기)
Processing: 2014_하반기_지역_20251110_10247.csv (연도=2014, 반기=하반기)
Processing: 2015_상반기_지역_20251110_13148.csv (연도=2015, 반기=상반기)
Processing: 2015_하반기_지역_20251110_10247.csv (연도=2015, 반기=하반기)
Processing: 2016_상반기_지역_20251110_