In [3]:
import pandas as pd

# 1) Load the crosswalk, signal and mapping tables; every column is read as text.
cw, sg, mp = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in ("../cw.csv", "../sg.csv", "../cw_sg_map.csv")
)

# Convert the numeric mapping columns that are present; unparseable values become NaN.
numeric_cols = [name for name in ("distance_m", "confidence") if name in mp.columns]
mp = mp.assign(**{name: pd.to_numeric(mp[name], errors="coerce") for name in numeric_cols})

# 2) FK integrity check
# Each source table may name its identifier either way; the *_uid column wins.
cw_key = "cw_uid" if "cw_uid" in cw.columns else "crosswalk_id"
sg_key = "sg_uid" if "sg_uid" in sg.columns else "signal_id"
cw_ids = set(cw[cw_key].dropna())
sg_ids = set(sg[sg_key].dropna())

# Accept the legacy key names in the mapping table by renaming them to *_uid.
legacy_keys = {"crosswalk_id": "cw_uid", "signal_id": "sg_uid"}
renames = {
    old: new
    for old, new in legacy_keys.items()
    if new not in mp.columns and old in mp.columns
}
if renames:
    mp = mp.rename(columns=renames)

# A mapping row is valid only when both of its keys exist in the source tables.
mp["fk_cw_ok"] = mp["cw_uid"].isin(cw_ids)
mp["fk_sg_ok"] = mp["sg_uid"].isin(sg_ids)

cw_missing = ~mp["fk_cw_ok"]
sg_missing = ~mp["fk_sg_ok"]
fk_bad = mp.loc[cw_missing | sg_missing].copy()

print("=== FK integrity ===")
print("mapping rows:", len(mp))
print("bad FK rows:", len(fk_bad))
print(" - missing cw_uid:", cw_missing.sum())
print(" - missing sg_uid:", sg_missing.sum())

# 3) Region mismatch check (join each source table's region onto the mapping, then compare)
def _region_lookup(src, key, alt_key, prefix):
    """Return a de-duplicated lookup of ``key`` plus prefixed region columns.

    ``src`` may name its identifier column either ``key`` or ``alt_key``; the
    result always exposes it as ``key`` so it can be merged with the mapping.
    ``sido`` / ``sigungu`` are renamed to ``<prefix>_sido`` / ``<prefix>_sigungu``;
    a region column that ``src`` does not have is simply absent from the result.
    """
    src_key = key if key in src.columns else alt_key
    region_cols = [c for c in ("sido", "sigungu") if c in src.columns]
    lookup = src[[src_key] + region_cols].rename(
        columns={src_key: key, "sido": f"{prefix}_sido", "sigungu": f"{prefix}_sigungu"}
    )
    # Exact duplicate source rows would otherwise multiply mapping rows in the left join.
    return lookup.drop_duplicates()


def _region_differs(joined, prefix):
    """Return a boolean Series flagging rows whose mapping region differs from the source's.

    A value missing on only one side still counts as a difference; a value
    missing on both sides does not.
    """
    flags = pd.Series(False, index=joined.index)
    for col in ("sido", "sigungu"):
        src_col = f"{prefix}_{col}"
        if src_col not in joined.columns:
            # The source table has no such column, so there is nothing to compare against.
            continue
        ours, theirs = joined[col], joined[src_col]
        # NaN != NaN evaluates to True, so mask out rows where both sides are missing.
        flags |= (ours != theirs) & ~(ours.isna() & theirs.isna())
    return flags


mp_cw = mp.merge(_region_lookup(cw, "cw_uid", "crosswalk_id", "cw"), on="cw_uid", how="left")
mp_sg = mp.merge(_region_lookup(sg, "sg_uid", "signal_id", "sg"), on="sg_uid", how="left")

# Compare only when the mapping itself carries both sido and sigungu.
if "sido" in mp.columns and "sigungu" in mp.columns:
    mp_cw["cw_region_mismatch"] = _region_differs(mp_cw, "cw")
    mp_sg["sg_region_mismatch"] = _region_differs(mp_sg, "sg")
else:
    # Without region columns in the mapping there is nothing to compare,
    # so no row is reported as a mismatch.
    mp_cw["cw_region_mismatch"] = False
    mp_sg["sg_region_mismatch"] = False

cw_mis = mp_cw[mp_cw["cw_region_mismatch"]].copy()
sg_mis = mp_sg[mp_sg["sg_region_mismatch"]].copy()

print("\n=== Region mismatch ===")
print("cw mismatch rows:", len(cw_mis))
print("sg mismatch rows:", len(sg_mis))

# 4) Distance cutoff policy
# Rule: distance <= 80 m passes outright; 80-100 m passes only with confidence >= 0.85.
DIST_CORE = 80.0
DIST_MAX  = 100.0
CONF_GATE = 0.85

if "distance_m" in mp.columns:
    distance = mp["distance_m"]
    core = distance.le(DIST_CORE)
    band = distance.gt(DIST_CORE) & distance.le(DIST_MAX)

    if "confidence" in mp.columns:
        passed = core | (band & mp["confidence"].ge(CONF_GATE))
    else:
        # Without a confidence column, be conservative: only the core range passes.
        passed = core
    mp["pass_dist_rule"] = passed
else:
    # No distance column at all: no row can satisfy the rule.
    mp["pass_dist_rule"] = False

dist_bad = mp.loc[~mp["pass_dist_rule"]].copy()

print("\n=== Distance rule ===")
print("fail distance-rule rows:", len(dist_bad))

# 5) Final passing rows: both FK checks OK and the distance rule satisfied.
# Region mismatches are only reported here; they are not filtered out of the
# final set (add that filter if desired).
passes_all = mp[["fk_cw_ok", "fk_sg_ok", "pass_dist_rule"]].all(axis=1)
final = mp.loc[passes_all].copy()

# 6) Save the reports and the validated mapping, then list what was written.
outputs = {
    "report_fk_bad.csv": fk_bad,
    "report_region_mismatch_cw.csv": cw_mis,
    "report_region_mismatch_sg.csv": sg_mis,
    "report_distance_filtered.csv": dist_bad,
    "crosswalk_signal_mapping_validated.csv": final,
}
for filename, frame in outputs.items():
    frame.to_csv(filename, index=False, encoding="utf-8-sig")

print("\nSaved:")
for filename in outputs:
    print(f"- {filename}")


=== FK integrity ===
mapping rows: 399428
bad FK rows: 0
 - missing cw_uid: 0
 - missing sg_uid: 0

=== Region mismatch ===
cw mismatch rows: 0
sg mismatch rows: 0

=== Distance rule ===
fail distance-rule rows: 0

Saved:
- report_fk_bad.csv
- report_region_mismatch_cw.csv
- report_region_mismatch_sg.csv
- report_distance_filtered.csv
- crosswalk_signal_mapping_validated.csv
