In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

# =========================
# Settings
# =========================
RADIUS_M      = 100.0  # search radius around each crosswalk, in meters
CROSSWALK_CSV = "crosswalks_passenger_unique.csv"
SIGNAL_CSV    = "signals_passenger_unique.csv"
OUT_CSV       = "crosswalk_signal_map_within_100m.csv"

# (To run directly against the files uploaded to the project, switch to the paths below)
# CROSSWALK_CSV = "/mnt/data/crosswalks_passenger_unique.csv"
# SIGNAL_CSV    = "/mnt/data/signals_passenger_unique.csv"

# =========================
# 1) Load
# =========================
# Crosswalks are read first, then signals; both files are decoded as UTF-8.
cw, sg = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (CROSSWALK_CSV, SIGNAL_CSV)
)

# =========================
# 2) Column inference (automatically handles several possible names)
# =========================
def pick_col(df, candidates, label):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Args:
        df: DataFrame whose columns are searched.
        candidates: Column names to try, in priority order.
        label: Name of the wanted column, used only in the error message.

    Returns:
        The earliest candidate that appears in ``df.columns``.

    Raises:
        KeyError: If no candidate is a column of ``df``. The message lists
            the candidates and at most the first 30 column names.
    """
    matches = [name for name in candidates if name in df.columns]
    if matches:
        # Candidates are ordered by priority, so the earliest hit wins.
        return matches[0]
    raise KeyError(f"{label} 컬럼을 찾지 못함. 후보={candidates}, 현재 컬럼={list(df.columns)[:30]}...")

# Required coordinate columns: pick_col raises KeyError when none of the
# candidate names exists in the frame.
cw_lat = pick_col(
    cw, ["crosswalk_lat", "cw_lat", "lat", "latitude"], "crosswalk_lat"
)
cw_lon = pick_col(
    cw, ["crosswalk_lon", "cw_lon", "lon", "longitude"], "crosswalk_lon"
)
sg_lat = pick_col(
    sg, ["signal_lat", "sg_lat", "lat", "latitude"], "signal_lat"
)
sg_lon = pick_col(
    sg, ["signal_lon", "sg_lon", "lon", "longitude"], "signal_lon"
)


def _optional_col(df, candidates):
    """Return the first candidate present in ``df.columns``, or None."""
    for name in candidates:
        if name in df.columns:
            return name
    return None


# Optional ID columns: None when the frame has none of the known names.
cw_id = _optional_col(cw, ["cw_uid", "crosswalk_id", "id"])
sg_id = _optional_col(sg, ["sg_uid", "signal_id", "id"])

# Prefer sido/sigungu from cw when present (otherwise fall back to the sg ones)
cw_sido = _optional_col(cw, ["sido"])
cw_sigungu = _optional_col(cw, ["sigungu"])
sg_sido = _optional_col(sg, ["sido"])
sg_sigungu = _optional_col(sg, ["sigungu"])

# =========================
# 3) Numeric coercion / missing-value cleanup
# =========================
# errors="coerce" turns values that cannot be parsed as numbers into NaN,
# so they are removed together with genuinely missing coordinates below.
for _coord_col in (cw_lat, cw_lon):
    cw[_coord_col] = pd.to_numeric(cw[_coord_col], errors="coerce")
for _coord_col in (sg_lat, sg_lon):
    sg[_coord_col] = pd.to_numeric(sg[_coord_col], errors="coerce")

# Keep only rows where both coordinates are present, then renumber the index
# 0..n-1 so that row labels coincide with row positions.
cw = cw[cw[[cw_lat, cw_lon]].notna().all(axis=1)].reset_index(drop=True)
sg = sg[sg[[sg_lat, sg_lon]].notna().all(axis=1)].reset_index(drop=True)

# =========================
# 4) Radius search with BallTree (haversine)
# =========================
# Sphere radius in meters, used for the meters <-> radians conversions.
R = 6371000.0  # meters

# Coordinates are passed as [lat, lon] pairs converted from degrees to radians.
sg_rad = np.deg2rad(sg[[sg_lat, sg_lon]].to_numpy())
cw_rad = np.deg2rad(cw[[cw_lat, cw_lon]].to_numpy())

# The tree indexes the signals; every crosswalk is then used as a query point.
tree = BallTree(sg_rad, metric="haversine")

# The search radius is given as an angle: meters divided by the sphere radius.
radius_rad = RADIUS_M / R
ind, dist = tree.query_radius(cw_rad, r=radius_rad, return_distance=True)

# =========================
# 5) Build pairs (cw_idx, sg_idx, distance_m)
# =========================
# `ind` and `dist` each hold one array per crosswalk: the positions of the
# signals found within the radius and their distances in radians. They are
# flattened with NumPy rather than a Python loop over every neighbour; the
# row order (crosswalk order, then neighbour order as returned) is unchanged.
_n_queries = len(ind)
_counts = np.fromiter(
    (len(_nbrs) for _nbrs in ind), dtype=np.int64, count=_n_queries
)

if _counts.sum() > 0:
    _sg_idx = np.concatenate(list(ind)).astype(np.int64, copy=False)
    # Radians -> meters.
    _distance_m = np.concatenate(list(dist)).astype(np.float64, copy=False) * R
else:
    # np.concatenate rejects an empty sequence, and building the arrays
    # explicitly keeps numeric dtypes even when nothing matched.
    _sg_idx = np.empty(0, dtype=np.int64)
    _distance_m = np.empty(0, dtype=np.float64)

pairs = pd.DataFrame(
    {
        # Each crosswalk position is repeated once per neighbour it has.
        "cw_idx": np.repeat(np.arange(_n_queries, dtype=np.int64), _counts),
        "sg_idx": _sg_idx,
        "distance_m": _distance_m,
    }
)

# confidence: closer means nearer to 1 (clipped to the 0~1 range)
pairs["confidence"] = (1.0 - pairs["distance_m"] / RADIUS_M).clip(0, 1)

# =========================
# 6) Attach the needed information
# =========================
# Columns to carry over from each side; names that were not detected
# (None) are skipped.
cw_keep_cols = [name for name in (cw_id, cw_sido, cw_sigungu) if name]
sg_keep_cols = [name for name in (sg_id, sg_sido, sg_sigungu) if name]

# Crosswalk info
if cw_keep_cols:
    cw_small = cw[cw_keep_cols].copy()
else:
    # Nothing to carry over: a column-less frame that still has cw's index.
    cw_small = pd.DataFrame(index=cw.index)

# Signal info
if sg_keep_cols:
    sg_small = sg[sg_keep_cols].copy()
else:
    sg_small = pd.DataFrame(index=sg.index)
sg_small = sg_small.add_prefix("sg_")  # avoid name clashes with cw columns

# cw_idx / sg_idx are matched against the row index of each lookup frame.
pairs = (
    pairs
    .merge(cw_small, left_on="cw_idx", right_index=True, how="left")
    .merge(sg_small, left_on="sg_idx", right_index=True, how="left")
)

# =========================
# 7) Arrange output columns (matching the requested header as closely as possible)
# distance_m confidence sido sigungu cw_uid sg_uid
# =========================
out = pairs[["distance_m", "confidence"]].copy()

# sido/sigungu: the crosswalk's column first; if absent, the signal's
# (prefixed) one; if neither exists the output column is omitted.
for out_name, cw_name in (("sido", cw_sido), ("sigungu", cw_sigungu)):
    sg_name = "sg_" + out_name
    if cw_name and cw_name in pairs.columns:
        out[out_name] = pairs[cw_name]
    elif sg_name in pairs.columns:
        out[out_name] = pairs[sg_name]

# IDs: use the detected ID column, otherwise fall back to the row position.
cw_uid_source = cw_id if (cw_id and cw_id in pairs.columns) else "cw_idx"
out["cw_uid"] = pairs[cw_uid_source]

prefixed_sg_id = ("sg_" + sg_id) if sg_id else None
if prefixed_sg_id and prefixed_sg_id in pairs.columns:
    sg_uid_source = prefixed_sg_id
else:
    sg_uid_source = "sg_idx"  # fallback
out["sg_uid"] = pairs[sg_uid_source]

# Sort: nearest first within each crosswalk
out = out.sort_values(["cw_uid", "distance_m"], kind="mergesort")
out = out.reset_index(drop=True)

# Save
out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

print(f"DONE: {OUT_CSV}")
print(out.head(10))


DONE: crosswalk_signal_map_within_100m.csv
   distance_m  confidence   sido sigungu     cw_uid     sg_uid
0   26.157342    0.738427  대전광역시     대덕구  CW_000073  SG_006027
1   33.321821    0.666782  대전광역시     대덕구  CW_000073  SG_006022
2   34.099030    0.659010  대전광역시     대덕구  CW_000073  SG_005983
3   49.049469    0.509505  대전광역시     대덕구  CW_000073  SG_006028
4   65.181358    0.348186  대전광역시     대덕구  CW_000074  SG_005480
5   69.436622    0.305634  대전광역시     대덕구  CW_000074  SG_005479
6    4.410116    0.955899  대전광역시     대덕구  CW_000076  SG_005480
7    4.884021    0.951160  대전광역시     대덕구  CW_000076  SG_005479
8   98.397411    0.016026  대전광역시     대덕구  CW_000077  SG_005480
9   99.885369    0.001146  대전광역시     대덕구  CW_000077  SG_005479
