In [None]:
# Quietly install the third-party packages imported by the cells below
# (numpy, pandas, scikit-learn). Versions are not pinned here.
%pip -q install numpy pandas scikit-learn

In [4]:
import pandas as pd

def clean_yn_columns(df: pd.DataFrame, columns: list):
    """
    Convert Y/N flag columns to integer 1/0, modifying ``df`` in place.

    For every column named in ``columns``:
    - missing values and blank strings are treated as 'N';
    - surrounding whitespace is ignored and matching is case-insensitive;
    - 'Y' -> 1, 'N' -> 0;
    - values that are already 1/0 (including '1.0'/'0.0' text) are kept, so
      running the function again on converted data is a no-op instead of
      turning every value into NaN;
    - any other value becomes NaN.

    Args:
        df: Frame to convert. It is mutated in place and also returned.
        columns: Names of the flag columns to convert.

    Returns:
        The same ``df`` object, with the listed columns converted.

    Raises:
        ValueError: If a requested column does not exist. All names are
            checked before any column is touched, so a failure leaves ``df``
            unmodified.
    """
    # Validate up front so a typo in a later name cannot leave df half-converted.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(f"컬럼 없음: {missing[0]}")

    # Accepted spellings after strip()/upper(); numeric spellings make the
    # conversion idempotent when the input was already cleaned.
    flag_to_int = {"Y": 1, "N": 0, "1": 1, "0": 0, "1.0": 1, "0.0": 0}

    for col in columns:
        values = df[col]
        text = values.astype(str).str.strip().str.upper()
        # Missing values and empty strings both count as 'N'.
        text = text.mask(values.isna() | (text == ""), "N")
        df[col] = text.map(flag_to_int)

    return df


In [None]:
# Y/N flag columns of the crosswalk table that are converted to 1/0.
crosswalk_cols = [
    "highland", "signal", "button", "sound_signal",
    "bump", "braille_block", "spotlight",
]

# Read the pedestrian-crosswalk CSV, then convert the flag columns.
crosswalks = pd.read_csv("crosswalks_passenger.csv")
crosswalks = crosswalks.pipe(clean_yn_columns, crosswalk_cols)

# Write the converted table back over the file it was read from
# (no index column), so the Y/N text is not kept on disk.
crosswalks.to_csv("crosswalks_passenger.csv", encoding="utf-8-sig", index=False)


In [None]:
# Y/N flag columns of the signal table that are converted to 1/0.
signal_cols = ["main_road", "button", "remain_time", "sound_signal"]

# Read the pedestrian-signal CSV, then convert the flag columns.
signals = pd.read_csv("signals_passenger.csv")
signals = signals.pipe(clean_yn_columns, signal_cols)

# Write the converted table back over the file it was read from
# (no index column), so the Y/N text is not kept on disk.
signals.to_csv("signals_passenger.csv", encoding="utf-8-sig", index=False)


In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree

# Earth radius used to scale haversine distances (returned in radians by the
# tree query) into meters.
EARTH_RADIUS_M = 6371000  # meters

# -----------------------------
# 1) CSV 로더 (cp949/utf-8-sig 자동)
# -----------------------------
def read_csv_any(path: str) -> tuple[pd.DataFrame, str]:
    """
    Read a CSV whose text encoding is not known in advance.

    Tries "utf-8-sig", then "cp949", then "euc-kr", and returns the first
    result that decodes. Only decoding failures trigger the next attempt;
    any other error (missing file, malformed CSV, ...) is raised immediately
    instead of being hidden and retried under every encoding.

    If none of the candidates decode, the file is read as "latin1" as a last
    resort and a warning is emitted, because latin1 accepts any byte sequence
    and the resulting text may be garbled.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(dataframe, encoding)`` tuple, where ``encoding`` is the name of
        the encoding that was used.
    """
    import warnings

    for enc in ("utf-8-sig", "cp949", "euc-kr"):
        try:
            return pd.read_csv(path, encoding=enc), enc
        except UnicodeError:
            # Wrong encoding guess: move on to the next candidate.
            continue

    # Last resort: latin1 maps every byte to a character, so this cannot fail
    # on decoding, but non-latin text will not be decoded correctly.
    warnings.warn(
        f"{path}: could not decode as utf-8-sig/cp949/euc-kr; "
        "falling back to latin1 (text may be garbled)",
        stacklevel=2,
    )
    return pd.read_csv(path, encoding="latin1"), "latin1"

# -----------------------------
# 2) 크로스워크-신호등 매핑 (BallTree + haversine)
# -----------------------------
def build_crosswalk_signal_map(
    signal_df: pd.DataFrame,
    crosswalk_df: pd.DataFrame,
    R_m: float = 30,
    top_k: int = 2,
    use_admin_filter: bool = True,
) -> pd.DataFrame:
    """
    Match each crosswalk to its nearest signals within a radius.

    A BallTree with the haversine metric is built over the signal coordinates
    and queried with every crosswalk for up to ``top_k`` neighbours. A
    (crosswalk, signal) pair is kept when its distance is at most ``R_m``
    meters and, if the admin filter is active, when both rows have the same
    ``sido`` and ``sigungu`` (compared as strings).

    Rows whose id or coordinates are missing or non-numeric are ignored.
    The input frames are not modified.

    Args:
        signal_df: Needs columns ``signal_id``, ``signal_lat``, ``signal_lon``
            (degrees).
        crosswalk_df: Needs columns ``crosswalk_id``, ``crosswalk_lat``,
            ``crosswalk_lon`` (degrees).
        R_m: Maximum match distance in meters; must be positive.
        top_k: Maximum number of candidate signals per crosswalk; must be >= 1.
        use_admin_filter: When True and both frames have ``sido`` and
            ``sigungu`` columns, drop pairs whose values differ.

    Returns:
        A frame with columns ``crosswalk_id``, ``signal_id``, ``distance_m``
        (rounded to 3 decimals), ``confidence`` (``1 - distance / R_m``,
        rounded to 6 decimals), ``sido`` and ``sigungu``. The admin columns
        come from the crosswalk frame, else from the signal frame, else are
        None. The same columns are returned when there are no matches, so the
        result always has a stable schema.

    Raises:
        ValueError: If a required column is missing, ``R_m`` is not positive,
            or ``top_k`` is less than 1.
    """
    out_cols = ["crosswalk_id", "signal_id", "distance_m", "confidence", "sido", "sigungu"]

    # R_m is used as a divisor for the confidence score; top_k as a query size.
    if R_m <= 0:
        raise ValueError(f"R_m must be positive, got {R_m!r}")
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k!r}")

    # Work on copies: the coordinate columns are re-assigned below.
    sg = signal_df.copy()
    cw = crosswalk_df.copy()

    req_sg = {"signal_id", "signal_lat", "signal_lon"}
    req_cw = {"crosswalk_id", "crosswalk_lat", "crosswalk_lon"}
    missing = (req_sg - set(sg.columns)) | (req_cw - set(cw.columns))
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # Non-numeric coordinates become NaN and are dropped with the other gaps.
    for col in ("signal_lat", "signal_lon"):
        sg[col] = pd.to_numeric(sg[col], errors="coerce")
    for col in ("crosswalk_lat", "crosswalk_lon"):
        cw[col] = pd.to_numeric(cw[col], errors="coerce")

    sg = sg.dropna(subset=["signal_id", "signal_lat", "signal_lon"]).reset_index(drop=True)
    cw = cw.dropna(subset=["crosswalk_id", "crosswalk_lat", "crosswalk_lon"]).reset_index(drop=True)

    if len(sg) == 0 or len(cw) == 0:
        return pd.DataFrame(columns=out_cols)

    # The haversine metric expects (lat, lon) in radians.
    sg_rad = np.deg2rad(sg[["signal_lat", "signal_lon"]].to_numpy())
    cw_rad = np.deg2rad(cw[["crosswalk_lat", "crosswalk_lon"]].to_numpy())

    tree = BallTree(sg_rad, metric="haversine")
    k = min(top_k, len(sg))
    dist_rad, idx = tree.query(cw_rad, k=k)
    dist_m = dist_rad * EARTH_RADIUS_M

    # keep[i, j] is True when the j-th neighbour of crosswalk i is a match.
    keep = dist_m <= R_m

    admin_cols = ("sido", "sigungu")
    if use_admin_filter and all(c in sg.columns and c in cw.columns for c in admin_cols):
        for col in admin_cols:
            # Compare as str(), so NaN on both sides ("nan") counts as equal.
            sg_txt = np.array([str(v) for v in sg[col]], dtype=object)
            cw_txt = np.array([str(v) for v in cw[col]], dtype=object)
            keep &= np.asarray(sg_txt[idx] == cw_txt[:, None], dtype=bool)

    # np.nonzero walks the mask row by row, i.e. crosswalk by crosswalk and,
    # within one crosswalk, in the neighbour order returned by the query.
    cw_pos, nbr_rank = np.nonzero(keep)
    sg_pos = idx[cw_pos, nbr_rank]
    dist_sel = dist_m[cw_pos, nbr_rank]
    conf = np.maximum(0.0, 1.0 - dist_sel / float(R_m))

    def _admin_values(col: str) -> np.ndarray:
        # Prefer the crosswalk's value, then the signal's, then None.
        if col in cw.columns:
            return cw[col].to_numpy()[cw_pos]
        if col in sg.columns:
            return sg[col].to_numpy()[sg_pos]
        return np.full(len(cw_pos), None, dtype=object)

    # Taking ids column-wise keeps their dtype (row-wise iloc access would
    # upcast integer ids to float in an all-numeric frame).
    return pd.DataFrame(
        {
            "crosswalk_id": cw["crosswalk_id"].to_numpy()[cw_pos],
            "signal_id": sg["signal_id"].to_numpy()[sg_pos],
            "distance_m": np.round(dist_sel, 3),
            "confidence": np.round(conf, 6),
            "sido": _admin_values("sido"),
            "sigungu": _admin_values("sigungu"),
        },
        columns=out_cols,
    )


In [3]:
# -----------------------------
# 3) 실행: 필터링 + 저장 + 매핑 생성
# -----------------------------

# Read both raw tables; read_csv_any also reports which encoding was used.
signals_df, sig_enc = read_csv_any("signals.csv")
crosswalks_df, cw_enc = read_csv_any("crosswalks.csv")
print("loaded encodings:", {"signals": sig_enc, "crosswalks": cw_enc})
print("signals shape:", signals_df.shape, "crosswalks shape:", crosswalks_df.shape)

# (1) Signals: keep rows whose signal_type, zero-padded to two characters,
#     equals "02" (pedestrian light, per the original author's note).
# NOTE(review): if signal_type was parsed as float (e.g. 2.0), astype(str)
# yields "2.0" and the row is not matched - confirm the column's dtype.
signal_type_code = signals_df["signal_type"].astype(str).str.zfill(2)
signals_passenger = signals_df.loc[signal_type_code == "02"].copy()

# (2) Crosswalks: keep rows whose "signal" flag is Y after trimming and
#     upper-casing (a pedestrian signal exists, per the original note).
signal_flag = crosswalks_df["signal"].astype(str).str.strip().str.upper()
crosswalks_passenger = crosswalks_df.loc[signal_flag == "Y"].copy()

# (3) Build the crosswalk -> signal mapping from the filtered frames. The
#     original note says this runs on the data "before drop" because the
#     mapping needs the coordinate columns.
mapping_df = build_crosswalk_signal_map(
    signal_df=signals_passenger,
    crosswalk_df=crosswalks_passenger,
    R_m=30,
    top_k=2,
    use_admin_filter=True,
)

# (4) Save the two filtered tables and the mapping, without the index column.
for frame, out_path in (
    (signals_passenger, "signals_passenger.csv"),
    (crosswalks_passenger, "crosswalks_passenger.csv"),
    (mapping_df, "crosswalk_signal_map.csv"),
):
    frame.to_csv(out_path, index=False, encoding="utf-8-sig")

loaded encodings: {'signals': 'cp949', 'crosswalks': 'cp949'}
signals shape: (50000, 15) crosswalks shape: (50000, 19)
