In [5]:
import re
import numpy as np
import pandas as pd

# File locations used by this cell (relative to the working directory).
IN_PATH = "./seoul_cw.csv"
DIST_PATH = "./district_all.csv"
OUT_PATH = "./seoul_cw_new.csv"

# Output column order, kept identical to the column schema of
# crosswalks_passenger_unique.csv.
TARGET_COLS = [
    "sido",
    "sigungu",
    "address",
    "crosswalk_type",
    "highland",
    "crosswalk_lat",
    "crosswalk_lon",
    "roadnum",
    "crosswalk_width",
    "crosswalk_length",
    "signal",
    "button",
    "sound_signal",
    "bump",
    "braille_block",
    "spotlight",
    "org_code",
    "cw_uid",
]

# Pattern for a stand-alone LINESTRING at the end of the text.
# - The look-behind refuses a LINESTRING keyword that is the tail of a longer
#   word (e.g. MULTILINESTRING) while still allowing a prefix such as "SRID=...;".
# - The body excludes parentheses, so nested geometries are rejected as a whole
#   instead of being half-parsed; a character class (unlike ".") also lets the
#   body span line breaks.
_LINESTRING_RE = re.compile(
    r"(?<![A-Za-z])LINESTRING\s*\(\s*([^()]+?)\s*\)\s*$",
    flags=re.IGNORECASE,
)


def parse_linestring_wkt(wkt: str):
    """
    Parse 'LINESTRING(lon lat, lon lat, ...)' into [(lon, lat), (lon, lat), ...].

    Args:
        wkt: WKT text of a single LINESTRING. Missing values (None / NaN) are
            accepted; any other value is converted with ``str()`` first.

    Returns:
        A list of ``(lon, lat)`` float tuples in the order they appear in the
        text. Only the first two numbers of each point are used. Returns ``[]``
        when the value is missing or is not a plain LINESTRING (including
        MULTILINESTRING and LINESTRINGs nested inside other geometries).
        Individual points that cannot be parsed are skipped rather than
        failing the whole geometry.
    """
    if pd.isna(wkt):
        return []

    hit = _LINESTRING_RE.search(str(wkt).strip())
    if hit is None:
        return []

    coords = []
    for point in hit.group(1).split(","):
        # "lon lat" separated by any amount of whitespace.
        fields = point.split()
        if len(fields) < 2:
            continue
        try:
            coords.append((float(fields[0]), float(fields[1])))
        except ValueError:
            # Best-effort: drop the malformed point, keep the rest.
            continue
    return coords

# 1) Load the source table.
df = pd.read_csv(IN_PATH, encoding="utf-8-sig")

# 2) WKT -> list of (lon, lat) tuples per row.
df["__coords"] = df["링크 WKT"].apply(parse_linestring_wkt)

# 3) Explode the coordinate lists so that every coordinate pair becomes one row.
df_exp = df.explode("__coords", ignore_index=True)

# 4) Split each tuple into lon/lat; anything that is not a tuple becomes NaN.
df_exp["crosswalk_lon"] = df_exp["__coords"].apply(lambda t: t[0] if isinstance(t, tuple) else np.nan)
df_exp["crosswalk_lat"] = df_exp["__coords"].apply(lambda t: t[1] if isinstance(t, tuple) else np.nan)

# Drop rows that ended up without coordinates.
df_exp = df_exp.dropna(subset=["crosswalk_lon", "crosswalk_lat"]).reset_index(drop=True)

# Load the district lookup table.
district = pd.read_csv(DIST_PATH, encoding="cp949")

# Exact duplicate (code, name) pairs are harmless, so collapse them up front.
district_lookup = district[["district_code", "district_name"]].drop_duplicates()

# Attach the district name; the "address" column of df_exp holds the district code.
# validate="many_to_one" makes pandas raise if a district_code still maps to
# more than one row: a left merge would otherwise add rows and silently break
# the one-output-row-per-coordinate alignment below.
df_exp = df_exp.merge(
    district_lookup,
    left_on="address",
    right_on="district_code",
    how="left",
    validate="many_to_one",
)

# 5) Build the output in the target schema. `out` is created only after the
#    merge and every column is taken from the same frame, so all columns share
#    one index. Columns with no source data are filled with defaults / NaN.
out = pd.DataFrame(index=df_exp.index)
out["sido"] = df_exp.get("sido")
out["sigungu"] = df_exp.get("sigungu")

# The output address is the district name resolved from the code.
out["address"] = df_exp["district_name"]

# Defaults for the remaining fields (adjust to the project's rules if needed).
out["crosswalk_type"] = 1
out["highland"] = 0
out["crosswalk_lat"] = df_exp["crosswalk_lat"].astype(float)
out["crosswalk_lon"] = df_exp["crosswalk_lon"].astype(float)

out["roadnum"] = np.nan
out["crosswalk_width"] = np.nan
out["crosswalk_length"] = np.nan

out["signal"] = np.nan
out["button"] = np.nan
out["sound_signal"] = np.nan
out["bump"] = np.nan
out["braille_block"] = np.nan
out["spotlight"] = np.nan

# No org_code is available in this data, so it is left as NaN for now; fill it
# in once an administrative-organisation code mapping is available.
out["org_code"] = np.nan

# 6) cw_uid: Seoul-specific prefix + zero-padded sequence number starting at 1.
out["cw_uid"] = [f"CW_SEOUL_{i:06d}" for i in range(1, len(out) + 1)]

# 7) Fix the column order and save.
out = out[TARGET_COLS]
out.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")

print("rows:", len(out))
print("saved:", OUT_PATH)
out.head(5)

rows: 698061
saved: ./seoul_cw_new.csv


Unnamed: 0,sido,sigungu,address,crosswalk_type,highland,crosswalk_lat,crosswalk_lon,roadnum,crosswalk_width,crosswalk_length,signal,button,sound_signal,bump,braille_block,spotlight,org_code,cw_uid
0,서울특별시,종로구,서울특별시 종로구 효자동,1,0,37.583205,126.970231,,,,,,,,,,,CW_SEOUL_000001
1,서울특별시,종로구,서울특별시 종로구 효자동,1,0,37.583137,126.970243,,,,,,,,,,,CW_SEOUL_000002
2,서울특별시,종로구,서울특별시 종로구 부암동,1,0,37.590454,126.962318,,,,,,,,,,,CW_SEOUL_000003
3,서울특별시,종로구,서울특별시 종로구 부암동,1,0,37.590421,126.962096,,,,,,,,,,,CW_SEOUL_000004
4,서울특별시,종로구,서울특별시 종로구 창신동,1,0,37.572635,127.013921,,,,,,,,,,,CW_SEOUL_000005
