<a href="https://colab.research.google.com/github/janghjun/changwon_data_project/blob/main/%EC%B0%BD%EC%9B%90%EC%8B%9C_%EA%B3%B5%EA%B3%B5%EB%8D%B0%EC%9D%B4%ED%84%B0_%EA%B3%B5%EB%AA%A8%EC%A0%84_%EC%9E%A5%ED%98%95%EC%A4%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 수집을 위한 과정 모음
- API키셋팅
- STEP별 데이터 수집
- 추후 정리 희망

## API키셋 .env로 저장 및 로드하기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os, textwrap

# 비밀폴더 경로(원하면 변경 가능)
SECRET_DIR = "/content/drive/MyDrive/.secrets/changwon"
os.makedirs(SECRET_DIR, exist_ok=True)
print("Secret dir:", SECRET_DIR)

Mounted at /content/drive
Secret dir: /content/drive/MyDrive/.secrets/changwon


In [2]:
# 5cWyKWHQGpmhdcKtwApnKT2BrSjLht330TRzM7cFG42eWPEKdkB7b1Z04QGpmSYWUt5T3cOjG6dHSj9V9LH6JQ

ENC_KEY = "5cWyKWHQGpmhdcKtwApnKT2BrSjLht330TRzM7cFG42eWPEKdkB7b1Z04QGpmSYWUt5T3cOjG6dHSj9V9LH6JQ%3D%3D"

ENV_PATH = f"{SECRET_DIR}/.env"
with open(ENV_PATH, "w") as f:
    f.write(textwrap.dedent(f"""
    # 창원 BIS API (URL-Encoded)
    CHANGWON_KEY_URLENC={ENC_KEY.strip()}
    """).strip()+"\n")

print("Wrote:", ENV_PATH)

Wrote: /content/drive/MyDrive/.secrets/changwon/.env


In [3]:
!pip -q install python-dotenv

import os
from dotenv import load_dotenv

loaded = load_dotenv(ENV_PATH)  # 특정 경로의 .env
print("dotenv loaded:", loaded)

KEY = os.getenv("CHANGWON_KEY_URLENC")
print("KEY loaded:", bool(KEY))
print("KEY preview (앞 6자/뒤 6자만):", (KEY[:6] + "..." + KEY[-6:]) if KEY else None)

dotenv loaded: True
KEY loaded: True
KEY preview (앞 6자/뒤 6자만): 5cWyKW...%3D%3D


In [4]:
import urllib.parse as up
decoded = up.unquote(KEY) if KEY else ""
print("Decoded endswith '==' ?", decoded.endswith("=="))
assert decoded.endswith("=="), "인코딩 키가 아닌 값이거나 잘못 복사된 것 같아요."
print("OK: .env → 환경변수 로드 성공")

Decoded endswith '==' ? True
OK: .env → 환경변수 로드 성공


## .env파일 세션에서 바로 가져오기
- 추후 .gitignore까지 추가하여 깃레퍼지토리에 기록하기!

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv

SECRET_DIR = "/content/drive/MyDrive/.secrets/changwon"
ENV_PATH = f"{SECRET_DIR}/.env"

# 이미 설치되어 있다면 생략 가능
!pip -q install python-dotenv

assert os.path.exists(ENV_PATH), f".env가 없어요: {ENV_PATH}"
load_dotenv(ENV_PATH)

import urllib.parse as up
KEY = os.getenv("CHANGWON_KEY_URLENC")
print("KEY loaded:", bool(KEY))
print("Decoded OK?", up.unquote(KEY).endswith("=="))

Mounted at /content/drive
KEY loaded: True
Decoded OK? True


## Step 2. TAGO API 준비

### Step 2-1. 공통 준비 (키 로드)

In [3]:
import os, requests, pandas as pd
from dotenv import load_dotenv
from urllib.parse import unquote

load_dotenv("/content/drive/MyDrive/.secrets/changwon/.env")
KEY = os.getenv("CHANGWON_KEY_URLENC")
assert KEY and unquote(KEY).endswith("==")
BASE = "https://apis.data.go.kr/1613000"
OUT = "/content/drive/MyDrive/data_j/raw/tago"
os.makedirs(OUT, exist_ok=True)

### Step 2-2. 창원 cityCode 자동 조회

In [4]:
import os, requests, json, time
from urllib.parse import unquote

BASES = ["https://apis.data.go.kr/1613000", "http://apis.data.go.kr/1613000"]
URL   = "/BusSttnInfoInqireService/getCtyCodeList"

# 1) .env에서 불러온 (Encoding) 키
ENC_KEY = os.environ["CHANGWON_KEY_URLENC"]
print("ENC_KEY preview:", ENC_KEY[:6], "...", ENC_KEY[-6:])

# 2) Decoding 키 (requests의 params가 다시 인코딩하므로 이 값을 쓰는 게 안전)
DEC_KEY = unquote(ENC_KEY)
print("DEC_KEY endswith '=='? ->", DEC_KEY.endswith("=="))

def try_call(base, key, label):
    url = f"{base}{URL}"
    try:
        r = requests.get(url, params={"serviceKey": key, "_type":"json"}, timeout=15)
        print(f"\n[{label}] {url} -> status {r.status_code}, len {len(r.content)}")
        # JSON 파싱 시도 (XML이면 타입 바꾸거나 응답 검사)
        data = r.json()
        # 최소 구조 확인
        items = data["response"]["body"]["items"]["item"]
        print(f"items count ~> {len(items) if isinstance(items, list) else 1}")
        return data
    except Exception as e:
        print(f"[{label}] FAIL:", repr(e))
        return None

# 우선순위: (HTTPS, DEC_KEY) -> (HTTP, DEC_KEY) -> (HTTPS, ENC_KEY 직접쿼리) -> (HTTP, ENC_KEY 직접쿼리)
data = None
order = [
    (BASES[0], DEC_KEY, "HTTPS + DEC_KEY(params)"),
    (BASES[1], DEC_KEY, "HTTP  + DEC_KEY(params)"),
]

for base, key, label in order:
    data = try_call(base, key, label)
    if data:
        break

if not data:
    # 최후 수단: serviceKey를 쿼리스트링에 직접 붙여서 (재인코딩 회피)
    for base in BASES:
        url = f"{base}{URL}?serviceKey={ENC_KEY}&_type=json"
        try:
            r = requests.get(url, timeout=15)
            print(f"\n[DIRECT URL] {url[:80]}... -> status {r.status_code}, len {len(r.content)}")
            data = r.json()
            items = data["response"]["body"]["items"]["item"]
            print(f"items count ~> {len(items) if isinstance(items, list) else 1}")
            break
        except Exception as e:
            print("[DIRECT URL] FAIL:", repr(e))

if not data:
    raise SystemExit("모든 조합 실패. 네트워크/키/계정 상태를 다시 확인해주세요.")
else:
    # '창원' 코드 뽑기
    items = data["response"]["body"]["items"]["item"]
    if not isinstance(items, list):
        items = [items]
    city = [it for it in items if "창원" in it.get("cityname","")][0]
    CITY_CODE = str(city["citycode"])
    print("\n성공 CITY_CODE =", CITY_CODE, "| cityname:", city["cityname"])

ENC_KEY preview: 5cWyKW ... %3D%3D
DEC_KEY endswith '=='? -> True
[HTTPS + DEC_KEY(params)] FAIL: SSLError(MaxRetryError("HTTPSConnectionPool(host='apis.data.go.kr', port=443): Max retries exceeded with url: /1613000/BusSttnInfoInqireService/getCtyCodeList?serviceKey=5cWyKWHQGpmhdcKtwApnKT2BrSjLht330TRzM7cFG42eWPEKdkB7b1Z04QGpmSYWUt5T3cOjG6dHSj9V9LH6JQ%3D%3D&_type=json (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_ILLEGAL_PARAMETER] sslv3 alert illegal parameter (_ssl.c:1010)')))"))

[HTTP  + DEC_KEY(params)] http://apis.data.go.kr/1613000/BusSttnInfoInqireService/getCtyCodeList -> status 200, len 5694
items count ~> 132

성공 CITY_CODE = 38010 | cityname: 창원시


### TAGO 정적 덤프 생성 (stops/routes/route_stops)

In [9]:
# 공통 세팅 (HTTP + DEC_KEY)
import os, requests, pandas as pd, numpy as np
from dotenv import load_dotenv
from urllib.parse import unquote

# .env 로드
load_dotenv("/content/drive/MyDrive/.secrets/changwon/.env")
ENC_KEY = os.getenv("CHANGWON_KEY_URLENC")
DEC_KEY = unquote(ENC_KEY)  # requests params 재인코딩 방지
BASE = "http://apis.data.go.kr/1613000"  # 이번 세션은 HTTP가 정상
OUT  = "/content/drive/MyDrive/data_j/raw/tago"
os.makedirs(OUT, exist_ok=True)

CITY_CODE = "38010"  # 앞에서 구한 값(창원시)
print("BASE:", BASE, "| CITY_CODE:", CITY_CODE)

BASE: http://apis.data.go.kr/1613000 | CITY_CODE: 38010


In [10]:
#안전 파셔 및 정류소 스캔
import os, time, requests, pandas as pd, numpy as np
from urllib.parse import unquote

BASE = "http://apis.data.go.kr/1613000"   # 이번 세션은 HTTP가 정상
ENC_KEY = os.getenv("CHANGWON_KEY_URLENC")
DEC_KEY = unquote(ENC_KEY)
CITY_CODE = "38010"  # 앞에서 구한 창원시 코드

def extract_items(js):
    """
    TAGO 공통응답에서 items를 안전하게 꺼낸다.
    - body.items 가 ""(빈문자)인 경우 → []
    - body.items.item 이 dict → [dict]
    - body.items.item 이 list → list
    """
    try:
        header = js["response"]["header"]
        if str(header.get("resultCode")) != "00":
            # 정상코드 외엔 빈 리스트 반환
            return []
        body = js["response"]["body"]
    except Exception:
        return []

    items = body.get("items", None)
    if items in (None, "", []):
        return []
    if isinstance(items, list):
        return items
    if isinstance(items, dict):
        it = items.get("item", None)
        if it is None:
            return []
        if isinstance(it, list):
            return it
        return [it]
    # 그 외 타입 방어
    return []

def get_nearby_stops(lat, lon):
    url = f"{BASE}/BusSttnInfoInqireService/getCrdntPrxmtSttnList"
    p = {"serviceKey": DEC_KEY, "_type": "json", "gpsLati": lat, "gpsLong": lon}
    r = requests.get(url, params=p, timeout=20)
    js = r.json()
    return extract_items(js)

# 🔎 처음엔 트래픽을 줄여 “작은 영역”만 테스트하고, 정상 확인 후 범위를 넓혀가자.
# 창원 대략 영역(점검용): 간격 0.02도(≈2.2km)로 성능 점검 → OK면 0.01로 촘촘히
lats = np.arange(35.16, 35.30, 0.02)
lons = np.arange(128.55,128.78, 0.02)

seen, rows = set(), []
for la in lats:
    for lo in lons:
        try:
            for it in get_nearby_stops(la, lo):
                nid = str(it.get("nodeid", "")).strip()
                if nid and nid not in seen:
                    seen.add(nid)
                    rows.append(it)
            time.sleep(0.15)   # 매너 타임(트래픽/차단 예방)
        except Exception as e:
            # 일시 오류는 건너뛰고 계속
            pass

stops_df = pd.DataFrame(rows)
OUT = "/content/drive/MyDrive/data_j/raw/tago"
os.makedirs(OUT, exist_ok=True)
stops_df.to_csv(f"{OUT}/stops_tago.csv", index=False)
print("stops_tago.csv 저장:", stops_df.shape)
stops_df.head(5)

stops_tago.csv 저장: (302, 6)


Unnamed: 0,citycode,gpslati,gpslong,nodeid,nodenm,nodeno
0,38010,35.157436,128.552931,CWB379001513,우산마을,306613
1,38010,35.158323,128.553851,CWB379001508,우산마을,306612
2,38010,35.158363,128.610666,CWB379001503,석교종점,207802
3,38010,35.158206,128.610603,CWB379001505,석교종점,207801
4,38010,35.163892,128.608183,CWB379001474,갯마을,207803


In [13]:
import pandas as pd, os

OUT = "/content/drive/MyDrive/data_j/raw/tago"
df = pd.read_csv(f"{OUT}/stops_tago.csv", dtype=str)
# UTF-8 with BOM (권장)
df.to_csv(f"{OUT}/stops_tago_utf8sig.csv", index=False, encoding="utf-8-sig")

In [14]:
import pandas as pd, os

OUT = "/content/drive/MyDrive/data_j/raw/tago"
df = pd.read_csv(f"{OUT}/stops_tago.csv", dtype=str)

# 한글 몇 개만 확인
df[["nodeid","nodenm"]].head(10)

Unnamed: 0,nodeid,nodenm
0,CWB379001513,우산마을
1,CWB379001508,우산마을
2,CWB379001503,석교종점
3,CWB379001505,석교종점
4,CWB379001474,갯마을
5,CWB379001473,갯마을
6,CWB379001475,귀산
7,CWB379003181,귀산
8,CWB379001487,진해남중
9,CWB379001489,진해남중


### 공통 준비(안전 파서 재사용 + 출력 경로)

In [5]:
import os, time, requests, pandas as pd
from urllib.parse import unquote

BASE = "http://apis.data.go.kr/1613000"     # 이번 세션은 HTTP가 정상
ENC_KEY = os.getenv("CHANGWON_KEY_URLENC")
DEC_KEY = unquote(ENC_KEY)                  # params 재인코딩 방지
CITY_CODE = "38010"                         # 창원시
OUT = "/content/drive/MyDrive/data_j/raw/tago"
os.makedirs(OUT, exist_ok=True)

def extract_items(js):
    """
    TAGO 공통 응답에서 items를 안전하게 꺼낸다.
    - body.items = "" 인 경우 → []
    - body.items.item 이 dict → [dict]
    - body.items.item 이 list → list
    """
    try:
        if str(js["response"]["header"].get("resultCode")) != "00":
            return []
        body = js["response"]["body"]
    except Exception:
        return []
    items = body.get("items", None)
    if items in (None, "", []): return []
    if isinstance(items, list):  return items
    if isinstance(items, dict):
        it = items.get("item", None)
        if it is None: return []
        return it if isinstance(it, list) else [it]
    return []

### 노선 목록 덤프(routes)

In [6]:
def get_route_nolist(city_code):
    url = f"{BASE}/BusRouteInfoInqireService/getRouteNoList"
    p = {"serviceKey": DEC_KEY, "_type":"json", "cityCode": city_code}
    r = requests.get(url, params=p, timeout=30)
    return extract_items(r.json())

routes = get_route_nolist(CITY_CODE)
routes_df = pd.DataFrame(routes)

# 저장: 분석용(utf-8) + 엑셀용(utf-8-sig)
routes_df.to_csv(f"{OUT}/routes_tago.csv", index=False)
routes_df.to_csv(f"{OUT}/routes_tago_utf8sig.csv", index=False, encoding="utf-8-sig")

print("routes_tago.csv 저장:", routes_df.shape)
display(routes_df.head(5))

routes_tago.csv 저장: (10, 7)


Unnamed: 0,endnodenm,endvehicletime,routeid,routeno,routetp,startnodenm,startvehicletime
0,창원역,2210,CWB379000010,1,마을버스,신전,540
1,창원역,2145,CWB379000020,2,마을버스,유등,720
2,진영시외주차장,1655,CWB379000030,3,마을버스,상리,755
3,자여입구,2300,CWB379000070,7,마을버스,창원역,617
4,삼성창원병원종점(8),2240,CWB379000080,8,마을버스,창원역,600


### 노선별 경유정류소 덤프(route_stops)

In [7]:
def get_route_stops(city_code, route_id):
    url = f"{BASE}/BusRouteInfoInqireService/getRouteAcctoThrghSttnList"
    p = {"serviceKey": DEC_KEY, "_type":"json", "cityCode": city_code, "routeId": route_id}
    r = requests.get(url, params=p, timeout=30)
    return extract_items(r.json())

rs_rows = []
for rid in pd.Series(routes_df.get("routeid")).dropna().astype(str).unique():
    try:
        rs_rows += get_route_stops(CITY_CODE, rid)
    except Exception:
        pass
    time.sleep(0.05)  # 매너 대기

route_stops_df = pd.DataFrame(rs_rows)

# 저장: 분석용 + 엑셀용
route_stops_df.to_csv(f"{OUT}/route_stops_tago.csv", index=False)
route_stops_df.to_csv(f"{OUT}/route_stops_tago_utf8sig.csv", index=False, encoding="utf-8-sig")

print("route_stops_tago.csv 저장:", route_stops_df.shape)
display(route_stops_df.head(5))

route_stops_tago.csv 저장: (100, 8)


Unnamed: 0,gpslati,gpslong,nodeid,nodenm,nodeno,nodeord,routeid,updowncd
0,35.367096,128.674078,CWB379003142,신전,103719,1,CWB379000010,0
1,35.36602,128.677797,CWB379000045,상리,100702,2,CWB379000010,0
2,35.366533,128.680975,CWB379000041,갈전운동장,100703,3,CWB379000010,0
3,35.367938,128.684083,CWB379000022,일동회관,100705,4,CWB379000010,0
4,35.367864,128.687909,CWB379000027,일동초등학교,103707,5,CWB379000010,0


### 표준 컬럼명 리네임(+ 엑셀용 함께 저장)

In [15]:
# 정류소(full)은 앞 단계에서 만든 stops_tago.csv 사용
stops_full = pd.read_csv(f"{OUT}/stops_tago.csv", dtype=str)

stops_std = stops_full.rename(columns={
    "nodeid":"station_id", "nodenm":"station_name",
    "gpslati":"lat", "gpslong":"lon"
})
routes_std = routes_df.rename(columns={
    "routeid":"route_id", "routeno":"route_num", "routetp":"route_tp"
})
route_stops_std = route_stops_df.rename(columns={
    "routeid":"route_id", "nodeid":"station_id", "seq":"station_ord"
})

# 분석용 저장(utf-8)
stops_std.to_csv(f"{OUT}/stops_std.csv", index=False)
routes_std.to_csv(f"{OUT}/routes_std.csv", index=False)
route_stops_std.to_csv(f"{OUT}/route_stops_std.csv", index=False)

# 엑셀 공유용 저장(utf-8-sig)
stops_std.to_csv(f"{OUT}/stops_std_utf8sig.csv", index=False, encoding="utf-8-sig")
routes_std.to_csv(f"{OUT}/routes_std_utf8sig.csv", index=False, encoding="utf-8-sig")
route_stops_std.to_csv(f"{OUT}/route_stops_std_utf8sig.csv", index=False, encoding="utf-8-sig")

print("표준 저장 완료")
display(stops_std.head(3))
display(routes_std.head(3))
display(route_stops_std.head(3))

표준 저장 완료


Unnamed: 0,citycode,lat,lon,station_id,station_name,nodeno
0,38010,35.15743613,128.5529312,CWB379001513,우산마을,306613
1,38010,35.15832274,128.5538513,CWB379001508,우산마을,306612
2,38010,35.15836293,128.61066628,CWB379001503,석교종점,207802


Unnamed: 0,endnodenm,endvehicletime,route_id,route_num,route_tp,startnodenm,startvehicletime
0,창원역,2210,CWB379000010,1,마을버스,신전,540
1,창원역,2145,CWB379000020,2,마을버스,유등,720
2,진영시외주차장,1655,CWB379000030,3,마을버스,상리,755


Unnamed: 0,gpslati,gpslong,station_id,nodenm,nodeno,nodeord,route_id,updowncd
0,35.367096,128.674078,CWB379003142,신전,103719,1,CWB379000010,0
1,35.36602,128.677797,CWB379000045,상리,100702,2,CWB379000010,0
2,35.366533,128.680975,CWB379000041,갈전운동장,100703,3,CWB379000010,0


### 참조 무결성(FK) 점검

In [16]:
fk_route = route_stops_std["route_id"].isin(routes_std["route_id"]).mean()
fk_stop  = route_stops_std["station_id"].isin(stops_std["station_id"]).mean()

print("rows(stops, routes, route_stops):", len(stops_std), len(routes_std), len(route_stops_std))
print("FK(route_stops→routes):", round(fk_route,4))
print("FK(route_stops→stops): ", round(fk_stop,4))

rows(stops, routes, route_stops): 302 10 100
FK(route_stops→routes): 1.0
FK(route_stops→stops):  0.09


### (FK값 하락으로 정비)라우트 전량 + 경유정류소 전량(페이지네이션)

In [17]:
# 셀 A) routes 전량 수집 (페이지네이션)
import math, time, requests, pandas as pd
from urllib.parse import unquote
import os

BASE = "http://apis.data.go.kr/1613000"
DEC_KEY = unquote(os.getenv("CHANGWON_KEY_URLENC"))
CITY_CODE = "38010"
OUT = "/content/drive/MyDrive/data_j/raw/tago"

def extract_items(js):
    try:
        if str(js["response"]["header"].get("resultCode")) != "00":
            return [], 0
        body = js["response"]["body"]
        total = int(body.get("totalCount", 0))
    except Exception:
        return [], 0
    items = body.get("items", None)
    if items in (None, "", []): return [], total
    if isinstance(items, list):  return items, total
    if isinstance(items, dict):
        it = items.get("item", None)
        if it is None: return [], total
        return (it if isinstance(it, list) else [it]), total
    return [], total

def get_route_nolist_all(city_code, page_size=200):
    all_rows = []
    page = 1
    while True:
        url = f"{BASE}/BusRouteInfoInqireService/getRouteNoList"
        p   = {"serviceKey": DEC_KEY, "_type":"json",
               "cityCode": city_code, "pageNo": page, "numOfRows": page_size}
        js  = requests.get(url, params=p, timeout=30).json()
        rows, total = extract_items(js)
        if not rows: break
        all_rows += rows
        if page * page_size >= total: break
        page += 1; time.sleep(0.05)
    return pd.DataFrame(all_rows)

routes_df = get_route_nolist_all(CITY_CODE, page_size=200)
routes_df.to_csv(f"{OUT}/routes_tago.csv", index=False)
routes_df.to_csv(f"{OUT}/routes_tago_utf8sig.csv", index=False, encoding="utf-8-sig")
print("routes_tago.csv:", routes_df.shape)

# 셀 B) 모든 노선의 경유정류소 전량
def get_route_stops_all(city_code, route_id, page_size=200):
    all_rows = []; page = 1
    while True:
        url = f"{BASE}/BusRouteInfoInqireService/getRouteAcctoThrghSttnList"
        p   = {"serviceKey": DEC_KEY, "_type":"json",
               "cityCode": city_code, "routeId": route_id,
               "pageNo": page, "numOfRows": page_size}
        js  = requests.get(url, params=p, timeout=30).json()
        rows, total = extract_items(js)
        if not rows: break
        all_rows += rows
        if page * page_size >= total: break
        page += 1; time.sleep(0.03)
    return all_rows

rs_rows = []
for rid in pd.Series(routes_df.get("routeid")).dropna().astype(str).unique():
    try:
        rs_rows += get_route_stops_all(CITY_CODE, rid, page_size=200)
    except Exception:
        pass
    time.sleep(0.02)

route_stops_df = pd.DataFrame(rs_rows)
route_stops_df.to_csv(f"{OUT}/route_stops_tago.csv", index=False)
route_stops_df.to_csv(f"{OUT}/route_stops_tago_utf8sig.csv", index=False, encoding="utf-8-sig")
print("route_stops_tago.csv:", route_stops_df.shape)

routes_tago.csv: (169, 7)
route_stops_tago.csv: (16745, 8)


### 정류소 전수: getSttnNoList (페이지네이션)

In [18]:
import os, time, requests, pandas as pd
from urllib.parse import unquote

BASE = "http://apis.data.go.kr/1613000"         # 이번 세션은 HTTP가 정상
DEC_KEY = unquote(os.getenv("CHANGWON_KEY_URLENC"))
CITY_CODE = "38010"                              # 창원시
OUT = "/content/drive/MyDrive/data_j/raw/tago"
os.makedirs(OUT, exist_ok=True)

def extract_items_and_total(js):
    """items(list)와 totalCount(int)를 안전하게 추출"""
    try:
        if str(js["response"]["header"].get("resultCode")) != "00":
            return [], 0
        body = js["response"]["body"]
        total = int(body.get("totalCount", 0))
    except Exception:
        return [], 0
    items = body.get("items", None)
    if items in (None, "", []): return [], total
    if isinstance(items, list):  return items, total
    if isinstance(items, dict):
        it = items.get("item", None)
        if it is None: return [], total
        return (it if isinstance(it, list) else [it]), total
    return [], total

def get_stops_by_sttnno_all(city_code, page_size=500):
    """정류소번호 목록조회(getSttnNoList) 전량 수집"""
    all_rows = []
    page = 1
    while True:
        url = f"{BASE}/BusSttnInfoInqireService/getSttnNoList"
        p   = {"serviceKey": DEC_KEY, "_type":"json",
               "cityCode": city_code, "pageNo": page, "numOfRows": page_size}
        r   = requests.get(url, params=p, timeout=30)
        rows, total = extract_items_and_total(r.json())
        if not rows:
            break
        all_rows += rows
        if page * page_size >= total:
            break
        page += 1
        time.sleep(0.03)  # 매너 대기
    return pd.DataFrame(all_rows)

stops_full = get_stops_by_sttnno_all(CITY_CODE, page_size=500)
print("stops_full shape:", stops_full.shape)
display(stops_full.head(5))

# 저장: 분석용(utf-8) + 엑셀용(utf-8-sig)
stops_full.to_csv(f"{OUT}/stops_tago.csv", index=False)
stops_full.to_csv(f"{OUT}/stops_tago_utf8sig.csv", index=False, encoding="utf-8-sig")

# 표준 컬럼으로 리네임(+ 엑셀용 함께 저장)
stops_std = stops_full.rename(columns={
    "nodeid":"station_id", "nodenm":"station_name", "gpslati":"lat", "gpslong":"lon"
})
stops_std.to_csv(f"{OUT}/stops_std.csv", index=False)
stops_std.to_csv(f"{OUT}/stops_std_utf8sig.csv", index=False, encoding="utf-8-sig")
print("stops_std saved:", stops_std.shape)

stops_full shape: (2725, 5)


Unnamed: 0,gpslati,gpslong,nodeid,nodenm,nodeno
0,35.330126,128.600799,CWB379000164,신동,119340
1,35.328941,128.589327,CWB379000165,북면양촌,113603
2,35.327622,128.687349,CWB379000166,백양마을,100205
3,35.327495,128.68733,CWB379000167,백양마을,100206
4,35.328692,128.584734,CWB379000168,신음,113602


stops_std saved: (2725, 5)


### 라우트·경유정류소 표준화 재적재

In [19]:
# 기존에 전량으로 받은 routes/route_stops 재적재
routes_df = pd.read_csv(f"{OUT}/routes_tago.csv", dtype=str)
route_stops_df = pd.read_csv(f"{OUT}/route_stops_tago.csv", dtype=str)

routes_std = routes_df.rename(columns={"routeid":"route_id","routeno":"route_num","routetp":"route_tp"})
route_stops_std = route_stops_df.rename(columns={"routeid":"route_id","nodeid":"station_id","seq":"station_ord"})

routes_std.to_csv(f"{OUT}/routes_std.csv", index=False)
routes_std.to_csv(f"{OUT}/routes_std_utf8sig.csv", index=False, encoding="utf-8-sig")
route_stops_std.to_csv(f"{OUT}/route_stops_std.csv", index=False)
route_stops_std.to_csv(f"{OUT}/route_stops_std_utf8sig.csv", index=False, encoding="utf-8-sig")

display(routes_std.head(3))
display(route_stops_std.head(3))

Unnamed: 0,endnodenm,endvehicletime,route_id,route_num,route_tp,startnodenm,startvehicletime
0,창원역,2210,CWB379000010,1,마을버스,신전,540
1,창원역,2145,CWB379000020,2,마을버스,유등,720
2,진영시외주차장,1655,CWB379000030,3,마을버스,상리,755


Unnamed: 0,gpslati,gpslong,station_id,nodenm,nodeno,nodeord,route_id,updowncd
0,35.36709605,128.6740781,CWB379003142,신전,103719,1,CWB379000010,0
1,35.36602048,128.6777968,CWB379000045,상리,100702,2,CWB379000010,0
2,35.36653344,128.6809755,CWB379000041,갈전운동장,100703,3,CWB379000010,0


### FK 무결성 재점검 (전량 기준)

In [20]:
fk_route = route_stops_std["route_id"].isin(routes_std["route_id"]).mean()
fk_stop  = route_stops_std["station_id"].isin(stops_std["station_id"]).mean()

print("rows(stops, routes, route_stops):", len(stops_std), len(routes_std), len(route_stops_std))
print("FK(route_stops→routes):", round(fk_route,4))
print("FK(route_stops→stops): ", round(fk_stop,4))

rows(stops, routes, route_stops): 2725 169 16745
FK(route_stops→routes): 1.0
FK(route_stops→stops):  1.0


### Step3: 도착정보 샘플링

In [21]:
import os, time, math, requests, pandas as pd, numpy as np
from urllib.parse import unquote
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

# 고정 세팅
BASE = "http://apis.data.go.kr/1613000"   # 이번 세션은 HTTP가 정상
CITY_CODE = "38010"                       # 창원시
KST = ZoneInfo("Asia/Seoul")

# 키 로드 (Decoding Key: ==)
ENC_KEY = os.getenv("CHANGWON_KEY_URLENC")
assert ENC_KEY, "환경변수 CHANGWON_KEY_URLENC 없음 (.env 확인)"
DEC_KEY = unquote(ENC_KEY)

# 저장 경로
OUT = "/content/drive/MyDrive/data_j/step3_arrivals"
os.makedirs(OUT, exist_ok=True)

def extract_items(js):
    """
    TAGO 응답에서 items를 안전하게 꺼낸다.
    - body.items == "" → []
    - body.items.item == dict → [dict]
    - body.items.item == list → list
    """
    try:
        if str(js["response"]["header"].get("resultCode")) != "00":
            return []
        body = js["response"]["body"]
    except Exception:
        return []
    items = body.get("items", None)
    if items in (None, "", []): return []
    if isinstance(items, list):  return items
    if isinstance(items, dict):
        it = items.get("item", None)
        if it is None: return []
        return it if isinstance(it, list) else [it]
    return []

def now_kst_iso():
    return datetime.now(tz=KST).isoformat(timespec="seconds")

### “샘플링 대상 정류소” 고르기 (전송량 최소로 핵심 허브 중심)

In [22]:
# 전 단계에서 만든 표준 CSV를 사용
BASE_DIR = "/content/drive/MyDrive/data_j/raw/tago"
stops_std = pd.read_csv(f"{BASE_DIR}/stops_std.csv", dtype=str)
routes_std = pd.read_csv(f"{BASE_DIR}/routes_std.csv", dtype=str)
route_stops_std = pd.read_csv(f"{BASE_DIR}/route_stops_std.csv", dtype=str)

# 정류소별 '경유 노선 수' 계산
deg = (route_stops_std.groupby("station_id")["route_id"]
       .nunique().reset_index(name="route_degree"))

# 허브 상위 N개 선정 (필요시 수 조정)
N_HUBS = 20
hubs = (deg.sort_values("route_degree", ascending=False)
            .head(N_HUBS)
            .merge(stops_std[["station_id","station_name","lat","lon"]], on="station_id", how="left"))

hubs.to_csv(f"{OUT}/sampling_hubs.csv", index=False)
hubs.head(10)

Unnamed: 0,station_id,route_degree,station_name,lat,lon
0,CWB379001394,64,문화동,35.18424623,128.5602673
1,CWB379001392,63,문화동,35.18458043,128.5602312
2,CWB379001246,62,경남데파트,35.20119382,128.5673133
3,CWB379001224,61,KT마산점,35.20299298,128.57082208
4,CWB379001284,58,마산합포구청.의료원,35.19785457,128.5670795
5,CWB379001376,58,연세병원,35.18717617,128.5615655
6,CWB379001331,57,중부경찰서,35.19256096,128.5650385
7,CWB379001228,56,KT마산점,35.20283824,128.5703212
8,CWB379001328,56,중부경찰서,35.1929213,128.5650943
9,CWB379001344,55,반월민원센터,35.19041961,128.5630911


### 도착정보 1회 호출 함수 (강건성 + 키/파라미터 안전)

In [23]:
def get_arrivals_once(city_code: str, node_id: str):
    """
    정류소 기준 도착예정 목록
    - 엔드포인트: ArvlInfoInqireService/getSttnAcctoArvlPrearngeInfoList
    - 반환: list[dict] (키는 소문자 변환)
    """
    url = f"{BASE}/ArvlInfoInqireService/getSttnAcctoArvlPrearngeInfoList"
    p = {"serviceKey": DEC_KEY, "_type": "json", "cityCode": city_code, "nodeId": node_id}
    r = requests.get(url, params=p, timeout=20)
    items = extract_items(r.json())

    # 키 소문자 변환 + station_id, ts 추가
    ts = now_kst_iso()
    out = []
    for it in items:
        row = {str(k).lower(): v for k, v in it.items()}
        row["station_id"] = node_id
        row["ts"] = ts
        out.append(row)
    return out

### 파일럿 샘플링 루프 (5~10분, 30초 간격)

In [24]:
import itertools

# 샘플링 설정
TARGET_NODES = hubs["station_id"].astype(str).tolist()  # 허브 상위 N
INTERVAL_SEC = 30
DURATION_MIN = 8

rows = []
start = datetime.now(tz=KST)
end_at = start.timestamp() + DURATION_MIN * 60

print(f"샘플링 시작: {start.isoformat()} / 대상 정류소 수={len(TARGET_NODES)}")
iter_count = 0

while datetime.now(tz=KST).timestamp() < end_at:
    ts = now_kst_iso()
    for nid in TARGET_NODES:
        try:
            items = get_arrivals_once(CITY_CODE, nid)
            for it in items:
                rows.append(it)
        except Exception as e:
            # 일시 오류는 무시하고 계속
            pass
        time.sleep(0.1)  # 매너 타임(순차 호출 간)
    iter_count += 1
    print(f"[{ts}] 라운드 {iter_count} 완료 - 누적 {len(rows)} 행")
    # 라운드 간 대기
    time.sleep(max(0, INTERVAL_SEC - 0.1 * len(TARGET_NODES)))

# 저장
raw_df = pd.DataFrame(rows)
raw_path = f"{OUT}/arrivals_raw_{datetime.now(tz=KST).strftime('%Y%m%d_%H%M%S')}.csv"
raw_df.to_csv(raw_path, index=False)
raw_df.to_csv(raw_path.replace(".csv", "_utf8sig.csv"), index=False, encoding="utf-8-sig")
print("샘플링 저장:", raw_path, raw_df.shape)
raw_df.head(10)

샘플링 시작: 2025-09-07T21:11:27.237009+09:00 / 대상 정류소 수=20
[2025-09-07T21:11:27+09:00] 라운드 1 완료 - 누적 198 행
[2025-09-07T21:12:15+09:00] 라운드 2 완료 - 누적 395 행
[2025-09-07T21:12:55+09:00] 라운드 3 완료 - 누적 595 행
[2025-09-07T21:13:36+09:00] 라운드 4 완료 - 누적 795 행
[2025-09-07T21:14:32+09:00] 라운드 5 완료 - 누적 994 행
[2025-09-07T21:15:12+09:00] 라운드 6 완료 - 누적 1194 행
[2025-09-07T21:15:51+09:00] 라운드 7 완료 - 누적 1394 행
[2025-09-07T21:16:31+09:00] 라운드 8 완료 - 누적 1593 행
[2025-09-07T21:17:11+09:00] 라운드 9 완료 - 누적 1789 행
[2025-09-07T21:17:50+09:00] 라운드 10 완료 - 누적 1989 행
[2025-09-07T21:18:30+09:00] 라운드 11 완료 - 누적 2189 행
[2025-09-07T21:19:09+09:00] 라운드 12 완료 - 누적 2389 행
샘플링 저장: /content/drive/MyDrive/data_j/step3_arrivals/arrivals_raw_20250907_211949.csv (2389, 10)


Unnamed: 0,arrprevstationcnt,arrtime,nodeid,nodenm,routeid,routeno,routetp,vehicletp,station_id,ts
0,33,2125,CWB379001394,문화동,CWB379000640,64,지선버스,일반차량,CWB379001394,2025-09-07T21:11:28+09:00
1,2,26,CWB379001394,문화동,CWB379000700,70,지선버스,저상버스,CWB379001394,2025-09-07T21:11:28+09:00
2,11,898,CWB379001394,문화동,CWB379000710,71,지선버스,저상버스,CWB379001394,2025-09-07T21:11:28+09:00
3,2,170,CWB379001394,문화동,CWB379001600,160,간선버스,저상버스,CWB379001394,2025-09-07T21:11:28+09:00
4,7,409,CWB379001394,문화동,CWB379002530,253,지선버스,저상버스,CWB379001394,2025-09-07T21:11:28+09:00
5,19,1153,CWB379001394,문화동,CWB379002630,263,지선버스,일반차량,CWB379001394,2025-09-07T21:11:28+09:00
6,1,8,CWB379001394,문화동,CWB379002770,277,지선버스,일반차량,CWB379001394,2025-09-07T21:11:28+09:00
7,10,1114,CWB379001394,문화동,CWB379007200,720,좌석버스,일반차량,CWB379001394,2025-09-07T21:11:28+09:00
8,20,1140,CWB379001392,문화동,CWB379000200,20,지선버스,저상버스,CWB379001392,2025-09-07T21:11:29+09:00
9,31,1504,CWB379001392,문화동,CWB379000220,22,지선버스,저상버스,CWB379001392,2025-09-07T21:11:29+09:00


### 헤드웨이 이벤트 검출 → 평균/표준편차/정시성(CV)

In [27]:
import pandas as pd, numpy as np

raw_df = pd.read_csv("/content/drive/MyDrive/data_j/step3_arrivals/arrivals_raw_20250907_211949.csv", dtype=str, parse_dates=["ts"])

def safe_to_float(s):
    try: return float(s)
    except: return np.nan

# 규칙별 플래그
raw_df["arrtime_f"] = raw_df["arrtime"].map(safe_to_float)
raw_df["arrprev_f"] = raw_df["arrprevstationcnt"].map(safe_to_float)

raw_df["flag_prev0"] = (raw_df["arrprev_f"] == 0)
raw_df["flag_arr60"] = (raw_df["arrtime_f"] <= 60)

print("총 행:", len(raw_df))
print("arrprev==0 비율:", raw_df["flag_prev0"].mean())
print("arrtime<=60 비율:", raw_df["flag_arr60"].mean())

# (정류소, 노선)별로 이벤트 카운트
evt_df = (raw_df[ raw_df["flag_prev0"] | raw_df["flag_arr60"] ]
          .groupby(["station_id","routeid"]).size().reset_index(name="n_events"))
evt_df = evt_df.sort_values("n_events", ascending=False)
evt_df.head(10)

총 행: 2389
arrprev==0 비율: 0.0
arrtime<=60 비율: 0.06111343658434491


Unnamed: 0,station_id,routeid,n_events
17,CWB379001139,CWB379000620,5
15,CWB379001114,CWB379001630,5
39,CWB379001331,CWB379000700,5
20,CWB379001173,CWB379000620,4
22,CWB379001228,CWB379000510,4
25,CWB379001228,CWB379001000,4
24,CWB379001228,CWB379000700,4
43,CWB379001331,CWB379001600,4
31,CWB379001312,CWB379000700,4
27,CWB379001228,CWB379001070,4


### 보정안 A: “리셋” 감지 규칙 완화 (+차이 임계값 상향)

In [28]:
import math, numpy as np
import pandas as pd

def detect_events_per_group_relaxed(df_g):
    # 시간 정렬
    g = df_g.sort_values("ts").copy()

    # 안전 변환
    g["arrtime_f"] = g["arrtime"].map(safe_to_float)
    g["arrprev_f"] = g["arrprevstationcnt"].map(safe_to_float)

    # 1) 직접 이벤트(도착 직전) 신호
    conds = []
    if "arrprevstationcnt" in g.columns:
        conds.append(g["arrprev_f"] == 0)
    if "arrtime" in g.columns:
        conds.append(g["arrtime_f"] <= 60)
    direct_evt = np.logical_or.reduce(conds) if conds else np.array([False]*len(g))

    # 2) 리셋(사이클 전환) 신호
    #   arrtime가 이전보다 +300초 이상 증가(새 차량 사이클 시작 추정)
    #   arrprev가 이전보다 +3 이상 증가(뒤차의 값으로 점프)
    arrtime_diff = g["arrtime_f"].diff()
    arrprev_diff = g["arrprev_f"].diff()
    reset_evt = ((arrtime_diff > 300) | (arrprev_diff > 3)).fillna(False)

    # 최종 이벤트 후보: direct OR reset
    flags = (direct_evt | reset_evt).to_numpy()
    event_ts = pd.to_datetime(g.loc[flags, "ts"]).tolist()

    # 인접 이벤트 간 시간차 → headway
    headways = []
    for a, b in zip(event_ts, event_ts[1:]):
        delta = (b - a).total_seconds()
        # 완화: 90초~7200초(2시간) 범위만 유효
        if 90 <= delta <= 7200:
            headways.append(delta)
    return event_ts, headways

def compute_headway_metrics_relaxed(raw_df):
    if raw_df.empty:
        return pd.DataFrame(columns=[
            "station_id","route_id","n_events","n_headways",
            "headway_mean_s","headway_std_s","headway_cv"
        ])
    df = raw_df.copy()
    if "routeid" in df.columns and "route_id" not in df.columns:
        df["route_id"] = df["routeid"]

    # 그룹핑
    out_rows=[]
    for (sid, rid), g in df.groupby(["station_id","route_id"], dropna=True):
        ev_ts, headways = detect_events_per_group_relaxed(g)
        if len(headways) == 0:
            continue
        mean_s = float(np.mean(headways))
        std_s  = float(np.std(headways, ddof=1)) if len(headways) > 1 else np.nan
        cv     = (std_s/mean_s) if (not math.isnan(std_s) and mean_s>0) else np.nan
        out_rows.append({
            "station_id": sid, "route_id": rid,
            "n_events": len(ev_ts), "n_headways": len(headways),
            "headway_mean_s": round(mean_s,1),
            "headway_std_s": round(std_s,1) if not math.isnan(std_s) else np.nan,
            "headway_cv": round(cv,3) if cv==cv else np.nan
        })
    return pd.DataFrame(out_rows)

agg_relaxed = compute_headway_metrics_relaxed(raw_df)
agg_relaxed_path = "/content/drive/MyDrive/data_j/step3_arrivals/headway_station_route_relaxed_20250907_211949.csv"
agg_relaxed.to_csv(agg_relaxed_path, index=False)
agg_relaxed.to_csv(agg_relaxed_path.replace(".csv","_utf8sig.csv"), index=False, encoding="utf-8-sig")
print("헤드웨이(완화) 요약:", agg_relaxed.shape)
display(agg_relaxed.sort_values("headway_cv").head(10))

헤드웨이(완화) 요약: (3, 7)


Unnamed: 0,station_id,route_id,n_events,n_headways,headway_mean_s,headway_std_s,headway_cv
0,CWB379001228,CWB379001000,5,1,317.0,,
1,CWB379001312,CWB379001000,3,1,414.0,,
2,CWB379001331,CWB379001000,2,1,459.0,,


### 보정안 B: “분 단위 롤링 최소값” 기반(폴링 한계 보완)

In [29]:
def compute_headway_bucketed(raw_df, bucket_sec=60):
    df = raw_df.copy()
    df["route_id"]  = df.get("route_id", df.get("routeid"))
    df["ts_dt"]     = pd.to_datetime(df["ts"])
    df["bucket"]    = (df["ts_dt"].astype("int64")//10**9 // bucket_sec).astype(int)

    rows=[]
    for (sid, rid, b), g in df.groupby(["station_id","route_id","bucket"], dropna=True):
        # 이 버킷에서 arrtime 가장 작은 순간을 대표 이벤트로 선택
        g["arrtime_f"] = g["arrtime"].map(safe_to_float)
        idx = g["arrtime_f"].idxmin()
        rows.append({
            "station_id": sid, "route_id": rid,
            "bucket": b, "evt_ts": g.loc[idx, "ts_dt"],
            "arrtime_min": g.loc[idx, "arrtime_f"]
        })
    rep = pd.DataFrame(rows)
    if rep.empty:
        return pd.DataFrame(columns=["station_id","route_id","headway_mean_s","headway_std_s","headway_cv","n_headways"])

    out=[]
    for (sid, rid), g in rep.groupby(["station_id","route_id"]):
        g = g.sort_values("evt_ts")
        deltas = np.diff(g["evt_ts"]).astype("timedelta64[s]").astype(float)
        # 품질필터: 60~7200초
        deltas = [d for d in deltas if 60 <= d <= 7200]
        if len(deltas)==0:
            continue
        mean_s = float(np.mean(deltas))
        std_s  = float(np.std(deltas, ddof=1)) if len(deltas)>1 else np.nan
        cv     = (std_s/mean_s) if (not math.isnan(std_s) and mean_s>0) else np.nan
        out.append({
            "station_id": sid, "route_id": rid, "n_headways": len(deltas),
            "headway_mean_s": round(mean_s,1),
            "headway_std_s": round(std_s,1) if not math.isnan(std_s) else np.nan,
            "headway_cv": round(cv,3) if cv==cv else np.nan
        })
    return pd.DataFrame(out)

agg_bucket = compute_headway_bucketed(raw_df, bucket_sec=60)
agg_bucket_path = "/content/drive/MyDrive/data_j/step3_arrivals/headway_station_route_bucketed_20250907_211949.csv"
agg_bucket.to_csv(agg_bucket_path, index=False)
agg_bucket.to_csv(agg_bucket_path.replace(".csv","_utf8sig.csv"), index=False, encoding="utf-8-sig")
print("헤드웨이(버킷법) 요약:", agg_bucket.shape)
display(agg_bucket.sort_values("headway_cv").head(10))

헤드웨이(버킷법) 요약: (223, 6)


Unnamed: 0,station_id,route_id,n_headways,headway_mean_s,headway_std_s,headway_cv
86,CWB379001228,CWB379002580,2,79.0,0.0,0.0
199,CWB379001394,CWB379000220,2,79.0,0.0,0.0
203,CWB379001394,CWB379000720,2,79.0,0.0,0.0
204,CWB379001394,CWB379001030,2,79.0,0.0,0.0
205,CWB379001394,CWB379001050,2,79.0,0.0,0.0
208,CWB379001394,CWB379002580,2,79.0,0.0,0.0
162,CWB379001344,CWB379002580,2,79.0,0.0,0.0
163,CWB379001344,CWB379002590,2,79.0,0.0,0.0
15,CWB379001031,CWB379000500,3,79.3,0.6,0.007
0,CWB379000951,CWB379000200,3,79.7,0.6,0.007


### 정류소 레벨로 집계

In [30]:
use = agg_relaxed if len(agg_relaxed) >= len(agg_bucket) else agg_bucket
station_agg = (use.groupby("station_id")
               .agg(headway_mean_s=("headway_mean_s","mean"),
                    headway_cv=("headway_cv","mean"),
                    n_routes=("route_id","nunique"),
                    n_pairs=("n_headways","sum"))
               .reset_index())
st_out = "/content/drive/MyDrive/data_j/step3_arrivals/headway_station_20250907_211949.csv"
station_agg.to_csv(st_out, index=False)
station_agg.to_csv(st_out.replace(".csv","_utf8sig.csv"), index=False, encoding="utf-8-sig")
print("정류소 집계 저장:", st_out, station_agg.shape)
station_agg.head(10)

정류소 집계 저장: /content/drive/MyDrive/data_j/step3_arrivals/headway_station_20250907_211949.csv (20, 5)


Unnamed: 0,station_id,headway_mean_s,headway_cv,n_routes,n_pairs
0,CWB379000951,81.2,0.0367,10,30
1,CWB379001031,81.01,0.0385,10,30
2,CWB379001107,82.436364,0.0369,11,30
3,CWB379001114,93.125,0.197889,12,26
4,CWB379001139,82.309091,0.0371,11,29
5,CWB379001173,82.590909,0.0389,11,29
6,CWB379001224,82.581818,0.0794,11,30
7,CWB379001228,116.430769,0.103714,13,26
8,CWB379001246,82.772727,0.0841,11,30
9,CWB379001284,82.318182,0.079,11,30


### 산업단지 SHP(경계도면)

In [32]:
!pip -q install geopandas shapely pyproj fiona pyogrio chardet

import os, chardet, geopandas as gpd

DRIVE_BASE = "/content/drive/MyDrive"
SHP_PATH   = f"{DRIVE_BASE}/data_j/external/dam_yuch.shp"
OUT_DIR    = f"{DRIVE_BASE}/data_j/industrial_parks"
os.makedirs(OUT_DIR, exist_ok=True)

def read_shp_robust(path):
    """
    1) .cpg가 있으면 먼저 그 인코딩 사용
    2) 없거나 실패하면 chardet으로 .dbf 샘플 인코딩 추정
    3) pyogrio 엔진 우선 → 실패하면 fiona 엔진으로 재시도
    4) 후보 인코딩 리스트 순차 재시도
    """
    base = os.path.splitext(path)[0]
    cpg_path = base + ".cpg"
    dbf_path = base + ".dbf"

    enc_candidates = []

    # (1) .cpg 힌트
    if os.path.exists(cpg_path):
        try:
            with open(cpg_path, "rb") as f:
                hint = f.read().decode("ascii", errors="ignore").strip().lower()
            # 흔한 표기들 통일
            if "euckr" in hint or "euc-kr" in hint:
                enc_candidates.append("euckr")
            elif "cp949" in hint or "ks_c_5601" in hint or "ksc5601" in hint:
                enc_candidates.append("cp949")
            elif "utf" in hint:
                enc_candidates.append("utf-8")
        except Exception:
            pass

    # (2) chardet으로 dbf 헤더 샘플 추정
    if os.path.exists(dbf_path):
        try:
            with open(dbf_path, "rb") as f:
                raw = f.read(4096)  # 샘플만
            det = chardet.detect(raw)
            if det and det.get("encoding"):
                enc_candidates.append(det["encoding"].lower())
        except Exception:
            pass

    # (3) 일반적인 후보들 추가
    enc_candidates += ["utf-8", "cp949", "euckr", "latin1"]

    # 중복 제거(순서 유지)
    seen=set(); enc_candidates = [e for e in enc_candidates if not (e in seen or seen.add(e))]

    last_err = None
    for engine in ["pyogrio", "fiona"]:
        for enc in enc_candidates:
            try:
                gdf = gpd.read_file(path, engine=engine, encoding=enc)
                print(f"✅ read_file 성공: engine={engine}, encoding={enc}, rows={len(gdf)}")
                return gdf
            except Exception as e:
                last_err = e
                # 다음 후보로
                continue
    raise last_err if last_err else RuntimeError("SHP 로드 실패")

gdf_raw = read_shp_robust(SHP_PATH)
print("CRS:", gdf_raw.crs)
print("Columns:", list(gdf_raw.columns)[:12])
gdf_raw.head(2)

✅ read_file 성공: engine=pyogrio, encoding=cp949, rows=17017
CRS: PROJCS["KGD2002_Central_Belt_2010 (deprecated)",GEOGCS["KGD2002",DATUM["Korean_Geodetic_Datum_2002",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6737"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4737"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",38],PARAMETER["central_meridian",127],PARAMETER["scale_factor",1],PARAMETER["false_easting",200000],PARAMETER["false_northing",600000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["ESRI","102082"]]
Columns: ['DAN_ID', 'UPJ_ID', 'UPJ_CODE', 'UPJ2', 'UPJ3', 'UPJ4', 'UPJ5', 'UPJ6', 'CRE_DATE', 'UPD_DATE', 'GOSI', 'geometry']


Unnamed: 0,DAN_ID,UPJ_ID,UPJ_CODE,UPJ2,UPJ3,UPJ4,UPJ5,UPJ6,CRE_DATE,UPD_DATE,GOSI,geometry
0,243980,26000,26,26,C,26,C26,"C26.전자부품,컴퓨터,영상,음향및 통신장비 제조업",20211213,,충청북도 고시 제2021-330호,"POLYGON ((240436.62 482610.885, 240433.244 482..."
1,243980,21000,21,21,C,21,C21,C21.의료용 물질 및 의약품 제조업,20211213,,충청북도 고시 제2021-330호,"POLYGON ((240095.112 481877.498, 240096.825 48..."


In [34]:
# 컬럼 표준화 + 좌표계 통일 + 면적 계산
import geopandas as gpd, pandas as pd
import re

# gdf_raw 는 이미 read_file 성공한 객체 (engine=pyogrio, encoding=cp949)
gdf = gdf_raw.copy()

# 1) 컬럼명 소문자/언더스코어로 통일
gdf.columns = [re.sub(r"\s+","_", c.strip().lower()) for c in gdf.columns]

# 2) 우리가 보유한 컬럼에서 최소 표준 필드 만들기
#   - park_id : dan_id
#   - (단지명 없음) → park_name은 임시로 None
#   - (단지유형 없음) → park_type_code / park_type 은 생성하지 않음
if "dan_id" in gdf.columns:
    gdf["park_id"] = gdf["dan_id"].astype(str)
else:
    raise ValueError("DAN_ID(=dan_id) 컬럼이 없어 park_id를 만들 수 없습니다.")

# 3) 좌표계/면적 계산
#    현재 CRS: ESRI:102082 (단위: metre) → 면적 계산에 바로 사용 가능
if gdf.crs is None:
    # 안전장치: 없을 경우 미터계로 가정
    gdf = gdf.set_crs("ESRI:102082", allow_override=True)

gdf_m = gdf.to_crs(gdf.crs)          # 그대로(미터계)
gdf_w = gdf.to_crs(epsg=4326)        # 시각화/조인을 위한 WGS84

gdf_m["area_m2"] = gdf_m.geometry.area
gdf_w["area_m2"] = gdf_m["area_m2"].values

print("rows:", len(gdf_w), "| crs:", gdf_w.crs)
print("sample cols:", [c for c in gdf_w.columns if c!='geometry'][:12])
gdf_w[["park_id","upj_code","gosi","area_m2"]].head()

rows: 17017 | crs: EPSG:4326
sample cols: ['dan_id', 'upj_id', 'upj_code', 'upj2', 'upj3', 'upj4', 'upj5', 'upj6', 'cre_date', 'upd_date', 'gosi', 'park_id']


Unnamed: 0,park_id,upj_code,gosi,area_m2
0,243980,26,충청북도 고시 제2021-330호,7584.053389
1,243980,21,충청북도 고시 제2021-330호,47420.354136
2,243980,21,충청북도 고시 제2021-330호,19812.349713
3,243980,28,충청북도 고시 제2021-330호,51215.687425
4,244780,29,천안시 고시 제2021-431호,47117.691311


In [36]:
# “창원만” 추출(이름 대신 공간 필터만 사용)
import pandas as pd
from shapely.geometry import box
import geopandas as gpd

# 정류소 범위 기반 BBox (이미 우리가 만든 stops_std 활용)
DRIVE_BASE = "/content/drive/MyDrive"
stops_std = pd.read_csv(f"{DRIVE_BASE}/data_j/raw/tago/stops_std.csv", dtype=str)
stops_std["lat"] = stops_std["lat"].astype(float)
stops_std["lon"] = stops_std["lon"].astype(float)
lat_min, lat_max = stops_std["lat"].min(), stops_std["lat"].max()
lon_min, lon_max = stops_std["lon"].min(), stops_std["lon"].max()

pad = 0.02
bbox = box(lon_min-pad, lat_min-pad, lon_max+pad, lat_max+pad)
bbox_gdf = gpd.GeoDataFrame(geometry=[bbox], crs="EPSG:4326")

# gdf_w 는 방금 만든 WGS84 버전
mask_geom = gdf_w.geometry.intersects(bbox_gdf.iloc[0].geometry)
gdf_cw = gdf_w[mask_geom].copy()

# park_id 기준 중복 제거
gdf_cw = gdf_cw.drop_duplicates(subset=["park_id"])

print("창원 후보 수:", len(gdf_cw))
gdf_cw[["park_id","upj_code","area_m2"]].head(10)

창원 후보 수: 68


Unnamed: 0,park_id,upj_code,area_m2
253,248950,25,15312.244
281,248890,29,26449.709921
316,448050,D,9910.096613
357,248AW0,23,38273.322904
370,248830,25,2384.056036
399,448110,25,96.817873
694,248110,28,28471.201069
1235,348010,J,9735.193906
1291,248080,25,37547.925249
1674,448400,D,7992.468296


In [37]:
OUT_DIR = f"{DRIVE_BASE}/data_j/industrial_parks"

# 전체/창원 저장
gdf_w.to_file(f"{OUT_DIR}/industrial_parks_all.gpkg", layer="industrial_parks", driver="GPKG")
gdf_w.to_file(f"{OUT_DIR}/industrial_parks_all.geojson", driver="GeoJSON")

gdf_cw.to_file(f"{OUT_DIR}/industrial_parks_cw.gpkg", layer="industrial_parks_cw", driver="GPKG")
gdf_cw.to_file(f"{OUT_DIR}/industrial_parks_cw.geojson", driver="GeoJSON")

# 센트로이드 CSV (단지명 없이 park_id / 좌표 / 면적)
cent = gdf_cw.copy()
cent["centroid"] = cent.geometry.centroid
cent["lat"] = cent["centroid"].y
cent["lon"] = cent["centroid"].x
cent[["park_id","upj_code","area_m2","lat","lon"]].to_csv(f"{OUT_DIR}/industrial_parks_cw_centroids.csv", index=False)
cent[["park_id","upj_code","area_m2","lat","lon"]].to_csv(f"{OUT_DIR}/industrial_parks_cw_centroids_utf8sig.csv", index=False, encoding="utf-8-sig")

print("저장 완료:", OUT_DIR)

저장 완료: /content/drive/MyDrive/data_j/industrial_parks



  cent["centroid"] = cent.geometry.centroid


In [39]:
import geopandas as gpd

DRIVE_BASE = "/content/drive/MyDrive"
OUT_DIR    = f"{DRIVE_BASE}/data_j/industrial_parks"

# 창원 subset 불러오기
gdf_cw = gpd.read_file(f"{OUT_DIR}/industrial_parks_cw.geojson")

# 1) park_id 단위 dissolve (투영 좌표계에서 수행)
gdf_cw_5179 = gdf_cw.to_crs(epsg=5179)
gdf_pk = gdf_cw_5179.dissolve(by="park_id", as_index=False)

# 2) 면적 + 정확한 센트로이드(미터계에서 계산)
gdf_pk["area_m2"] = gdf_pk.geometry.area
cent_gs_5179 = gdf_pk.geometry.centroid       # GeoSeries (geometry dtype)

# 3) 센트로이드를 WGS84 좌표로 변환해서 숫자 컬럼으로 저장
cent_gs_4326 = cent_gs_5179.to_crs(4326)
gdf_pk["cent_lat"] = cent_gs_4326.y
gdf_pk["cent_lon"] = cent_gs_4326.x

# (옵션) 센트로이드 지오메트를 보존하고 싶다면 WKT로 저장
gdf_pk["centroid_wkt"] = cent_gs_5179.to_wkt()  # 문자열 컬럼 (geometry 아님)

# 4) geometry 컬럼은 폴리곤 하나만 유지해야 함 → 'centroid' 같은 geometry 컬럼 절대 남기지 않기
#    (우린 cent_gs_*를 직접 컬럼에 대입하지 않았으므로 geometry는 1개뿐)
gdf_pk = gdf_pk.set_geometry("geometry")

# 5) 저장 시 좌표계 WGS84로 변환
gdf_pk_4326 = gdf_pk.to_crs(4326)

# 저장 (단 한 개의 geometry 컬럼만 보유)
gdf_pk_4326.to_file(f"{OUT_DIR}/industrial_parks_cw_parklevel.gpkg",
                    layer="industrial_parks_cw_parklevel", driver="GPKG")
gdf_pk_4326.to_file(f"{OUT_DIR}/industrial_parks_cw_parklevel.geojson",
                    driver="GeoJSON")

cols = ["park_id","area_m2","cent_lat","cent_lon","centroid_wkt"]
gdf_pk_4326[cols].to_csv(f"{OUT_DIR}/industrial_parks_cw_parklevel.csv", index=False)
gdf_pk_4326[cols].to_csv(f"{OUT_DIR}/industrial_parks_cw_parklevel_utf8sig.csv",
                         index=False, encoding="utf-8-sig")

print("저장 완료(park-level):", OUT_DIR, "| 단지 수:", len(gdf_pk_4326))

저장 완료(park-level): /content/drive/MyDrive/data_j/industrial_parks | 단지 수: 68


# 창원시 공공데이터 공모전