# Spatial mapping: crosswalks_passenger_unique × passenger_accident

This notebook maps pedestrian-accident hotspot points to nearby crosswalk facilities using geographic distance.

**Outputs**
1. `crosswalks_with_accidents_100m.csv`: one row per crosswalk, with accident stats aggregated within a radius.
2. `accidents_nearest_crosswalk.csv`: one row per accident, matched to its nearest crosswalk (with distance).

You can adjust the radius and output filenames in the configuration cell below.


In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

# -----------------------------
# Configuration
# -----------------------------
# Input tables: crosswalk facilities and pedestrian-accident hotspot points.
CROSSWALK_CSV = '../cw.csv'
ACCIDENT_CSV  = '../acc_hotspot.csv'

# Radius for "nearby accidents" aggregation
RADIUS_M = 100  # meters

# Output files. The aggregate filename is derived from RADIUS_M so that
# re-running with a different radius neither mislabels the result nor
# overwrites the file produced for another radius.
OUT_CW_AGG = f'crosswalks_with_accidents_{RADIUS_M}m.csv'
OUT_ACC_NN = 'accidents_nearest_crosswalk.csv'

# Encoding candidates, tried in this order (files sometimes come in cp949 / euc-kr)
ENCODINGS = ['utf-8', 'cp949', 'euc-kr']


In [2]:
def read_csv_auto(path, encodings=ENCODINGS, **kwargs):
    """Read a CSV file, trying each candidate text encoding in order.

    Parameters
    ----------
    path
        Passed through unchanged to ``pd.read_csv``.
    encodings : iterable of str
        Encoding names to try, in priority order.
    **kwargs
        Forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with the first encoding that decodes without error.

    Raises
    ------
    ValueError
        If ``encodings`` is empty.
    UnicodeError
        The last decoding error, if no candidate encoding works.

    Any other exception raised by ``pd.read_csv`` (missing file, parser
    error, bad keyword argument, ...) propagates immediately: retrying with a
    different encoding cannot fix it and would only hide the real cause.
    """
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **kwargs)
        except UnicodeError as e:
            # Wrong encoding guess: remember the failure and try the next candidate.
            last_err = e
    if last_err is None:
        # The loop never ran; `raise None` would surface as a confusing TypeError.
        raise ValueError('No encodings were provided to read_csv_auto.')
    raise last_err

# Load both input tables; read_csv_auto works out which encoding applies.
cw = read_csv_auto(path=CROSSWALK_CSV)
acc = read_csv_auto(path=ACCIDENT_CSV)

# Report the (rows, columns) shape of each table.
for label, frame in (('crosswalks:', cw), ('accidents :', acc)):
    print(label, frame.shape)

# Preview the first crosswalk rows as the cell's displayed output.
cw.head()

crosswalks: (751439, 17)
accidents : (1984, 12)


Unnamed: 0,cw_uid,sido_code,district_code,address,crosswalk_lat,crosswalk_lon,crosswalk_type,lane_count,crosswalk_width,crosswalk_length,is_highland,has_ped_signal,has_ped_button,has_ped_sound,has_bump,has_braille_block,has_spotlight
0,CW_000001,5200000000,5213000000,전북특별자치도 군산시 영화동 6-8,35.989238,126.712927,1.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CW_000002,5200000000,5213000000,전북특별자치도 군산시 영화동 6-8,35.989238,126.712927,1.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CW_000003,5200000000,5213000000,전북특별자치도 군산시 영화동 6-8,35.989238,126.712927,1.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CW_000004,5200000000,5213000000,전북특별자치도 군산시 영화동 6-8,35.989238,126.712927,1.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CW_000005,5200000000,5213000000,전북특별자치도 군산시 영화동 27-36,35.988187,126.710793,1.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Column name normalization (supports both standard & prefixed variants)
def pick_col(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given, so earlier names take
    priority. Returns ``None`` when none of them is present.
    """
    available = df.columns
    return next((name for name in candidates if name in available), None)

# Resolve the coordinate columns of each table. The dataset-specific name is
# tried first, then generic English names, then the Korean headers
# ('위도' = latitude, '경도' = longitude).
cw_lat_col, cw_lon_col = (
    pick_col(cw, names)
    for names in (
        ['crosswalk_lat', 'lat', 'latitude', '위도'],
        ['crosswalk_lon', 'lon', 'longitude', '경도'],
    )
)
acc_lat_col, acc_lon_col = (
    pick_col(acc, names)
    for names in (
        ['accident_lat', 'lat', 'latitude', '위도'],
        ['accident_lon', 'lon', 'longitude', '경도'],
    )
)

# Identifier columns are optional: they may resolve to None without failing.
cw_id_col = pick_col(cw, ['crosswalk_id', 'id','cw_uid'])
acc_id_col = pick_col(acc, ['accident_id', 'id'])

# Only the four coordinate columns are mandatory for the spatial matching.
required = dict(
    cw_lat=cw_lat_col,
    cw_lon=cw_lon_col,
    acc_lat=acc_lat_col,
    acc_lon=acc_lon_col,
)
missing = [key for key, resolved in required.items() if resolved is None]
if missing:
    # List the available headers so the candidate lists above are easy to extend.
    problem = (
        f'Missing required coordinate columns: {missing}\n'
        f'Crosswalk columns: {list(cw.columns)}\n'
        f'Accident columns: {list(acc.columns)}'
    )
    raise ValueError(problem)

# Echo the resolved column names for the reader.
print('Using columns:')
for label, *chosen in (
    ('  crosswalk lat/lon:', cw_lat_col, cw_lon_col),
    ('  accident  lat/lon:', acc_lat_col, acc_lon_col),
    ('  crosswalk id     :', cw_id_col),
    ('  accident  id     :', acc_id_col),
):
    print(label, *chosen)


Using columns:
  crosswalk lat/lon: crosswalk_lat crosswalk_lon
  accident  lat/lon: accident_lat accident_lon
  crosswalk id     : cw_uid
  accident  id     : accident_id


In [4]:
# Clean coordinates
def to_float(s):
    """Convert ``s`` to numeric values; entries that cannot be parsed become NaN."""
    numeric = pd.to_numeric(s, errors='coerce')
    return numeric

# Coerce the coordinate columns to numeric on fresh copies of both tables;
# values that cannot be parsed become NaN.
cw_coord_cols = [cw_lat_col, cw_lon_col]
acc_coord_cols = [acc_lat_col, acc_lon_col]

cw = cw.assign(**{name: to_float(cw[name]) for name in cw_coord_cols})
acc = acc.assign(**{name: to_float(acc[name]) for name in acc_coord_cols})

# Keep only rows with both coordinates present. The index is reset so that
# row labels equal row positions in the *_valid tables.
cw_valid = cw.dropna(subset=cw_coord_cols).reset_index(drop=True)
acc_valid = acc.dropna(subset=acc_coord_cols).reset_index(drop=True)

for label, frame in (('valid crosswalks:', cw_valid), ('valid accidents :', acc_valid)):
    print(label, frame.shape)

# The spatial queries need at least one usable point on each side.
if acc_valid.empty or cw_valid.empty:
    raise ValueError('No valid coordinate rows found in one of the datasets.')


valid crosswalks: (751436, 17)
valid accidents : (1984, 12)


In [5]:
# Build BallTree with haversine distance (expects [lat, lon] in radians)
# Radius used to convert haversine output (radians) into meters.
EARTH_RADIUS_M = 6371000.0

cw_radians = np.radians(cw_valid[[cw_lat_col, cw_lon_col]].to_numpy())
acc_radians = np.radians(acc_valid[[acc_lat_col, acc_lon_col]].to_numpy())

tree = BallTree(cw_radians, metric='haversine')

# 1) Nearest crosswalk for each accident
# With k=1 the query returns (n_accidents, 1) arrays; take the only column.
dist_rad, idx = tree.query(acc_radians, k=1)
nearest_idx = idx[:, 0]
dist_m = dist_rad[:, 0] * EARTH_RADIUS_M

# nearest_crosswalk_row is a row position in cw_valid (the table the tree was built from).
acc_nn = acc_valid.assign(
    nearest_crosswalk_row=nearest_idx,
    distance_to_crosswalk_m=dist_m,
)

# Attach crosswalk identifiers/attributes: the ID column (when one was found)
# plus whichever of the descriptive columns exist in the crosswalk table.
cw_key_cols = [] if cw_id_col is None else [cw_id_col]
cw_key_cols += [
    extra
    for extra in ['sido', 'sigungu', 'address', 'doro']
    if extra in cw_valid.columns and extra not in cw_key_cols
]

attach_cols = cw_key_cols + [cw_lat_col, cw_lon_col]
cw_attach = cw_valid[attach_cols].reset_index(drop=True)

# Pick the matched crosswalk row for every accident, then renumber the result
# so the join lines up row-by-row with acc_nn. Crosswalk columns whose names
# clash with accident columns get the '_cw' suffix.
matched_cw = cw_attach.loc[nearest_idx].reset_index(drop=True)
acc_nn = acc_nn.join(matched_cw, rsuffix='_cw')

acc_nn.to_csv(OUT_ACC_NN, index=False, encoding='utf-8-sig')
print('saved:', OUT_ACC_NN)
acc_nn.head()

saved: accidents_nearest_crosswalk.csv


Unnamed: 0,accident_id,year,district_code,detail,accident_count,casualty_count,fatality_count,serious_injury_count,minor_injury_count,reported_injury_count,accident_lon,accident_lat,nearest_crosswalk_row,distance_to_crosswalk_m,cw_uid,address,crosswalk_lat,crosswalk_lon
0,6778887,2022,1111017100,성균관대입구교차로 부근,9,10,1,9,0,0,126.99878,37.58294,66842,4.597525,CW_066843,서울특별시 종로구 명륜2가,37.582899,126.998788
1,6779548,2022,1111013700,낙원상가 부근,8,8,0,8,0,0,126.988435,37.571899,61314,10.363748,CW_061315,서울특별시 종로구 낙원동,37.571856,126.98854
2,6779517,2022,1111011900,세종로180 부근,7,8,0,8,0,0,126.976899,37.569865,54327,17.396111,CW_054328,서울특별시 종로구 세종로,37.569761,126.976751
3,6779511,2022,1111017500,신설동역6번출구 부근,7,8,0,7,1,0,127.023747,37.575111,189356,10.753769,CW_189357,서울특별시 동대문구 신설동,37.575207,127.023763
4,6779131,2022,1111013800,종로2가교차로 부근,7,8,0,7,0,1,126.987315,37.570091,52857,5.809696,CW_052858,서울특별시 종로구 종로2가,37.5701,126.98738


In [7]:
# 2) Aggregate accidents within RADIUS_M of each crosswalk
# The haversine metric works in radians, so express the radius in radians too.
radius_rad = RADIUS_M / EARTH_RADIUS_M

# Query neighbors for each crosswalk (accidents within radius).
# neighbors[i] holds the acc_valid row positions found around crosswalk row i.
acc_tree = BallTree(acc_radians, metric='haversine')
neighbors = acc_tree.query_radius(cw_radians, r=radius_rad)

n_cw = len(neighbors)
neighbor_counts = np.fromiter((len(n) for n in neighbors), dtype=np.int64, count=n_cw)

cw_out = cw_valid.copy()
cw_out['accidents_within_radius'] = neighbor_counts

# Flatten all (crosswalk, accident) pairs once. Each sum below is then a single
# vectorised scatter-add instead of one pandas lookup per crosswalk per column.
pair_cw_pos = np.repeat(np.arange(n_cw), neighbor_counts)
if neighbor_counts.sum() > 0:
    pair_acc_pos = np.concatenate(list(neighbors)).astype(np.intp, copy=False)
else:
    # No pair at all: np.concatenate would fail on an empty sequence.
    pair_acc_pos = np.empty(0, dtype=np.intp)

# Optional: sum accident_count/casualty/fatalities if present
sum_cols = [
    'accident_count', 'casualty_count', 'fatality_count',
    'serious_injury_count', 'minor_injury_count', 'reported_injury_count'
]
for col in sum_cols:
    if col in acc_valid.columns:
        # acc_valid was re-indexed 0..n-1 after dropping invalid rows, so the
        # positions returned by the tree address its rows directly.
        # Missing values count as 0, as before.
        values = acc_valid[col].fillna(0).to_numpy()
        totals = np.zeros(n_cw, dtype=values.dtype)
        # np.add.at accumulates correctly when a crosswalk position repeats.
        np.add.at(totals, pair_cw_pos, values[pair_acc_pos])
        cw_out[f'sum_{col}_within_{RADIUS_M}m'] = totals

# Optional: nearest accident distance for each crosswalk
dist_rad2, idx2 = acc_tree.query(cw_radians, k=1)
cw_out['nearest_accident_distance_m'] = dist_rad2[:, 0] * EARTH_RADIUS_M

cw_out.to_csv(OUT_CW_AGG, index=False, encoding='utf-8-sig')
print('saved:', OUT_CW_AGG)
cw_out[['accidents_within_radius', 'nearest_accident_distance_m']].describe()

saved: crosswalks_with_accidents_100m.csv


Unnamed: 0,accidents_within_radius,nearest_accident_distance_m
count,49563.0,49563.0
mean,0.028953,5446.130056
std,0.248779,7740.102405
min,0.0,2.236776
25%,0.0,1006.053567
50%,0.0,2443.133814
75%,0.0,5932.109512
max,4.0,51252.409779


## Notes
- Distance is computed with the Haversine metric (great-circle distance).
- If you want to map using a different radius (e.g., 50m/200m), just change `RADIUS_M` and re-run.
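- If your dashboard needs one unified mapping table, you can join `accidents_nearest_crosswalk.csv` back onto the crosswalk table using the crosswalk ID column that was detected (`cw_uid` in this dataset; `crosswalk_id` or `id` in files that use those names). If no ID column is available, use `nearest_crosswalk_row`, which is the row position in the cleaned crosswalk table.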
