# 할 일

1. 구별 클러스터링
2. 클러스터별 면적 계산
3. 사고발생건수/면적 계산: 기준 이상만 남기기

In [26]:
import warnings
import pyproj
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium import plugins
from statsmodels.api import OLS
from sklearn.cluster import DBSCAN
from functools import partial
from shapely.ops import transform
from shapely.geometry import Point
from folium.map import FeatureGroup, LayerControl

In [2]:
warnings.filterwarnings(action="ignore")

In [3]:
proj_wgs84 = pyproj.Proj('+proj=longlat +datum=WGS84')
def geodesic_point_buffer(point, meter):
    # Azimuthal equidistant projection
    aeqd_proj = '+proj=aeqd +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0'
    project = partial(
        pyproj.transform,
        pyproj.Proj(aeqd_proj.format(lat=point.y, lon=point.x)),
        proj_wgs84
    )
    buf = Point(0, 0).buffer(meter)  # distance in metres
    return transform(project, buf)

In [4]:
def calc_meter_area(polygon):
    proj = partial(
        pyproj.transform,
        pyproj.Proj(init='epsg:4326'),
        pyproj.Proj(init='epsg:3857')
    )
    s_new = transform(proj, polygon)
    return s_new.area

In [5]:
data = pd.read_json("data/kids-accident-pp.json")
data = data.assign(
    sido = [x.split()[0] for x in data.legaldong_name],
    gugun = [x.split()[1] for x in data.legaldong_name],
    acdnt_dd_dc = pd.to_datetime(data.acdnt_dd_dc, format="%Y-%m-%d")
)

In [6]:
gdf = gpd.read_file("data/geometry.geojson")
gdf = gdf.assign(rad_X = np.radians(gdf.geometry.x), rad_Y=np.radians(gdf.geometry.y))

# 1. 기준 설정

In [7]:
acdnt_cnt = (data.groupby(data.acdnt_dd_dc.dt.year)
                 .size()
                 .reset_index()
                 .rename(columns={"acdnt_dd_dc":"year", 0:"cnt"})
                 .assign(constant=1)
            )

In [8]:
ols_model = OLS(acdnt_cnt.cnt.values,acdnt_cnt[["constant","year"]]).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.953
Method:,Least Squares,F-statistic:,245.5
Date:,"Tue, 16 Jun 2020",Prob (F-statistic):,7.2e-09
Time:,00:20:41,Log-Likelihood:,-91.486
No. Observations:,13,AIC:,187.0
Df Residuals:,11,BIC:,188.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,7.054e+05,4.47e+04,15.787,0.000,6.07e+05,8.04e+05
year,-347.7637,22.197,-15.667,0.000,-396.620,-298.908

0,1,2,3
Omnibus:,0.697,Durbin-Watson:,0.847
Prob(Omnibus):,0.706,Jarque-Bera (JB):,0.015
Skew:,-0.076,Prob(JB):,0.992
Kurtosis:,3.071,Cond. No.,1080000.0


In [9]:
summation = 2
criterion = 2
print("year","|","criterion")
print("====","|","="*20)
print(acdnt_cnt.year.values[-1],"|",criterion)
for i in range(acdnt_cnt.shape[0]-1):
    criterion = criterion/ols_model.fittedvalues.values[-(i+1)] * ols_model.fittedvalues.values[-(i+2)]
    summation += criterion
    print(acdnt_cnt.year.values[-(i+2)], "|", criterion)

year | criterion
2019 | 2
2018 | 2.212058873785904
2017 | 2.424117747571808
2016 | 2.6361766213577122
2015 | 2.848235495143616
2014 | 3.0602943689295206
2013 | 3.2723532427154245
2012 | 3.484412116501329
2011 | 3.6964709902872324
2010 | 3.9085298640731367
2009 | 4.120588737859041
2008 | 4.332647611644946
2007 | 4.54470648543085


In [10]:
# 2건 X 13년 = 26
criterion = 26 / (300 * 300 * np.pi)

# 2. 클러스터 선정

In [11]:
seoul = data.query("sido=='서울특별시'").drop("sido", axis=1)

In [12]:
earth_radius_km = 6371
epsilon = 0.1 / earth_radius_km #calculate 150 meter epsilon threshold
min_samples = int(26/9)

In [13]:
result = []
for gu in seoul.gugun.unique():
    print(gu)
    # 클러스터 모델 적합
    df = seoul[seoul.gugun==gu]
    geo = gdf[gdf.acdnt_no.isin(df.acdnt_no)].drop("acdnt_no",axis=1)
    model = DBSCAN(
        eps=epsilon,
        min_samples=min_samples,
        n_jobs=6
    )
    model.fit(geo[["rad_X", "rad_Y"]])
    if model.labels_.max()==-1:
        print("-----------------------------------")
        print("클러스터 없음")
        print("-----------------------------------\n")
        continue
    before = pd.Series(model.labels_).value_counts().sort_index().reset_index().rename(columns={"index":"cluster", 0:"count"})
    
    # 밀도 계산
    geo = geo.assign(
        labels = model.labels_
    ).query("labels!=-1").assign(geometry = geo.geometry.apply(geodesic_point_buffer, meter=100))
    
    geo = pd.merge(
        geo.dissolve(by="labels").reset_index(),
        geo.labels.value_counts().reset_index().rename(columns={"labels":"cls_size", "index":"labels"})
    )
    geo = geo.assign(meter_area = geo.geometry.apply(calc_meter_area))
    geo = geo.assign(density = geo.cls_size/geo.meter_area)

    print("-----------------------------------")
    print(
        before.assign(
            alive = before.cluster.isin(geo[geo.density > criterion].labels),
            density = [0] + list(geo.sort_values("labels").density)
        )
    )
    print("-----------------------------------\n")
    result.append(geo.copy())

도봉구
-----------------------------------
    cluster  count  alive   density
0        -1     93  False  0.000000
1         0      4  False  0.000050
2         1      2  False  0.000028
3         2     14  False  0.000077
4         3      8  False  0.000043
5         4      3  False  0.000045
6         5      2  False  0.000035
7         6      2  False  0.000030
8         7     15  False  0.000071
9         8      8  False  0.000074
10        9      4  False  0.000062
11       10      4  False  0.000038
12       11      2  False  0.000026
13       12      6  False  0.000055
14       13      7  False  0.000065
15       14      2  False  0.000025
16       15      6  False  0.000042
17       16      2  False  0.000027
18       17      9  False  0.000086
19       18      4  False  0.000048
20       19      7   True  0.000094
21       20      3  False  0.000043
22       21      4  False  0.000055
23       22      2  False  0.000031
24       23      4  False  0.000036
25       24      4  Fals

-----------------------------------
     cluster  count  alive   density
0         -1    152  False  0.000000
1          0      6  False  0.000058
2          1      2  False  0.000028
3          2      3  False  0.000034
4          3      2  False  0.000028
5          4      3  False  0.000040
6          5     17  False  0.000061
7          6      8  False  0.000068
8          7      2  False  0.000028
9          8      5  False  0.000075
10         9      5  False  0.000064
11        10      2  False  0.000030
12        11      6  False  0.000062
13        12      7  False  0.000044
14        13      4  False  0.000044
15        14      2  False  0.000038
16        15      2  False  0.000031
17        16      2  False  0.000029
18        17      3  False  0.000031
19        18      2  False  0.000034
20        19     12  False  0.000059
21        20      3  False  0.000033
22        21      2  False  0.000028
23        22     19  False  0.000069
24        23     19  False  0.000083
25

-----------------------------------
     cluster  count  alive   density
0         -1    138  False  0.000000
1          0      2  False  0.000027
2          1      6  False  0.000085
3          2      5  False  0.000039
4          3      7  False  0.000063
5          4      6  False  0.000064
6          5      2  False  0.000032
7          6      4  False  0.000043
8          7     22   True  0.000097
9          8      3  False  0.000051
10         9      3  False  0.000039
11        10      3  False  0.000041
12        11      2  False  0.000028
13        12      6  False  0.000051
14        13      2  False  0.000039
15        14      2  False  0.000026
16        15     11  False  0.000078
17        16      6  False  0.000053
18        17      6  False  0.000055
19        18     13  False  0.000086
20        19      2  False  0.000028
21        20      3  False  0.000034
22        21      2  False  0.000027
23        22      2  False  0.000029
24        23      2  False  0.000029
25

-----------------------------------
     cluster  count  alive   density
0         -1    222  False  0.000000
1          0     12  False  0.000074
2          1      2  False  0.000028
3          2     11  False  0.000050
4          3      8  False  0.000074
5          4      2  False  0.000032
6          5      2  False  0.000037
7          6      2  False  0.000027
8          7     16  False  0.000072
9          8      3  False  0.000037
10         9      4  False  0.000050
11        10      2  False  0.000029
12        11      2  False  0.000037
13        12      9  False  0.000064
14        13      5  False  0.000062
15        14     14  False  0.000073
16        15      2  False  0.000033
17        16      8  False  0.000069
18        17      3  False  0.000039
19        18      9  False  0.000088
20        19     11  False  0.000062
21        20      3  False  0.000048
22        21      4  False  0.000044
23        22      3  False  0.000037
24        23      6  False  0.000042
25

-----------------------------------
    cluster  count  alive   density
0        -1    149  False  0.000000
1         0      5  False  0.000039
2         1      7  False  0.000054
3         2      3  False  0.000040
4         3      7  False  0.000048
5         4      7  False  0.000058
6         5      2  False  0.000029
7         6      2  False  0.000028
8         7      2  False  0.000026
9         8      6  False  0.000078
10        9      3  False  0.000036
11       10      3  False  0.000035
12       11      2  False  0.000039
13       12      5  False  0.000054
14       13      8  False  0.000067
15       14      2  False  0.000027
16       15      3  False  0.000040
17       16      4  False  0.000053
18       17      3  False  0.000047
19       18      2  False  0.000026
20       19      2  False  0.000028
21       20      3  False  0.000040
22       21      7  False  0.000066
23       22      5  False  0.000055
24       23      2  False  0.000033
25       24      5  False  0

-----------------------------------
    cluster  count  alive   density
0        -1    116  False  0.000000
1         0      2  False  0.000035
2         1      5  False  0.000053
3         2      4  False  0.000047
4         3      6  False  0.000046
5         4      5  False  0.000045
6         5      3  False  0.000039
7         6      4  False  0.000057
8         7      6  False  0.000060
9         8     13   True  0.000093
10        9      7  False  0.000060
11       10      9  False  0.000059
12       11      8  False  0.000066
13       12      2  False  0.000034
14       13      3  False  0.000033
15       14      3  False  0.000045
16       15      2  False  0.000031
17       16      8  False  0.000051
18       17      2  False  0.000027
19       18      3  False  0.000040
20       19      4  False  0.000043
21       20      2  False  0.000032
22       21      3  False  0.000032
23       22      4  False  0.000040
24       23      2  False  0.000032
25       24      7   True  0

# 3. 지도 시각화

In [14]:
cluster = pd.concat(result).query(f"density > {criterion}")
acdnt_cls = gpd.sjoin(gdf[['geometry', 'acdnt_no']], cluster[['labels', 'geometry']],  op="within")
acdnt_cls = pd.merge(acdnt_cls, data)
acdnt_cls = acdnt_cls.assign(acdnt_dd_dc = acdnt_cls.acdnt_dd_dc.astype(str))

In [66]:
style_function = lambda x: {'color':'#FF0000','fillColor': '#FF0000'}

In [67]:
feauturegroup1 = FeatureGroup(name="사고발생좌표")
feauturegroup2 = FeatureGroup(name="어린이보호구역추천")
feauturegroup3 = FeatureGroup(name="스쿨존")

In [68]:
m = folium.Map(location=[37.53, 126.97], zoom_start=12) 

folium.features.GeoJson(cluster, style_function=style_function).add_to(feauturegroup2)
folium.features.GeoJson(acdnt_cls).add_to(feauturegroup1)

<folium.features.GeoJson at 0x26221931dd8>

In [30]:
df_safe=pd.read_csv("./data/전국어린이보호구역표준데이터.csv", encoding="cp949")
gdf_safe=gpd.GeoDataFrame(df_safe, geometry=gpd.points_from_xy(x=df_safe.경도, y=df_safe.위도))
gdf_safe["point"]=gdf_safe.geometry.apply(lambda x : list(reversed(str(x)[7:-1].split(" "))))
gdf_safe_서울 = gdf_safe[gdf_safe.제공기관명.str.contains(pat="서울")].reset_index()
gdf_safe_서울.head()

Unnamed: 0,index,시설종류,대상시설명,소재지도로명주소,소재지지번주소,위도,경도,관리기관명,관할경찰서명,CCTV설치여부,CCTV설치대수,보호구역도로폭,데이터기준일자,제공기관코드,제공기관명,geometry,point
0,39,유치원,신창,서울특별시 도봉구 덕릉로 272,,37.641337,127.040961,도봉구청,도봉경찰서,Y,1.0,8~25,2019-09-05,3090000,서울특별시 도봉구,POINT (127.04096 37.64134),"[37.6413368, 127.0409612]"
1,94,초등학교,후암초교,서울특벽시 용산구 두텁바위로 140,서울특별시 용산구 후암동 30-138,37.551365,126.981726,용산구청,용산경찰서,Y,,6~12,2020-02-18,3020000,서울특별시 용산구,POINT (126.98173 37.55137),"[37.5513654106, 126.9817257628]"
2,95,초등학교,청파초교(병설유치원),서울특벽시 용산구 효창원로 228,서울특별시 용산구 청파동2가 1-42,37.547303,126.963544,용산구청,용산경찰서,Y,,4~12,2020-02-18,3020000,서울특별시 용산구,POINT (126.96354 37.54730),"[37.5473030994, 126.9635440384]"
3,96,초등학교,이태원초교,서울특벽시 용산구 녹사평대로40길 19,서울특별시 용산구 이태원동 406,37.536478,126.987672,용산구청,용산경찰서,Y,,6~7,2020-02-18,3020000,서울특별시 용산구,POINT (126.98767 37.53648),"[37.5364780669, 126.9876717551]"
4,97,초등학교,남정초교(병설유치원),서울특벽시 용산구 원효로64길 17-10,서울특별시 용산구 원효로2가 54-1,37.535673,126.964803,용산구청,용산경찰서,Y,,6~8,2020-02-18,3020000,서울특별시 용산구,POINT (126.96480 37.53567),"[37.5356733769, 126.96480333]"


In [69]:
for i in range(len(gdf_safe_서울)):
    folium.Circle(location = gdf_safe_서울.point[i],
                 popup=[gdf_safe_서울.대상시설명[i],gdf_safe_서울.관리기관명[i], gdf_safe_서울.CCTV설치여부[i]],
                 radius = 300,
                fill_color='#FFFF00',
                  color = "#FAFAFA",
                  weight = 1
                  ).add_to(feauturegroup3)
    
feauturegroup1.add_to(m)
feauturegroup2.add_to(m)
feauturegroup3.add_to(m)

<folium.map.FeatureGroup at 0x2622197e9e8>

In [70]:
LayerControl().add_to(m)
plugins.ScrollZoomToggler().add_to(m)

<folium.plugins.scroll_zoom_toggler.ScrollZoomToggler at 0x26221b44f28>

In [71]:
m.save("tmp.html")