# 할 일

1. 구별 클러스터링
2. 클러스터별 면적 계산
3. 사고발생건수/면적 계산: 기준 이상만 남기기

In [267]:
import warnings
import pyproj
import numpy as np
import pandas as pd
import geopandas as gpd
from statsmodels.api import OLS
from sklearn.cluster import DBSCAN
from functools import partial
from shapely.ops import transform
from shapely.geometry import Point

In [187]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the geo libraries
# used below — consider narrowing the filter once the notebook is stable.
warnings.simplefilter("ignore")

In [152]:
# Geographic lon/lat coordinates on the WGS84 datum; target of the buffer projection.
proj_wgs84 = pyproj.Proj('+proj=longlat +datum=WGS84')


def geodesic_point_buffer(point, meter):
    """Return a lon/lat polygon approximating a circle of radius ``meter`` around ``point``.

    The circle is drawn in an azimuthal equidistant projection centred on the
    point, where distances from the centre are preserved, and then mapped back
    to WGS84 longitude/latitude.

    Parameters
    ----------
    point : shapely.geometry.Point
        Centre of the buffer; ``point.x`` is read as longitude and ``point.y``
        as latitude.
    meter : float
        Buffer radius in the projection's units (metres).

    Returns
    -------
    shapely.geometry.Polygon
        The buffer polygon expressed in WGS84 longitude/latitude.
    """
    # Azimuthal equidistant projection centred on the point itself.
    aeqd_proj = '+proj=aeqd +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0'
    # Transformer replaces the deprecated ``pyproj.transform`` function;
    # always_xy=True keeps the (x=lon, y=lat) axis order used throughout.
    local_to_wgs84 = pyproj.Transformer.from_proj(
        pyproj.Proj(aeqd_proj.format(lat=point.y, lon=point.x)),
        proj_wgs84,
        always_xy=True,
    )
    buf = Point(0, 0).buffer(meter)  # distance in metres
    return transform(local_to_wgs84.transform, buf)

In [183]:
def calc_meter_area(polygon):
    """Return the area of a lon/lat geometry in square metres.

    The geometry is projected into a Lambert azimuthal equal-area projection
    centred on its own centroid, so the planar area of the projected shape is
    a true ground area. Web Mercator (EPSG:3857) is deliberately not used: its
    "metres" are stretched away from the equator, which inflates areas by
    roughly ``1 / cos(latitude) ** 2`` and would make densities computed from
    the result too small.

    Parameters
    ----------
    polygon : shapely geometry
        Polygon or multi-polygon in WGS84 longitude/latitude (x = lon, y = lat).

    Returns
    -------
    float
        Area in square metres; ``0.0`` for an empty geometry.
    """
    # An empty geometry has no centroid to centre the projection on.
    if polygon.is_empty:
        return 0.0

    center = polygon.centroid
    equal_area_proj = (
        '+proj=laea +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0 '
        '+datum=WGS84 +units=m'
    ).format(lat=center.y, lon=center.x)
    # always_xy=True keeps the (lon, lat) input order of the geometries.
    to_equal_area = pyproj.Transformer.from_crs(
        'EPSG:4326', equal_area_proj, always_xy=True
    )
    # Project once and measure the planar area of the result.
    return transform(to_equal_area.transform, polygon).area

In [5]:
# Plot styling: draw text with the NanumGothic font and use an ASCII hyphen
# for negative tick labels instead of the Unicode minus sign.
# NOTE(review): `plt` is not imported anywhere in the visible source —
# presumably `matplotlib.pyplot`; confirm the import exists before this cell.
plt.rcParams.update({
    "font.family": "NanumGothic",
    'axes.unicode_minus': False,
})

In [255]:
# Accident records (tabular) and their point geometries.
data = pd.read_json("data/kids-accident-pp.json")
gdf = gpd.read_file("data/geometry.geojson")

# Derive region columns from the whitespace-separated `legaldong_name`
# (first token -> sido, second token -> gugun) and parse the accident date.
data = data.assign(
    sido=lambda frame: np.array(
        [name.split()[0] for name in frame.legaldong_name]
    ),
    gugun=lambda frame: np.array(
        [name.split()[1] for name in frame.legaldong_name]
    ),
    acdnt_dd_dc=lambda frame: pd.to_datetime(
        frame.acdnt_dd_dc, format="%Y-%m-%d"
    ),
)

# 1. 기준 설정

In [388]:
# Number of accidents per calendar year, with a constant column so the frame
# can be used directly as a regression design matrix.
# Resulting columns: year, cnt, constant.
acdnt_cnt = (
    data.groupby(data.acdnt_dd_dc.dt.year)
    .size()
    .rename("cnt")
    .rename_axis("year")
    .reset_index()
    .assign(constant=1)
)

In [390]:
# Linear trend of the yearly accident count: cnt ~ constant + year.
ols_model = OLS(
    endog=acdnt_cnt.cnt.values,
    exog=acdnt_cnt[["constant", "year"]],
).fit()
# Bare expression so the notebook renders the regression summary.
ols_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.953
Method:,Least Squares,F-statistic:,245.5
Date:,"Sun, 14 Jun 2020",Prob (F-statistic):,7.2e-09
Time:,21:43:50,Log-Likelihood:,-91.486
No. Observations:,13,AIC:,187.0
Df Residuals:,11,BIC:,188.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,7.054e+05,4.47e+04,15.787,0.000,6.07e+05,8.04e+05
year,-347.7637,22.197,-15.667,0.000,-396.620,-298.908

0,1,2,3
Omnibus:,0.697,Durbin-Watson:,0.847
Prob(Omnibus):,0.706,Jarque-Bera (JB):,0.015
Skew:,-0.076,Prob(JB):,0.992
Kurtosis:,3.071,Cond. No.,1080000.0


In [413]:
# Start from a criterion of 3 for the most recent year and walk backwards in
# time: each earlier year's criterion is the later one rescaled by the ratio
# of the fitted accident counts (earlier / later). `summation` accumulates the
# criteria over all years, including the starting value.
# NOTE(review): the starting value 3 is taken as given; its rationale is not
# visible in this file.
summation = 3
criterion = 3
print("year","|","criterion")
print("====","|","="*20)

fitted = ols_model.fittedvalues.values
years = acdnt_cnt.year.values
print(years[-1],"|",criterion)

# Pair every fitted value (newest first) with the one from the year before it.
for newer_fit, older_fit, year in zip(fitted[::-1], fitted[-2::-1], years[-2::-1]):
    criterion = criterion / newer_fit * older_fit
    summation += criterion
    print(year, "|", criterion)

year | criterion
2019 | 3
2018 | 3.3180883106787835
2017 | 3.636176621357567
2016 | 3.9542649320364567
2015 | 4.272353242715241
2014 | 4.590441553394024
2013 | 4.908529864072808
2012 | 5.226618174751698
2011 | 5.544706485430481
2010 | 5.862794796109265
2009 | 6.180883106788048
2008 | 6.498971417466832
2007 | 6.817059728145722


In [439]:
# Turn the accumulated count into a density threshold by dividing by the area
# of a circle of radius 300 (presumably metres — TODO confirm the intended zone).
criterion = summation / (np.pi * 300 ** 2)

In [440]:
# Bare expression: shows the current value of the density threshold in the notebook.
criterion

0.00022568485078573245

# 2. 클러스터 선정

In [396]:
# Restrict the analysis to rows whose `sido` is '서울특별시'; the column is
# constant after filtering, so it is dropped.
seoul = data.query("sido=='서울특별시'").drop(columns="sido")

In [397]:
# Mean Earth radius in kilometres; dividing a distance in km by it gives the
# corresponding angle in radians.
earth_radius_km = 6371
# Neighbourhood radius passed to DBSCAN as `eps`: 0.2 km expressed in radians.
# NOTE(review): the previous comment said "100 meter", but 0.2 km is 200 m —
# confirm which radius is intended.
epsilon = 0.2 / earth_radius_km

In [None]:
# DBSCAN expects an integer neighbour count (newer scikit-learn versions
# reject floats). Rounding up keeps the rule "at least summation / 10 points".
min_cluster_points = int(np.ceil(summation / 10))

# One GeoDataFrame of dense clusters per district, in iteration order.
result = []
for gu in seoul.gugun.unique():
    print(gu)
    # Fit the clustering model on this district's accident locations.
    df = seoul[seoul.gugun==gu]
    geo = gdf[gdf.acdnt_no.isin(df.acdnt_no)]
    # `epsilon` is an angle (km / Earth radius), so distances must be
    # great-circle distances: the haversine metric takes [lat, lon] in radians.
    coordinates = np.column_stack([geo.geometry.y, geo.geometry.x])
    X = np.radians(coordinates)
    model = DBSCAN(
        eps=epsilon,
        min_samples=min_cluster_points,
        metric="haversine",
        algorithm="ball_tree",
        n_jobs=4
    )
    model.fit(X)

    # Compute cluster density: buffer every clustered point by 100 m, merge
    # the buffers of each cluster into one shape, and count its members.
    geo = geo.assign(
        labels = model.labels_,
        geometry = geo.geometry.apply(geodesic_point_buffer, meter=100)
    ).query("labels!=-1")  # label -1 marks DBSCAN noise points
    # Explicit column names avoid relying on the auto-generated names of
    # value_counts().reset_index(), which differ between pandas versions.
    cluster_sizes = geo.groupby("labels").size().rename("cls_size").reset_index()
    geo = pd.merge(
        geo.dissolve(by="labels").reset_index(),
        cluster_sizes,
        on="labels"
    )
    geo = geo.assign(meter_area = geo.geometry.apply(calc_meter_area))
    # Keep only clusters whose accidents-per-area exceed the threshold.
    geo = geo[(geo.cls_size / geo.meter_area) > criterion]
    print("-----------------")
    print(geo)
    print("-----------------\n")

    result.append(geo.copy())
    