In [1]:
import pandas as pd
import folium
from haversine import haversine
import numpy as np
import seaborn as sns
import json
import os
from copy import deepcopy 
import geopandas as gpd
from shapely.geometry import Point
sns.set()

In [2]:
# Local
# Font
import matplotlib
from matplotlib import font_manager, rc
import platform
if platform.system()=="Windows":
    font_name=font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
matplotlib.rcParams['axes.unicode_minus']=False

import warnings
warnings.filterwarnings("ignore")

In [3]:
path = "D:\COMPAS\SBJ_2012_001"

In [22]:
total = gpd.read_file(os.path.join(path,'total.geojson'))
total.head(1)

Unnamed: 0,gid,accident_cnt,유치원_count_300,초등학교_count_300,방지턱_count_300,불법주정차_cam_count_300,정류장_count_100,정류장_count_300,유동인구_sum,유소년인구_count_200,...,운산초통학구역,운암초통학구역,원동초통학구역,필봉초통학구역,화성초통학구역,주정차_count_300,총인구_count_100,생산가능인구_count_100,고령인구_count_100,geometry
0,다사561083,1,0,0,0,0,0,2,43.435676,0.0,...,0,0,0,0,0,2,0.0,0.0,0.0,"MULTIPOLYGON (((127.00549 37.17243, 127.00549 ..."


# X ,Y 생성

In [10]:
X = pd.DataFrame(total.drop(["gid","accident_cnt","geometry"],axis=1))
cols = X.columns
Y = pd.Series(total["accident_cnt"])
X.shape , Y.shape

((345, 12), (345,))

# Train-Test split

In [11]:
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
m_scaler = MinMaxScaler()
X = scaler.fit_transform(X)

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25)
x_train.shape , x_test.shape

((258, 12), (87, 12))

# Linear Regressor

In [12]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
LR.fit(x_train, y_train)

score = LR.score(x_test,y_test)
print(score)

0.013902144370005254


In [13]:
print(LR.predict(x_test[0].reshape(1,-1)))
print(LR.coef_)

[1.77685988]
[ 0.18314508  0.03755685  0.00848777  0.0356217  -0.07755756  0.07159313
 -0.01661443  0.1586965  -0.07861593  0.04516493 -0.11837979  0.10907059]


# AdaBoostRegressor

In [19]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(n_estimators=20,learning_rate=0.1,loss="square",random_state=42)
ada.fit(x_train, y_train)

score = ada.score(x_test,y_test)
print(score)

-0.022250404533341595


In [15]:
top3 = np.argsort(ada.feature_importances_)[::-1][:3]
print("Top3 주요 특성 :",cols[top3].values)

Top3 주요 특성 : ['유동인구_sum' '학원수_count_300' '어린이집_count']


In [20]:
total = gpd.read_file(os.path.join(path,'total.geojson'))
X = pd.DataFrame(total.drop(["gid","accident_cnt","geometry"],axis=1))
cols = X.columns
Y = pd.Series(total["accident_cnt"])
X.shape , Y.shape
scaler = StandardScaler()
m_scaler = MinMaxScaler()
X = scaler.fit_transform(X)

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25)
x_train.shape , x_test.shape
LR = LinearRegression()
LR.fit(x_train, y_train)

score = LR.score(x_test,y_test)
print(score)
print(LR.predict(x_test[0].reshape(1,-1)))
print(LR.coef_)

-8.950593560226125e+25
[1.75805664]
[ 4.24967651e-02  2.47747670e-02  9.17837654e-02 -2.23965110e-02
  1.99963541e-01  1.26636423e-01 -2.36317800e-02 -1.71610044e-01
  3.02975235e-02  6.39971923e-02 -1.13350670e-01  8.64967542e-02
 -1.36347176e-01 -9.46688244e-02 -1.85521148e-01 -2.50834178e-01
  9.35952732e+12 -2.65364356e-01 -7.03386555e-02 -3.62951799e+12
  8.44836743e-02 -3.15113633e-02  1.90612388e+12  8.66332292e+12
 -3.74158043e-01 -5.31738583e+12  1.04416693e-02  5.78566293e+11
 -9.74256425e-02 -8.40385107e-02  1.90612388e+12 -7.28611763e-02
 -2.14273137e-01 -7.12890625e-02 -7.61718750e-02  4.99114990e-01
  7.83081055e-02  4.34797012e+12 -7.67420301e+12 -4.99895550e+12
  1.15966797e-01  1.39282227e-01  1.17614746e-01 -5.78566293e+11
  3.62548828e-02 -5.94831973e+12  6.16455078e-02  9.32617188e-02
  2.08740234e-02  1.61621094e-01  1.07299805e-01  3.07895764e+12
  1.53808594e-02  2.49206543e-01  2.09960938e-02  8.88671875e-02
  8.78906250e-03  1.90429688e-02  1.73583984e-01 -2.29

In [21]:
ada = AdaBoostRegressor(n_estimators=20,learning_rate=0.1,loss="square",random_state=42)
ada.fit(x_train, y_train)

score = ada.score(x_test,y_test)
print(score)
top3 = np.argsort(ada.feature_importances_)[::-1][:3]
print("Top3 주요 특성 :",cols[top3].values)

0.11217197855221372
Top3 주요 특성 : ['신호등_count_100' '정류장_count_100' '횡단보도_count_100']
