In [1]:
import pandas as pd
import folium
from haversine import haversine
import numpy as np
import seaborn as sns
import json
import os
from copy import deepcopy 
import geopandas as gpd
from shapely.geometry import Point
sns.set()

In [2]:
# Local
# Font
import matplotlib
from matplotlib import font_manager, rc
import platform
if platform.system()=="Windows":
    font_name=font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
matplotlib.rcParams['axes.unicode_minus']=False

import warnings
warnings.filterwarnings("ignore")

In [3]:
path = "D:\COMPAS\SBJ_2012_001"

In [22]:
total = gpd.read_file(os.path.join(path,'total.geojson'))
total.head(1)

Unnamed: 0,gid,accident_cnt,유치원_count_300,초등학교_count_300,방지턱_count_300,불법주정차_cam_count_300,정류장_count_100,정류장_count_300,유동인구_sum,유소년인구_count_200,...,운산초통학구역,운암초통학구역,원동초통학구역,필봉초통학구역,화성초통학구역,주정차_count_300,총인구_count_100,생산가능인구_count_100,고령인구_count_100,geometry
0,다사561083,1,0,0,0,0,0,2,43.435676,0.0,...,0,0,0,0,0,2,0.0,0.0,0.0,"MULTIPOLYGON (((127.00549 37.17243, 127.00549 ..."


# X ,Y 생성

In [10]:
X = pd.DataFrame(total.drop(["gid","accident_cnt","geometry"],axis=1))
cols = X.columns
Y = pd.Series(total["accident_cnt"])
X.shape , Y.shape

((345, 12), (345,))

# Train-Test split

In [11]:
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
m_scaler = MinMaxScaler()
X = scaler.fit_transform(X)

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25)
x_train.shape , x_test.shape

((258, 12), (87, 12))

# Validation func gen

In [23]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,make_scorer
from sklearn.model_selection import cross_val_score

In [24]:
my_scorer = make_scorer(mean_squared_error,greater_is_better=False)

kf = KFold(n_splits=10,shuffle=True,random_state=2021)

__usage__  
score = cross_val_score(best_model,rand_xtest,rand_ytest,cv=kf,scoring=my_scorer)  
score.mean()*(-1)

# Linear Regressor

In [12]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
LR.fit(x_train, y_train)

score = LR.score(x_test,y_test)
print(score)

0.013902144370005254


In [13]:
print(LR.predict(x_test[0].reshape(1,-1)))
print(LR.coef_)

[1.77685988]
[ 0.18314508  0.03755685  0.00848777  0.0356217  -0.07755756  0.07159313
 -0.01661443  0.1586965  -0.07861593  0.04516493 -0.11837979  0.10907059]


# AdaBoostRegressor

In [39]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(n_estimators=200,learning_rate=0.1,loss="square",random_state=2021)
ada.fit(x_train, y_train)

Rscore = ada.score(x_test,y_test)
print("Rscore :",Rscore)

score = cross_val_score(ada,x_test,y_test,cv=kf,scoring=my_scorer)  
print("MSE :",score.mean()*(-1))

Rscore : 0.11162321677920595
MSE : 0.8162168627606896


In [40]:
top3 = np.argsort(ada.feature_importances_)[::-1][:3]
print("Top3 주요 특성 :",cols[top3].values)

Top3 주요 특성 : ['신호등_count_100' '정류장_count_100' '어린이집_count']


# SVR

In [37]:
from sklearn.svm import LinearSVR

In [38]:
svr = LinearSVR()
svr.fit(x_train,y_train)

Rscore = svr.score(x_test,y_test)
print("Rscore :",Rscore)
score = cross_val_score(svr,x_test,y_test,cv=kf,scoring=my_scorer)  
print("val MSE :",score.mean()*(-1))

Rscore : -0.06599364532167407
val MSE : 1.6708432345578181


# LGBM