# LIBRARY

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
from statsmodels.genmod.families.family import Binomial
from sklearn.ensemble import RandomForestClassifier

# DATA

In [2]:
train = pd.read_csv('PJT002_train.csv')
val = pd.read_csv('PJT002_validation.csv')
test = pd.read_csv('PJT002_test.csv')
sub = pd.read_csv('PJT002_submission.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## N, Y -> 0, 1

In [3]:
binary_y = {'N': 0, 'Y': 1}

train['fr_yn'] = train['fr_yn'].map(binary_y)
val['fr_yn'] = val['fr_yn'].map(binary_y)
train['mlt_us_yn'] = train['mlt_us_yn'].map(binary_y)
val['mlt_us_yn'] = val['mlt_us_yn'].map(binary_y)
test['mlt_us_yn'] = test['mlt_us_yn'].map(binary_y)

### 필요한 열만 가져오기

In [4]:
columns = ['id', 'dt_of_fr', 'bldng_cnt', 'bldng_ar', 'ttl_ar', 'lnd_ar'
           , 'jmk', 'fr_sttn_dstnc', 'fr_wthr_fclt_dstnc'
           , 'mlt_us_yn', 'cctv_dstnc', 'cctv_in_100m'
           , 'fr_wthr_fclt_in_100m', 'tbc_rtl_str_dstnc', 'sft_emrgnc_bll_dstnc'
           , 'ahsm_dstnc', 'no_tbc_zn_dstnc', 'bldng_cnt_in_50m', 'fr_yn'
          ]
columns2 = ['id', 'dt_of_fr', 'bldng_cnt', 'bldng_ar', 'ttl_ar', 'lnd_ar'
           , 'jmk', 'fr_sttn_dstnc', 'fr_wthr_fclt_dstnc'
           , 'mlt_us_yn', 'cctv_dstnc', 'cctv_in_100m'
           , 'fr_wthr_fclt_in_100m', 'tbc_rtl_str_dstnc', 'sft_emrgnc_bll_dstnc'
           , 'ahsm_dstnc', 'no_tbc_zn_dstnc', 'bldng_cnt_in_50m'
          ]

train_pp = train[columns]
test_pp = test[columns2]
val_pp = val[columns]

# NULL CHECK
null 없는 애들로만 가져와서 null이 없음

In [5]:
train_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59199 entries, 0 to 59198
Data columns (total 19 columns):
id                      59199 non-null int64
dt_of_fr                59199 non-null object
bldng_cnt               59199 non-null int64
bldng_ar                59199 non-null float64
ttl_ar                  59199 non-null float64
lnd_ar                  59199 non-null float64
jmk                     59199 non-null object
fr_sttn_dstnc           59199 non-null int64
fr_wthr_fclt_dstnc      59199 non-null int64
mlt_us_yn               59199 non-null int64
cctv_dstnc              59199 non-null int64
cctv_in_100m            59199 non-null int64
fr_wthr_fclt_in_100m    59199 non-null int64
tbc_rtl_str_dstnc       59199 non-null int64
sft_emrgnc_bll_dstnc    59199 non-null int64
ahsm_dstnc              59199 non-null int64
no_tbc_zn_dstnc         59199 non-null int64
bldng_cnt_in_50m        59199 non-null int64
fr_yn                   59199 non-null int64
dtypes: float64(3), int64(14

# EDA

In [6]:
train_pp.corr()

Unnamed: 0,id,bldng_cnt,bldng_ar,ttl_ar,lnd_ar,fr_sttn_dstnc,fr_wthr_fclt_dstnc,mlt_us_yn,cctv_dstnc,cctv_in_100m,fr_wthr_fclt_in_100m,tbc_rtl_str_dstnc,sft_emrgnc_bll_dstnc,ahsm_dstnc,no_tbc_zn_dstnc,bldng_cnt_in_50m,fr_yn
id,1.0,-0.001038,0.00547,-0.00087,0.000515,-0.000474,0.008163,-0.006766,-0.001483,0.001142,-0.007934,0.003033,-0.005578,0.001546,0.004541,0.001812,-0.002368
bldng_cnt,-0.001038,1.0,0.530414,0.189012,0.44332,0.009314,-0.014234,-0.012059,0.010925,-0.066528,-0.051694,0.019226,-0.011281,-0.003087,0.029456,-0.048917,0.056176
bldng_ar,0.00547,0.530414,1.0,0.329309,0.604804,-0.029663,-0.005821,-0.000105,-0.011676,-0.014838,-0.008633,-0.03115,-0.024136,0.034578,-0.014944,-0.009598,0.085169
ttl_ar,-0.00087,0.189012,0.329309,1.0,0.230771,-0.020504,-0.011536,0.001854,-0.011232,-0.006543,0.00459,-0.022649,-0.017894,0.026339,-0.015214,-0.003122,0.059955
lnd_ar,0.000515,0.44332,0.604804,0.230771,1.0,-0.004147,-0.005827,-0.000732,-0.000384,-0.003896,-0.002267,-0.007256,0.002419,-0.003103,-0.004577,0.000704,0.025034
fr_sttn_dstnc,-0.000474,0.009314,-0.029663,-0.020504,-0.004147,1.0,0.070991,-0.049488,0.363127,-0.237448,-0.220333,0.588492,0.317828,0.057742,0.178793,-0.284051,-0.070687
fr_wthr_fclt_dstnc,0.008163,-0.014234,-0.005821,-0.011536,-0.005827,0.070991,1.0,-0.032921,0.09849,0.076436,-0.285666,0.137121,-0.02092,-0.272471,0.001452,0.033513,-0.16528
mlt_us_yn,-0.006766,-0.012059,-0.000105,0.001854,-0.000732,-0.049488,-0.032921,1.0,-0.029144,0.05026,0.06469,-0.050321,-0.043983,0.041375,-0.035862,0.019793,0.075448
cctv_dstnc,-0.001483,0.010925,-0.011676,-0.011232,-0.000384,0.363127,0.09849,-0.029144,1.0,-0.188067,-0.155715,0.23732,0.320343,-0.11484,0.101578,-0.148197,-0.029258
cctv_in_100m,0.001142,-0.066528,-0.014838,-0.006543,-0.003896,-0.237448,0.076436,0.05026,-0.188067,1.0,0.246822,-0.229323,-0.134099,0.001951,-0.166905,0.212812,0.02754


### 화재 예측에는 년, 일, 분, 초는 필요하지 않을 것 같아서 일단 월, 시만 가져옴

In [7]:
train_pp['month'] = pd.to_datetime(train_pp['dt_of_fr']).dt.month
train_pp['hour'] = pd.to_datetime(train_pp['dt_of_fr']).dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
test_pp['month'] = pd.to_datetime(test_pp['dt_of_fr']).dt.month
test_pp['hour'] = pd.to_datetime(test_pp['dt_of_fr']).dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
val_pp['month'] = pd.to_datetime(val_pp['dt_of_fr']).dt.month
val_pp['hour'] = pd.to_datetime(val_pp['dt_of_fr']).dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# CONCAT DATA

In [10]:
train_except_fr_yn = train_pp.drop('fr_yn', axis=1)

In [11]:
train_except_fr_yn['tag'] = 'train'
test_pp['tag'] = 'test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
df = pd.concat([train_except_fr_yn, test_pp])

In [13]:
df.sample()

Unnamed: 0,id,dt_of_fr,bldng_cnt,bldng_ar,ttl_ar,lnd_ar,jmk,fr_sttn_dstnc,fr_wthr_fclt_dstnc,mlt_us_yn,...,cctv_in_100m,fr_wthr_fclt_in_100m,tbc_rtl_str_dstnc,sft_emrgnc_bll_dstnc,ahsm_dstnc,no_tbc_zn_dstnc,bldng_cnt_in_50m,month,hour,tag
26134,126587,2018-04-11 01:18:22,1,0.0,118.35,134.0,대,1075,5743,0,...,0,0,232,7837,766,209,0,4,1,train


In [14]:
df.shape

(62156, 21)

# MAKING DATASET

In [15]:
# choose independent variables(Xs) which are useful!
# 트레이닝에 사용할 변수 목록을 적어주세요.
independents = ['bldng_cnt', 'bldng_ar', 'ttl_ar', 'lnd_ar', 
#                 'jmk',
       'fr_sttn_dstnc', 'fr_wthr_fclt_dstnc', 'mlt_us_yn', 'cctv_dstnc',
       'cctv_in_100m', 'fr_wthr_fclt_in_100m', 'tbc_rtl_str_dstnc',
       'sft_emrgnc_bll_dstnc', 'ahsm_dstnc', 'no_tbc_zn_dstnc',
       'bldng_cnt_in_50m',  'month'] 
# 독립변수
dependent = ['fr_yn'] # 종속변수

In [16]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_X = df[df['tag'] == 'train'][independents]
train_y = train_pp[dependent]

test_X = df[df['tag'] == 'test'][independents]
val_X = val_pp[independents]

# MODEL FITTING

## 선형 회귀

In [17]:
reg = LinearRegression().fit(train_X, train_y)
# 대문자는 매트릭스, 소문자는 벡터

In [18]:
# x 계수
# x 계수의 의미를 해석하세요.
reg.coef_
# 피클래스가 한 계단 올라갈 때마다 죽음에 0.19배 가까워짐

array([[ 1.46096654e-03,  2.98387074e-06,  1.43195966e-07,
        -7.31062334e-09, -3.90604562e-06, -1.59676364e-05,
         3.76721867e-01,  3.91556052e-06,  4.90890359e-03,
         1.81215538e-02, -1.38894653e-06,  6.68506883e-07,
         2.54221356e-06, -3.15470967e-06,  3.72766000e-04,
        -2.05479150e-03]])

In [19]:
# 절편
reg.intercept_

array([0.14070614])

In [20]:
reg.score(train_X, train_y, sample_weight=None)

0.05455182781217504

## 일반화 선형모형

In [21]:
res = glm('fr_yn ~ bldng_cnt + bldng_ar + ttl_ar + lnd_ar +fr_sttn_dstnc + fr_wthr_fclt_dstnc + mlt_us_yn + cctv_dstnc + cctv_in_100m + fr_wthr_fclt_in_100m + tbc_rtl_str_dstnc + sft_emrgnc_bll_dstnc + ahsm_dstnc + no_tbc_zn_dstnc + bldng_cnt_in_50m + month', train_pp, family=Binomial()).fit()
res.summary()

  n_endog_mu = self._clean((1. - endog) / (1. - mu))
  resid_dev = endog * np.log(endog_mu) + (1 - endog) * np.log(n_endog_mu)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu)) +
  n * np.log(1 - mu)) * var_weights
  n * np.log(1 - mu)) * var_weights


0,1,2,3
Dep. Variable:,fr_yn,No. Observations:,59199.0
Model:,GLM,Df Residuals:,59182.0
Model Family:,Binomial,Df Model:,16.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,
Date:,"Fri, 18 Oct 2019",Deviance:,
Time:,23:50:23,Pearson chi2:,243000000.0
No. Iterations:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.7875,0.044,-40.375,0.000,-1.874,-1.701
bldng_cnt,-0.0022,0.002,-0.868,0.385,-0.007,0.003
bldng_ar,2.404e-05,5.35e-06,4.492,0.000,1.36e-05,3.45e-05
ttl_ar,2.592e-05,1.94e-06,13.349,0.000,2.21e-05,2.97e-05
lnd_ar,-2.665e-07,3.42e-08,-7.791,0.000,-3.34e-07,-1.99e-07
fr_sttn_dstnc,-2.834e-05,3.55e-06,-7.984,0.000,-3.53e-05,-2.14e-05
fr_wthr_fclt_dstnc,-0.0003,9.71e-06,-29.705,0.000,-0.000,-0.000
mlt_us_yn,1.7198,0.153,11.235,0.000,1.420,2.020
cctv_dstnc,6.097e-05,1.13e-05,5.381,0.000,3.88e-05,8.32e-05


## 최소제곱추정 선형회귀

In [22]:
res = ols('fr_yn ~ bldng_cnt + bldng_ar + ttl_ar + lnd_ar +fr_sttn_dstnc + fr_wthr_fclt_dstnc + mlt_us_yn + cctv_dstnc + cctv_in_100m + fr_wthr_fclt_in_100m + tbc_rtl_str_dstnc + sft_emrgnc_bll_dstnc + ahsm_dstnc + no_tbc_zn_dstnc + bldng_cnt_in_50m + month', train_pp).fit()
res.summary()

0,1,2,3
Dep. Variable:,fr_yn,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.054
Method:,Least Squares,F-statistic:,213.4
Date:,"Fri, 18 Oct 2019",Prob (F-statistic):,0.0
Time:,23:50:24,Log-Likelihood:,-17700.0
No. Observations:,59199,AIC:,35430.0
Df Residuals:,59182,BIC:,35590.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1407,0.005,31.026,0.000,0.132,0.150
bldng_cnt,0.0015,0.000,6.522,0.000,0.001,0.002
bldng_ar,2.984e-06,2.02e-07,14.753,0.000,2.59e-06,3.38e-06
ttl_ar,1.432e-07,1.82e-08,7.885,0.000,1.08e-07,1.79e-07
lnd_ar,-7.311e-09,7.95e-10,-9.191,0.000,-8.87e-09,-5.75e-09
fr_sttn_dstnc,-3.906e-06,3.62e-07,-10.785,0.000,-4.62e-06,-3.2e-06
fr_wthr_fclt_dstnc,-1.597e-05,5.72e-07,-27.933,0.000,-1.71e-05,-1.48e-05
mlt_us_yn,0.3767,0.024,15.774,0.000,0.330,0.424
cctv_dstnc,3.916e-06,1.13e-06,3.479,0.001,1.71e-06,6.12e-06

0,1,2,3
Omnibus:,21746.722,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59839.545
Skew:,2.05,Prob(JB):,0.0
Kurtosis:,5.728,Cond. No.,38400000.0


## 값 예측하기

In [23]:
prediction = res.predict(test_X)

In [24]:
sorted(prediction, reverse=True)

[1.734582308571622,
 1.73047272556347,
 0.7016243522487121,
 0.6624281818007873,
 0.5867363795881613,
 0.5820759235800305,
 0.581886034131555,
 0.5790322755710595,
 0.5748110964896079,
 0.5711764220488625,
 0.5530899267082159,
 0.5410385720522811,
 0.5129506209332799,
 0.47312483562392665,
 0.39404310541067605,
 0.38703438476349494,
 0.38703438476349494,
 0.3788152187471907,
 0.353639015461814,
 0.34413521935142566,
 0.3188939175595249,
 0.3093381880777406,
 0.2942015629158186,
 0.29335405557578,
 0.29294496139675263,
 0.28682975197843946,
 0.28041047785305,
 0.27912927467880333,
 0.2786105859621352,
 0.2768523382337378,
 0.27424610334082183,
 0.27302382084137655,
 0.27274275522558566,
 0.2699563452024547,
 0.2673927952202655,
 0.26656153605033817,
 0.2655591763536638,
 0.26465224070647747,
 0.2619173543163655,
 0.26153729838915646,
 0.2614541403300189,
 0.259628146706072,
 0.2575747683476269,
 0.25487680105309485,
 0.2544537501323935,
 0.2528220095490188,
 0.25248547054868187,
 0.2514

In [25]:
prediction_binomial = [1 if p>=0.3 else 0 for p in prediction]

# VALIDATION 파일 이용하기

In [82]:
prediction_val = res.predict(val_X)
sorted(prediction_val, reverse=True)

[1.7325275170675458,
 1.7284179340593937,
 1.7263631425553179,
 1.7181439765390136,
 1.7160891850349373,
 0.7180626842813207,
 0.7160078927772446,
 0.7160078927772446,
 0.7160078927772446,
 0.7057339352568643,
 0.7036791437527882,
 0.6583185987926352,
 0.6562638072885592,
 0.652154224280407,
 0.650099432776331,
 0.6480446412722549,
 0.6439350582641028,
 0.6134751064312031,
 0.5998602439938306,
 0.5916410779775264,
 0.5888985933150195,
 0.5834219119612222,
 0.5735620102912458,
 0.5734090218884814,
 0.5699761815337033,
 0.5625278086257941,
 0.557105874079598,
 0.5544943961873141,
 0.5542724701903399,
 0.5447187215898679,
 0.4990539955746361,
 0.4875083761524591,
 0.48545358464838306,
 0.483398793144307,
 0.48134400164023095,
 0.47928921013615483,
 0.39772355653155056,
 0.39772355653155056,
 0.3952535507797992,
 0.39361397352339844,
 0.389089176267571,
 0.38744959901117026,
 0.38744959901117026,
 0.3853948075070942,
 0.36185818147811827,
 0.3515842239577379,
 0.3515842239577379,
 0.344135

In [83]:
prediction_binomial_val = [1 if p>=0.3 else 0 for p in prediction_val]

# RANDOM FOREST

In [67]:
forest = RandomForestClassifier(random_state=42, n_estimators=10)

In [75]:
forest = forest.fit(train_X, train_y)

  """Entry point for launching an IPython kernel.


In [76]:
forest.feature_importances_

array([0.03456076, 0.10893222, 0.12893015, 0.06014744, 0.07839941,
       0.08808756, 0.001069  , 0.07115099, 0.01027784, 0.01015478,
       0.08107809, 0.08308581, 0.08131593, 0.08499369, 0.02806822,
       0.04974812])

In [77]:
predict_forest = forest.predict(val_X)

In [78]:
predict_forest

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

# F1 구하기

In [79]:
import sklearn.metrics as metrics

In [84]:
print('f1', metrics.f1_score(prediction_binomial_val,val_pp['fr_yn']) )

f1 0.07501875468867217


In [81]:
print('f1', metrics.f1_score(predict_forest,val_pp['fr_yn']) )

f1 0.3399289700659564


# 한 번에 모델 돌리기

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [32]:
def train_and_test(model):
    a = model.fit(train_X, train_y)
    prediction = a.predict(test_X)
    accuracy = round(model.score(train_X, train_y) * 100, 2)
    print("Accuracy : ", accuracy, "%")
    return prediction

In [33]:
log_pred = train_and_test(LogisticRegression())

  y = column_or_1d(y, warn=True)


Accuracy :  87.5 %


In [34]:
# svm_pred = train_and_test(SVC())

In [35]:
# knn_pred_4 = train_and_test(KNeighborsClassifier(n_neighbors = 4))

In [36]:
rf_pred = train_and_test(RandomForestClassifier(n_estimators=100))

  


Accuracy :  99.94 %


In [37]:
# nb_pred = train_and_test(GaussianNB())

# VALIDATION 한 번에 돌리기

In [85]:
def train_and_val(model):
    a = model.fit(train_X, train_y)
    prediction_val = a.predict(val_X)
    prediction_binomial_val = [1 if p>=0.3 else 0 for p in prediction_val]
    print('f1:', metrics.f1_score(prediction_binomial_val,val_pp['fr_yn']) )
    return prediction_val

In [86]:
train_and_val(LogisticRegression())

  y = column_or_1d(y, warn=True)


f1: 0.20277777777777778


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [87]:
train_and_val(RandomForestClassifier(n_estimators=100))

  


f1: 0.383634431455898


array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [88]:
train_and_val(KNeighborsClassifier(n_neighbors = 4))

  


f1: 0.31120552310143385


array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [89]:
train_and_val(GaussianNB())

f1: 0.20207612456747404


  y = column_or_1d(y, warn=True)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
train_and_val(SVC())

  y = column_or_1d(y, warn=True)
