# LIBRARY

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
from statsmodels.genmod.families.family import Binomial
from sklearn.ensemble import RandomForestClassifier

In [2]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [3]:
import scipy as sp

In [4]:
# imbalanced data 해결

from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *

In [5]:
from matplotlib import font_manager, rc
[font for font in font_manager.fontManager.ttflist if 'Malgun Gothic' in font.name]
import seaborn as sns

plt.rcParams["font.family"] = 'NanumSquareRoundOTF'
plt.rcParams["font.size"] = 12

In [6]:
import warnings
warnings.filterwarnings('ignore')

# DATA

In [7]:
train = pd.read_csv('train_PCA_fill_ar.csv')
valid = pd.read_csv('valid_PCA_fill_ar.csv')
test = pd.read_csv('test_PCA_fill_ar.csv')

In [8]:
train.sample()

Unnamed: 0,address,address1,address2,ahsm_dstnc,bldng_ar,bldng_cnt,bldng_cnt_in_50m,cctv_dstnc,cctv_in_100m,day,...,lnd_us_sttn_nm_체,lnd_us_sttn_nm_과,lnd_us_sttn_nm_묘,lnd_us_sttn_nm_공,lnd_us_sttn_nm_원,lnd_us_sttn_nm_사,lnd_us_sttn_nm_제,lnd_us_sttn_nm_수,ttl_dwn_flr,ttl_grnd_flr
31416,창원시 봉림동,창원시,봉림동,118,[95.5],2,0,155,0,13,...,False,False,False,False,False,False,False,False,0.0,0.0


## f1 score 평가 함수

In [9]:
def evaluation(y, pred):
    confusion = confusion_matrix(y, pred)
    accuracy = accuracy_score(y, pred)
    precision = precision_score(y, pred)
    recall = recall_score(y, pred)
    f1 = f1_score(y, pred)
    roc_score = roc_auc_score(y, pred)
    
    print('정확도(accuracy): {0: .4f}, 정밀도(precision): {1: .4f}, 재현율(recall): {2: 4f}, f1 score: {3: .4f}, auc값: {4: .4f}'.format(accuracy, precision, recall, f1, roc_score))

## Outlier 제거

In [10]:
# 창원시 전체 화재발생 0이라 제외
train = train[train['address1'] != '창원시']

# MAKING DATASET

In [11]:
train.columns

Index(['address', 'address1', 'address2', 'ahsm_dstnc', 'bldng_ar',
       'bldng_cnt', 'bldng_cnt_in_50m', 'cctv_dstnc', 'cctv_in_100m', 'day',
       'dayofweek', 'fire_yes', 'fr_mn_cnt', 'fr_sttn_dstnc',
       'fr_wthr_fclt_dstnc', 'fr_wthr_fclt_in_100m', 'hm_cnt', 'hmdt', 'hour',
       'id', 'jmk', 'lnd_ar', 'minute', 'mlt_us_yn(encode)', 'month',
       'no_tbc_zn_dstnc', 'second', 'sft_emrgnc_bll_dstnc', 'tag',
       'tbc_rtl_str_dstnc', 'tmprtr', 'ttl_ar', 'wnd_drctn', 'wnd_spd', 'year',
       'year-month', 'year-month-day', 'cluster', 'Comp1', 'Comp2', 'jmk_주',
       'jmk_잡', 'jmk_종', 'jmk_차', 'jmk_답', 'jmk_장', 'jmk_양', 'jmk_전', 'jmk_창',
       'jmk_학', 'jmk_목', 'jmk_도', 'jmk_임', 'jmk_철', 'jmk_유', 'jmk_천', 'jmk_구',
       'jmk_체', 'jmk_과', 'jmk_묘', 'jmk_공', 'jmk_원', 'jmk_사', 'jmk_제', 'jmk_수',
       'lnd_us_sttn_nm(clean)', 'lnd_us_sttn_nm_주', 'lnd_us_sttn_nm_잡',
       'lnd_us_sttn_nm_종', 'lnd_us_sttn_nm_차', 'lnd_us_sttn_nm_답',
       'lnd_us_sttn_nm_장', 'lnd_us_sttn_nm_양

In [12]:
train['dayofweek(int)'] = pd.to_datetime(train['year-month-day']).dt.dayofweek
test['dayofweek(int)'] = pd.to_datetime(test['year-month-day']).dt.dayofweek
valid['dayofweek(int)'] = pd.to_datetime(valid['year-month-day']).dt.dayofweek

In [13]:
# choose independent variables(Xs) which are useful!
# 트레이닝에 사용할 변수 목록을 적어주세요.

float_col = ['year', 'month', 'dayofweek(int)', 'hour'
             , 'lnd_ar', 'ttl_ar', 'bldng_ar', 'ttl_dwn_flr', 'ttl_grnd_flr'
             , 'tmprtr', 'wnd_spd', 'hmdt', 'wnd_drctn'
             , 'Comp1', 'Comp2'
            , 'hm_cnt'
             
#              , 'no_tbc_zn_dstnc', 'bldng_cnt_in_50m', 'fr_wthr_fclt_in_100m', 'cctv_in_100m'
             , 'fr_sttn_dstnc', 'fr_mn_cnt', 'fr_wthr_fclt_dstnc'
             , 'cctv_dstnc', 'tbc_rtl_str_dstnc', 'sft_emrgnc_bll_dstnc', 'ahsm_dstnc']

bool_col =['lnd_us_sttn_nm_주', 'lnd_us_sttn_nm_잡', 'lnd_us_sttn_nm_종', 'lnd_us_sttn_nm_차', 'lnd_us_sttn_nm_답', 'lnd_us_sttn_nm_장', 'lnd_us_sttn_nm_양', 'lnd_us_sttn_nm_전', 'lnd_us_sttn_nm_창', 'lnd_us_sttn_nm_학', 'lnd_us_sttn_nm_목', 'lnd_us_sttn_nm_도', 'lnd_us_sttn_nm_임', 'lnd_us_sttn_nm_철', 'lnd_us_sttn_nm_유', 'lnd_us_sttn_nm_천', 'lnd_us_sttn_nm_구', 'lnd_us_sttn_nm_체', 'lnd_us_sttn_nm_과', 'lnd_us_sttn_nm_묘', 'lnd_us_sttn_nm_공', 'lnd_us_sttn_nm_원', 'lnd_us_sttn_nm_사', 'lnd_us_sttn_nm_제', 'lnd_us_sttn_nm_수'
          , 'jmk_주', 'jmk_잡', 'jmk_종', 'jmk_차', 'jmk_답', 'jmk_장', 'jmk_양', 'jmk_전', 'jmk_창', 'jmk_학', 'jmk_목', 'jmk_도', 'jmk_임', 'jmk_철', 'jmk_유', 'jmk_천', 'jmk_구', 'jmk_체', 'jmk_과', 'jmk_묘', 'jmk_공', 'jmk_원', 'jmk_사', 'jmk_제', 'jmk_수'
           , 'ttl_dwn_flr', 'ttl_grnd_flr'
          ]

independents = float_col + bool_col


# 문자열 제외 'dayofweek', 'year-month','year-month-day', 'address', 'address1', 'address2', 'jmk'
# 관련 없는 열 제외 'id', 'second'(second가 0인게 많아서 fire_yes와 상관관계 높음) 
# 'fr_wthr_fclt_in_100m','ahsm_dstnc' 지역간 편차가 큰 애들 제외
# 독립변수
dependent = ['fire_yes'] # 종속변수

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50749 entries, 0 to 59190
Data columns (total 94 columns):
address                  50749 non-null object
address1                 50749 non-null object
address2                 50749 non-null object
ahsm_dstnc               50749 non-null int64
bldng_ar                 50749 non-null object
bldng_cnt                50749 non-null int64
bldng_cnt_in_50m         50749 non-null int64
cctv_dstnc               50749 non-null int64
cctv_in_100m             50749 non-null int64
day                      50749 non-null int64
dayofweek                50749 non-null object
fire_yes                 50749 non-null float64
fr_mn_cnt                50749 non-null float64
fr_sttn_dstnc            50749 non-null int64
fr_wthr_fclt_dstnc       50749 non-null int64
fr_wthr_fclt_in_100m     50749 non-null int64
hm_cnt                   50749 non-null float64
hmdt                     50749 non-null float64
hour                     50749 non-null int64
id  

In [14]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_X = train[independents]
train_y = train[dependent]

test_X = test[independents]
valid_X = valid[independents]

In [15]:
# logistic

model_lr = LogisticRegression().fit(train_X, train_y)
pred_lr = model_lr.predict(valid_X)

print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

ValueError: could not convert string to float: '[1146.7]'

# Undersampling

### Random Under Sampling

In [None]:
sample_X, sample_y = RandomUnderSampler(random_state=17).fit_sample(train_X, train_y)

print(sample_X.shape)

In [None]:
data = pd.DataFrame(sample_X, columns=independents)

for x in float_col:
    data[x] = data[x].apply(lambda x: float(x))
    
for x in bool_col:
    data[x] = data[x].apply(lambda x: True if x == 'True' else False)

data.dtypes

In [None]:
# XGB 모델 예측, 검증
model_xgb = XGBClassifier(verbose = True).fit(data, pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

print(classification_report(valid['fire_yes'], pred_xgb))

evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

### Tomek’s link method

In [None]:
sample_X, sample_y = TomekLinks(random_state=17).fit_sample(train_X, train_y)

print(sample_X.shape)

# max_depth=10, n_estimator=50  learning_rate=0.05

In [None]:
# XGB 모델 예측, 검증
# 최적값 max_depth = 5, n_estimator=50 0.4451
# 최적값 max_depth = 5, n_estimator=100 0.4510
# 최적값 max_depth = 5, n_estimator=130 0.4506
# 최적값 max_depth = 5, n_estimator=148 0.4524
# 최적값 max_depth = 5, n_estimator=149 0.4532 <
# 최적값 max_depth = 5, n_estimator=150 0.4524
# 최적값 max_depth = 5, n_estimator=155 0.4517
# 최적값 max_depth = 5, n_estimator=170 0.4502
# 최적값 max_depth = 5, n_estimator=200 0.4471


# 최적값 max_depth = 6, n_estimator=149 0.4485 
# 최적값 max_depth = 5, n_estimator=149 0.4532 <
# 최적값 max_depth = 4, n_estimator=149 0.4531

# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.1 0.4510
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.07 0.4531 
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.065 0.4561
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.061 0.4585
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.06 0.4595 <<<
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.059 0.4537
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.055 0.4576
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.05 0.4532
# 최적값 max_depth = 4, n_estimator=149, learning_rate=0.01 0.4348

# 최적값 max_depth =5, n_estimator=149, learning_rate=0.06 0.4533

# n_estimator 는 작아질수록 f1 score 올라감
model_xgb = XGBClassifier(max_depth=5, n_estimators=149, learning_rate=0.06, verbosity = 1).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

print(classification_report(valid['fire_yes'], pred_xgb))

evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

### Condensed Nearest Neighbour

In [None]:
# 오래걸린다
# sample_X, sample_y = CondensedNearestNeighbour(random_state=17).fit_sample(train_X, train_y)

# print(sample_X.shape)

# # XGB 모델 예측, 검증
# model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

# pred_xgb = model_xgb.predict(valid_X)

# print(classification_report(valid['fire_yes'], pred_xgb))

# evaluation(valid['fire_yes'], pred_xgb)

### One Sided Selection
Tomek's Link + Condensed Nearest Neighbour

In [None]:
# sampling
sample_X, sample_y = OneSidedSelection(random_state=17).fit_sample(train_X, train_y)

print(sample_X.shape)

# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

## OverSampling

### Random Over Sampling

In [None]:
# sampling
sample_X, sample_y = RandomOverSampler(random_state=17).fit_sample(train_X, train_y)
print(sample_X.shape)
# dtype 맞춰주기
data = pd.DataFrame(sample_X, columns=independents)

for x in float_col:
    data[x] = data[x].apply(lambda x: float(x))
    
for x in bool_col:
    data[x] = data[x].apply(lambda x: True if x == 'True' else False)

data.dtypes

# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(data, pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

### ADASYN

In [None]:
# sampling
sample_X, sample_y = ADASYN(random_state=17).fit_sample(train_X, train_y)
print(sample_X.shape)

# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

### SMOTE

In [None]:
# sampling
sample_X, sample_y = SMOTE(random_state=17).fit_sample(train_X, train_y)
print(sample_X.shape)
# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
print(classification_report(valid['fire_yes'], pred_lr))

evaluation(valid['fire_yes'], pred_lr)

## 복합 샘플링

### SMOTE + ENN

In [None]:
# sampling
sample_X, sample_y = SMOTEENN(random_state=17).fit_sample(train_X, train_y)
print(sample_X.shape)
# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
evaluation(valid['fire_yes'], pred_lr)

### SMOTE + Tomek

In [None]:
# sampling
sample_X, sample_y = SMOTETomek(random_state=17).fit_sample(train_X, train_y)
print(sample_X.shape)
# XGB 모델 예측
model_xgb = XGBClassifier(verbose = True).fit(pd.DataFrame(sample_X, columns=independents), pd.DataFrame(sample_y)[0])

pred_xgb = model_xgb.predict(valid_X)

# Confusion Matrix
print(classification_report(valid['fire_yes'], pred_xgb))

# 검증
evaluation(valid['fire_yes'], pred_xgb)

In [None]:
# logistic

model_lr = LogisticRegression().fit(sample_X, sample_y)
pred_lr = model_lr.predict(valid_X)
evaluation(valid['fire_yes'], pred_lr)

# 1. XGBoost & ROC 커브

In [None]:
# model_xgb = XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, verbose = True).fit(train_X, train_y)
model_xgb = XGBClassifier().fit(train_X, train_y) 
# 좋은 파라미터를 찾아봐야할 것 같음

In [None]:
model_xgb.feature_importances_

### column 중요도

In [None]:
feature_importance = list(model_xgb.feature_importances_)
key = [ i for i in range(1, 23)]
dictionary = dict(zip(key, feature_importance))

In [None]:
plt.bar(dictionary.keys(), dictionary.values())
plt.show()

3, 6, 8, 13, 16, 19 열이 화재 발생과 관련이 있음

In [None]:
# 검증
pred_xgb = model_xgb.predict(valid_X)

print(classification_report(valid['fire_yes'], pred_xgb))

evaluation(valid['fire_yes'], pred_xgb)

### ROC 그리기

In [None]:
fpr, tpr, thresholds = roc_curve(valid['fire_yes'], model_xgb.predict_proba(valid_X)[:, 1])

In [None]:
plt.plot(fpr, tpr, 'o-', ms=2, label="XGBoost")
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.legend()
plt.xlabel('위양성률(Fall-Out)')
plt.ylabel('재현률(Recall)')
plt.title('ROC curve')
plt.show()

In [None]:
auc(fpr1, tpr1) # ROC 커브 면적

# XGBoost DMatrix

In [None]:
dtrain = xgb.DMatrix(train_X, label=train_y)

In [None]:
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }

In [None]:
plst = param.items()

In [None]:
bst = xgb.train(plst,dtrain,)

In [None]:
dtest= xgb.DMatrix(valid_X)

In [None]:
ypred = bst.predict(dtest)

In [None]:
prediction_binomial_val = [1 if p>=0.4 else 0 for p in ypred]

In [None]:
print('f1', metrics.f1_score(prediction_binomial_val,valid['fire_yes']) )

In [None]:
evaluation(valid['fire_yes'], prediction_binomial_val)

# 클러스터링한 데이터로 모델링

In [None]:
train_c = pd.read_csv("train_3rd_cluster.csv")
test_c = pd.read_csv("test_3rd_cluster.csv")
valid_c = pd.read_csv("valid_3rd_cluster.csv")

In [None]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_c_X = train_c[independents]
train_c_y = train_c[dependent]

test_c_X = test_c[independents]
valid_c_X = valid_c[independents]

In [None]:
dtrain_c = xgb.DMatrix(train_c_X, label=train_c_y)

In [None]:
bst_c = xgb.train(plst,dtrain_c,)

In [None]:
dtest_c= xgb.DMatrix(valid_c_X)

In [None]:
ypred_c = bst_c.predict(dtest_c)

In [None]:
prediction_binomial_val_c = [1 if p>=0.4 else 0 for p in ypred_c]

In [None]:
evaluation(valid_c['fire_yes'], prediction_binomial_val_c)

# PCA한 걸로 모델링

In [None]:
train_PCA = pd.read_csv("train_PCA.csv")
valid_PCA = pd.read_csv("valid_PCA.csv")

In [None]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_PCA_X = train_PCA[independents]
train_PCA_y = train_PCA[dependent]

valid_PCA_X = valid_PCA[independents]

In [None]:
dtrain_PCA = xgb.DMatrix(train_PCA_X, label=train_PCA_y)
bst_PCA = xgb.train(plst,dtrain_PCA,)
dtest_PCA= xgb.DMatrix(valid_PCA_X)
ypred_PCA = bst_PCA.predict(dtest_PCA)

In [None]:
prediction_binomial_val_PCA = [1 if p>=0.4 else 0 for p in ypred_PCA]

In [None]:
evaluation(valid_PCA['fire_yes'], prediction_binomial_val_PCA)

# jmk 추가 클러스터링

In [None]:
train_c = pd.read_csv("train_3rd_cluster_jmk.csv")
test_c = pd.read_csv("test_3rd_cluster_jmk.csv")
valid_c = pd.read_csv("valid_3rd_cluster_jmk.csv")

In [None]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_c_X = train_c[independents]
train_c_y = train_c[dependent]

test_c_X = test_c[independents]
valid_c_X = valid_c[independents]

In [None]:
dtrain_c = xgb.DMatrix(train_c_X, label=train_c_y)

In [None]:
bst_c = xgb.train(plst,dtrain_c,)

In [None]:
dtest_c= xgb.DMatrix(valid_c_X)

In [None]:
ypred_c = bst_c.predict(dtest_c)

In [None]:
prediction_binomial_val_c = [1 if p>=0.4 else 0 for p in ypred_c]

In [None]:
evaluation(valid_c['fire_yes'], prediction_binomial_val_c)

## 0과 2 군집 각각 나누어서 진행

In [None]:
train_c_0 = train_c[train_c['cluster']==0]
train_c_1 = train_c[train_c['cluster']==1]

In [None]:
valid_c_0 = valid_c[valid_c['cluster']==0]
valid_c_1 = valid_c[valid_c['cluster']==1]

In [None]:
# 적어준 변수 목록을 사용해 데이터를 트레이닝에 맞는 포맷으로 변경합니다.
train_c_0_X = train_c_0[independents]
train_c_1_X = train_c_1[independents]
train_c_0_y = train_c_0[dependent]
train_c_1_y = train_c_1[dependent]

valid_c_0_X = valid_c_0[independents]
valid_c_1_X = valid_c_1[independents]

In [None]:
dtrain_c_0 = xgb.DMatrix(train_c_0_X, label=train_c_0_y)
dtrain_c_1 = xgb.DMatrix(train_c_1_X, label=train_c_1_y)

In [None]:
bst_c_0 = xgb.train(plst,dtrain_c_0,)
bst_c_1 = xgb.train(plst,dtrain_c_1,)
dtest_c_0= xgb.DMatrix(valid_c_0_X)
dtest_c_1= xgb.DMatrix(valid_c_1_X)
ypred_c_0 = bst_c_0.predict(dtest_c_0)
ypred_c_1 = bst_c_1.predict(dtest_c_1)

In [None]:
prediction_binomial_val_c_0 = [1 if p>=0.11 else 0 for p in ypred_c_0]
prediction_binomial_val_c_1 = [1 if p>=0.3 else 0 for p in ypred_c_1]

In [None]:
evaluation(valid_c_0['fire_yes'], prediction_binomial_val_c_0)

In [None]:
evaluation(valid_c_1['fire_yes'], prediction_binomial_val_c_1)

In [None]:
concat_0 = pd.DataFrame(prediction_binomial_val_c_0, valid_c_0['id']).reset_index()

In [None]:
concat_1 = pd.DataFrame(prediction_binomial_val_c_1, valid_c_1['id']).reset_index()

In [None]:
concat_id = pd.concat([concat_0, concat_1])

In [None]:
valid_c_fire = valid_c.merge(concat_id, on ='id')

In [None]:
valid_c_fire = valid_c_fire.rename(columns={0: 'fire_pred'})

In [None]:
evaluation(valid_c_fire['fire_pred'], valid_c_fire['fire_yes'])

# XGBoost 성능 비교
max_depth 의사결정나무 모형 깊이 초모수를 달리해서 XGBoost 예측모형의 성능을 비교해본다.

In [None]:
f1_score_list = []

max_depth_list = [10, 15]

In [None]:
for max_depth in max_depth_list:
    xgb_model = XGBClassifier(max_depth=max_depth)
    xgb_pred = xgb_model.fit(train_X, train_y).predict(valid_X)
    xgb_f1_score = f1_score(xgb_pred,valid['fr_yn'])
    f1_score_list.append(xgb_f1_score)

In [None]:
xgb_df = pd.DataFrame({'tree depth':max_depth_list, 'f1_score':f1_score_list})
xgb_df.head()

# XGBoost 시각화
XGBoost 모형을 시각화함으로써 개발한 예측모형의 성능에 대해 더 깊은 이해를 가질 수 있다. xgb.plot_importance() 메쏘드에 XGBoost 모형객체를 넣어 변수중요도를 파악할 수 있다.

In [None]:
xgb.plot_importance(model_xgb)

## 클러스터링한 걸로 XGBoost 시각화

In [None]:
model_xgb_c = XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, verbose = True).fit(train_c_X, train_c_y)

In [None]:
xgb.plot_importance(model_xgb_c)

# 2. 로지스틱 회귀분석 & ROC 커브

In [None]:
model_lr = LogisticRegression().fit(train_X, train_y)

In [None]:
pred_lr = model_lr.predict(valid_X)
# prediction2 = model2.predict(valid_X)

In [None]:
pred_lr

In [None]:
print('f1', metrics.f1_score(pred_lr,valid['fire_yes']) )

In [None]:
evaluation(valid['fire_yes'], pred_lr)

In [None]:
fpr1, tpr1, thresholds1 = roc_curve(valid['fire_yes'], model_lr.predict_proba(valid_X)[:, 1])

In [None]:
plt.plot(fpr1, tpr1, 'o-', ms=2, label="Logistic Regression")
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.legend()
plt.xlabel('위양성률(Fall-Out)')
plt.ylabel('재현률(Recall)')
plt.title('ROC curve')
plt.show()

In [None]:
auc(fpr1, tpr1) # ROC 커브 면적

## 선형 회귀

In [None]:
model_reg = LinearRegression().fit(train_X, train_y)
# 대문자는 매트릭스, 소문자는 벡터

In [None]:
# x 계수
# x 계수의 의미를 해석하세요.
model_reg.coef_
# 피클래스가 한 계단 올라갈 때마다 죽음에 0.19배 가까워짐

In [None]:
# 절편
model_reg.intercept_

In [None]:
reg.score(train_X, train_y, sample_weight=None)

In [None]:
pred_reg = model_reg.predict(valid_X)

In [None]:
pred_binomial = [1 if p>=0.2 else 0 for p in pred_reg]

In [None]:
print('f1', metrics.f1_score(pred_binomial,valid['fr_yn']) )

# RANDOM FOREST

In [None]:
model_forest = RandomForestClassifier(random_state=42, n_estimators=10).fit(train_X, train_y)

In [None]:
model_forest.feature_importances_

In [None]:
pred_forest = model_forest.predict(valid_X)

In [None]:
pred_forest

In [None]:
print('f1', metrics.f1_score(pred_forest,valid['fr_yn']) )

In [None]:
evaluation(valid['fr_yn'], pred_forest)

# 클러스터링한 걸로 모델링

In [None]:
model_forest_c = RandomForestClassifier(random_state=42, n_estimators=10).fit(train_c_X, train_c_y)

In [None]:
pred_forest_c = model_forest_c.predict(valid_c_X)

In [None]:
evaluation(valid_c['fire_yes'], pred_forest_c)

# VALIDATION 한 번에 돌리기

In [None]:
def train_and_val(model, num):
    a = model.fit(train_X, train_y)
    prediction_val = a.predict(valid_X)
    prediction_binomial_val = [1 if p>=num else 0 for p in prediction_val]
    print('f1:', f1_score(prediction_binomial_val,valid['fr_yn']) )
    return prediction_val

In [None]:
train_and_val(LogisticRegression(), 0.5)

In [None]:
train_and_val(RandomForestClassifier(n_estimators=700, oob_score=True, max_depth=12, min_samples_leaf=16, min_samples_split=8, n_jobs=-1), 0.5)

In [None]:
train_and_val(RandomForestClassifier(random_state=10, n_estimators=100, max_depth = 15), 0.5)

In [None]:
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 16],
    'min_samples_split': [8, 16, 24]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(train_X, train_y)

print("최적 하이퍼 파라미터: ", grid_cv.best_params_)
print("최고 예측 정확도: ", grid_cv.best_score_)

In [None]:
params = {
    'n_estimators': [100, 300, 500]
}
rf_clf_1 = RandomForestClassifier(max_depth=12, min_samples_leaf=16, min_samples_split=8, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf_1, param_grid=params, cv = 3, n_jobs=-1)
grid_cv.fit(train_X, train_y)
print("최적 파라미터: ", grid_cv.best_params_)
print("최고 예측 정확도: ", grid_cv.best_score_)
best_model = grid_cv.best_estimator_
pred = best_model.predict(valid_X)
f1 = f1_score(valid['fire_yes'], pred)
print("f1 score: {0: .4f}".format(f1))

In [None]:
train_and_val(KNeighborsClassifier(n_neighbors = 4), 0.1)

In [None]:
train_and_val(GaussianNB(), 0.5)

In [None]:
# train_and_val(SVC())

# 클러스터링한 애들로 한 번에 돌리기

In [None]:
def train_and_val_c(model, num):
    a = model.fit(train_c_X, train_c_y)
    prediction_val = a.predict(valid_c_X)
    prediction_binomial_val = [1 if p>=num else 0 for p in prediction_val]
    print('f1:', f1_score(prediction_binomial_val,valid['fr_yn']) )
    return prediction_val

In [None]:
train_and_val_c(LogisticRegression(), 0.5)

In [None]:
train_and_val_c(RandomForestClassifier(n_estimators=700, oob_score=True, max_depth=12, min_samples_leaf=16, min_samples_split=8, n_jobs=-1), 0.5)

In [None]:
train_and_val_c(RandomForestClassifier(random_state=10, n_estimators=100, max_depth = 15), 0.5)

In [None]:
train_and_val_c(KNeighborsClassifier(n_neighbors = 4), 0.1)

In [None]:
train_and_val_c(GaussianNB(), 0.5)