### Import common module

In [None]:
import import_ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, f1_score, recall_score
from xgboost import XGBClassifier, plot_importance

import seaborn as sns

import matplotlib as mpl
from matplotlib import font_manager, rc
import os

# Use the Malgun Gothic font (presumably so Korean labels render in charts).
# The file only exists on Windows; on other machines keep matplotlib's default
# font instead of crashing the whole setup cell on the missing file.
_font_path = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(_font_path):
    font_name = font_manager.FontProperties(fname=_font_path).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    print('Font file not found, keeping the default matplotlib font:', _font_path)
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

# Widen pandas' display limits so long/wide frames are shown without clipping.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
# None means "never truncate cell contents". The older -1 spelling has been
# deprecated since pandas 1.0 and is no longer accepted by recent releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts an integer for this option.
    pd.set_option('display.max_colwidth', -1)

from traffic_common import get_category_age, get_category_season, get_category_time, drop_features, cleansing, \
                            encode_features, conv2XYarr, transform_dataframe, bar_chart, pie_chart

In [None]:
def getEvaluation(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary classification metrics.

    Accuracy, precision, recall and F1 are computed from ``pred`` with the
    scikit-learn metric functions (default arguments). ROC AUC is computed
    from ``pred_proba`` and is only reported when scores are supplied.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels. Required, even though the signature keeps a
        ``None`` default for backward compatibility.
    pred_proba : array-like, optional
        Scores handed to ``roc_auc_score``. When omitted, the AUC part of
        the report is skipped instead of failing.

    Raises
    ------
    ValueError
        If ``pred`` is not provided.
    """
    if pred is None:
        raise ValueError('pred (predicted labels) must be provided')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # AUC needs continuous scores; compute it before printing so a failure
    # here does not leave a half-printed report.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('confusion matrix..')
    print(confusion)
    summary = 'accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC: {0:.4f}'.format(roc_auc)
    print(summary)

In [None]:
def getEvaluationByThreshold(y_test, pred_proba, thresholds):
    """Print evaluation metrics for each candidate decision threshold.

    For every value in ``thresholds`` the scores are turned into 0/1
    predictions (1 when the score is strictly greater than the threshold,
    the same rule scikit-learn's ``Binarizer`` applies) and the result is
    reported through ``getEvaluation``.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred_proba : array-like
        Positive-class scores, either 1-D or a single ``(n, 1)`` column.
    thresholds : iterable of float
        Decision thresholds to evaluate, in the order given.
    """
    scores = np.asarray(pred_proba)
    # Accept the (n, 1) column layout as well as plain 1-D scores.
    if scores.ndim == 2 and scores.shape[1] == 1:
        scores = scores.ravel()

    for custom_threshold in thresholds:
        custom_predict = (scores > custom_threshold).astype(int)
        print('threshold:', custom_threshold)
        # The raw scores are passed along so the report can include AUC,
        # which does not depend on the threshold.
        getEvaluation(y_test, custom_predict, scores)

### Load dataset and Pre-processing

In [None]:
# Load the raw traffic dataset; the CSV file is read with the EUC-KR encoding.
traffic_df = pd.read_csv('dataset/seoul_traffic.csv', encoding='euc-kr')

In [None]:
# Run the shared preprocessing helper imported from traffic_common and keep
# its result as the working frame.
# NOTE(review): the exact transformations are not visible in this notebook —
# confirm in traffic_common.transform_dataframe.
traffic_df = transform_dataframe(traffic_df)

In [None]:
# Preview the first rows of the transformed frame as a quick sanity check.
traffic_df.head()

In [None]:
# The first column is used as the target; every remaining column is a feature.
Y = traffic_df.iloc[:, 0]
X = traffic_df.iloc[:, 1:]

In [None]:
# One-hot encode the feature frame.
X = pd.get_dummies(X)

# One-hot encode the target as well and keep its second indicator column
# as the label vector.
target_indicators = pd.get_dummies(Y)
Y = target_indicators.iloc[:, 1]

In [None]:
# Hold out a test set using scikit-learn's default split size; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

### XGBoost

In [None]:
# Base estimator with library defaults; the grid below supplies the values
# that are being tuned.
xgb_clf = XGBClassifier()

# Search space: 3 depths x 2 learning rates x 2 ensemble sizes = 12 candidates.
params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.2],
    n_estimators=[100, 200],
)

# Exhaustive cross-validated search scored by accuracy, using all CPU cores.
gridcv = GridSearchCV(
    xgb_clf,
    param_grid=params,
    scoring="accuracy",
    n_jobs=-1,
)
gridcv.fit(X_train, y_train)

print('GridSearchCV optimal parameters:', gridcv.best_params_)

In [None]:
# Tabulate the grid-search results with the best-ranked candidate first.
cv_result_df = pd.DataFrame(gridcv.cv_results_).sort_values(by=['rank_test_score'])

# Show only the headline columns for (up to) the 30 best candidates.
summary_columns = ['params', 'mean_test_score', 'rank_test_score']
cv_result_df[summary_columns].head(30)

In [None]:
# Train the final classifier with fixed hyper-parameters and a fixed seed.
xgb_clf = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=200,
    random_state=1,
)
xgb_clf.fit(X_train, y_train)

# Score the held-out set with ROC AUC, using column 1 of predict_proba
# (the second class) as the ranking score.
xgb_roc_score = roc_auc_score(
    y_test,
    xgb_clf.predict_proba(X_test)[:, 1],
    average='macro',
)
print('ROC_AUC: {0:.4f}'.format(xgb_roc_score))

In [None]:
# Keep both views of the test-set predictions for later evaluation:
# column 1 of predict_proba as scores, and the hard class labels.
predict_probabilities = xgb_clf.predict_proba(X_test)[:, 1]
predicts = xgb_clf.predict(X_test)

In [None]:
# Plot the importances of the fitted model, limited to the top 20 features.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(xgb_clf, max_num_features=20, height=0.4, ax=ax)