In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix,classification_report
import tensorflow as tf
import joblib
%matplotlib inline

np.random.seed(3)
tf.random.set_seed(3)

In [2]:
# 건강 보험 공단 딥러닝, SVM, RF 결과 저장한npy
pre_df = np.load('ensemble_x.npy')
y_cate = np.load('y_cate.npy')
y = np.load('y.npy')

pre_df

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

## Stacking Ensemble

In [9]:
from sklearn import metrics

In [10]:
from tensorflow.keras.layers import Input,Dense,SimpleRNN, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L1, L2,L1L2
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold

In [11]:
METRICS = [ 
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [12]:
#, kernel_regularizer=l2(0.02)
model = Sequential()
model.add(Dense(512,activation="relu",  input_dim=3))
model.add(Dense(256,activation="relu", kernel_regularizer=L2(0.001)))
model.add(Dense(128,activation="relu"))
model.add(Dense(64,activation="relu", kernel_regularizer=L2(0.001)))
model.add(Dense(32,activation="relu"))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=L2(0.001)))
    
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=METRICS)

In [13]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='auc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [14]:
from sklearn.model_selection import StratifiedKFold,KFold 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [18]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='auc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [16]:
kfold = StratifiedKFold(n_splits=5) 
cnt_iter=0
cv_accuracy=[]
AUC=[]

In [19]:
for train, test in kfold.split(pre_df,y):
    cnt_iter+=1
    
    X_train, X_test = pre_df[train], pre_df[test]
    y_train, y_test = y[train], y[test]
    #y_train1, y_test1 = y[train], y[test]

    model.fit(X_train,y_train,epochs=100,batch_size=128,verbose=0,callbacks=[early_stopping])#,validation_split=0.2,class_weight=class_dict)
    pred = model.predict_classes(X_test)

    acc = np.round(accuracy_score(y_test,pred),4)
    print(acc)
    
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    

    fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)) #곡선 그리기

    score = metrics.auc(fpr, tpr) #면적 구하기
    AUC.append(score)
    print(confusion_matrix(y_test,pred,labels=[1,0]))
    print(classification_report(y_test,pred))
    print('\n AUC score : {0}'.format(score))
    print('\n#{0} 교차 검증 정확도 : {1}, 학습 데이터 크기 : {2}, 검증 데이터 크기 : {3}'.format(cnt_iter,acc,train_size,test_size))
    cv_accuracy.append(acc)

Restoring model weights from the end of the best epoch.
Epoch 00028: early stopping




0.9169




[[ 651   51]
 [ 557 6056]]
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      6613
           1       0.54      0.93      0.68       702

    accuracy                           0.92      7315
   macro avg       0.77      0.92      0.82      7315
weighted avg       0.95      0.92      0.93      7315


 AUC score : 0.9436884656527783

#2 교차 검증 정확도 : 0.9169, 학습 데이터 크기 : 29260, 검증 데이터 크기 : 7315
Restoring model weights from the end of the best epoch.
Epoch 00029: early stopping




0.9137




[[ 658   44]
 [ 587 6026]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      6613
           1       0.53      0.94      0.68       702

    accuracy                           0.91      7315
   macro avg       0.76      0.92      0.81      7315
weighted avg       0.95      0.91      0.92      7315


 AUC score : 0.9398908004306462

#3 교차 검증 정확도 : 0.9137, 학습 데이터 크기 : 29260, 검증 데이터 크기 : 7315
Restoring model weights from the end of the best epoch.
Epoch 00030: early stopping




0.909




[[ 657   46]
 [ 620 5992]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      6612
           1       0.51      0.93      0.66       703

    accuracy                           0.91      7315
   macro avg       0.75      0.92      0.81      7315
weighted avg       0.95      0.91      0.92      7315


 AUC score : 0.9370019078205152

#4 교차 검증 정확도 : 0.909, 학습 데이터 크기 : 29260, 검증 데이터 크기 : 7315
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping




0.9117




[[ 657   46]
 [ 600 6012]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      6612
           1       0.52      0.93      0.67       703

    accuracy                           0.91      7315
   macro avg       0.76      0.92      0.81      7315
weighted avg       0.95      0.91      0.92      7315


 AUC score : 0.9376753891153549

#5 교차 검증 정확도 : 0.9117, 학습 데이터 크기 : 29260, 검증 데이터 크기 : 7315
Restoring model weights from the end of the best epoch.
Epoch 00021: early stopping




0.9109




[[ 657   46]
 [ 606 6006]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      6612
           1       0.52      0.93      0.67       703

    accuracy                           0.91      7315
   macro avg       0.76      0.92      0.81      7315
weighted avg       0.95      0.91      0.92      7315


 AUC score : 0.9382680870764737

#6 교차 검증 정확도 : 0.9109, 학습 데이터 크기 : 29260, 검증 데이터 크기 : 7315


In [20]:
# 평균 AUC

print(sum(cv_accuracy)/5)

0.9124399999999999


* StratifiedKFold을 사용함으로써 불균형 데이터를 균등하게 나눠 교차 검증 가능

* 기존에 사용했던 class_weight를 통한 레이블에 부여하는 가중치를 없애고 교차검증 실시