In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix,classification_report
import tensorflow as tf
import joblib
%matplotlib inline

np.random.seed(3)
tf.random.set_seed(3)

In [2]:
pre = pd.read_csv('kd_data_1226.csv')

pre_df = pd.DataFrame(pre)

pre_df

p_list = pre_df['person_id']

## class_weight

In [3]:
from sklearn.utils import class_weight

## Split

In [4]:
from sklearn.model_selection import train_test_split

## Stacking Ensemble

In [5]:
from sklearn import metrics

In [6]:
from tensorflow.keras.layers import Input,Dense,SimpleRNN, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L1, L2,L1L2
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from tensorflow.keras.optimizers import Adam, RMSprop

In [7]:
METRICS = [ 
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [8]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='auc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [9]:
from sklearn.metrics import roc_curve, auc, roc_auc_score  # ROC곡선 그리기

from sklearn.model_selection import learning_curve, validation_curve # 학습곡선, 검증곡선
from sklearn.model_selection import  cross_val_score, cross_val_predict  # 하이퍼파라미터 튜닝, 교차타당도

## Application - KD

In [10]:
from tensorflow.keras.optimizers import RMSprop,Adam

In [11]:
pre_df.columns = ['person_id', 'gender', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5',
       'age_6', 'age_7', 'age_8', 'smoking_status_0.0', 'smoking_status_1.0',
       'smoking_status_2.0', 'smoking_status_3.0', 'bmi_1.0', 'bmi_2.0', 'bmi_3.0',
       'bmi_4.0', 'Malignant_neoplastic_disease', 'Chronic_liver_disease',
       'chronic_obstructive_lung_disease', 'cerebrovascular_disease',
       'chronic_kidney_disease', 'Diabetes_mellitus', 'Ischemic_heart_disease',
       'hyperlipidemia', 'Hypertensive_disorder', 'cancer'] 

pre_df = pre_df.astype('int64')

In [12]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('cancer')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)

    return ds

# layer와 feature를 나누고 섞고(shuffle), batch_size로 생성하는 과정

In [13]:
train, test = train_test_split(pre_df, test_size=0.3, random_state=3)
train_ds= df_to_dataset(train, batch_size=len(train),shuffle=False)
test_ds= df_to_dataset(test, batch_size=len(test),shuffle=False)

In [14]:
all_ds = df_to_dataset(pre_df, batch_size=len(pre_df),shuffle=False)

In [15]:
print(Counter(pre_df['cancer']))

Counter({0: 320, 1: 89})


In [16]:
from tensorflow.keras.utils import to_categorical

In [17]:
Y_cate = to_categorical(pre_df['cancer'])
X_data = pre_df.iloc[:,1:27]

In [18]:
X_data

Unnamed: 0,gender,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,smoking_status_0.0,...,bmi_4.0,Malignant_neoplastic_disease,Chronic_liver_disease,chronic_obstructive_lung_disease,cerebrovascular_disease,chronic_kidney_disease,Diabetes_mellitus,Ischemic_heart_disease,hyperlipidemia,Hypertensive_disorder
0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
405,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
406,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
407,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X_data,Y_cate, test_size=0.3, random_state=3)

In [20]:
training, testing = train_test_split(pre_df['cancer'], test_size=0.3, random_state=3)

In [21]:
# test

from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',np.unique(pre_df['cancer']),pre_df['cancer'])

class_weights 

1      1
2      1
3      1
4      1
      ..
404    0
405    0
406    0
407    0
408    0
Name: cancer, Length: 409, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


array([0.6390625 , 2.29775281])

In [22]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [23]:
clf_labels = ['SVM','Random Forest']

pred = []

for label in clf_labels:
    model = joblib.load(label+'.pkl') # 저장한 모델 불러오기
    model = model.fit(X_train,training)
    Y_pred = model.predict(X_test)
    Y_pred = [ 1 if x >= 0.5 else 0 for x in Y_pred]
    print(classification_report(testing,Y_pred))
    fpr, tpr, thresholds =roc_curve(testing, model.predict_proba(X_test)[:,1])
    print(auc(fpr, tpr))
    real_pred = model.predict(X_data)
    real_pred = [ 1 if x >= 0.5 else 0 for x in real_pred]
    pred.append(real_pred)

              precision    recall  f1-score   support

           0       0.94      0.68      0.79        95
           1       0.44      0.86      0.59        28

    accuracy                           0.72       123
   macro avg       0.69      0.77      0.69       123
weighted avg       0.83      0.72      0.75       123

0.7477443609022557
              precision    recall  f1-score   support

           0       0.94      0.66      0.78        95
           1       0.43      0.86      0.57        28

    accuracy                           0.71       123
   macro avg       0.68      0.76      0.67       123
weighted avg       0.82      0.71      0.73       123

0.875187969924812


In [24]:
[(_, train_label)] = train_ds.take(1)
[(_, test_label)] = test_ds.take(1)

In [25]:
[(_, all_label)] = all_ds.take(1)

In [26]:
gbd_model = tf.keras.models.load_model('gbd_0108')

In [27]:
class_weights = {0:class_weights[0], 1:class_weights[1]}

In [28]:
gbd_model.compile(optimizer=Adam(learning_rate=0.005),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=METRICS) # 기존 건보단 모델에서 optimizer의 학습률을 조정해 새로 학습

In [29]:
history = gbd_model.fit(train_ds, epochs=100,callbacks=[early_stopping],class_weight = class_weights)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Restoring model weights from the end of the best epoch.
Epoch 00067: early stopping


In [30]:
D_pred = gbd_model.predict(test_ds)
D_pred = [ 1 if x >= 0.5 else 0 for x in D_pred]

print(confusion_matrix(test_label,D_pred,labels=[1,0]))
print(classification_report(test_label,D_pred))
results = gbd_model.evaluate(test_ds)
        
for name, value in zip(gbd_model.metrics_names, results):
    print(name, ': ', value)
       
real_pred = gbd_model.predict(all_ds)
real_pred = [ 1 if x >= 0.5 else 0 for x in real_pred]
pred.append(real_pred)

[[25  3]
 [25 70]]
              precision    recall  f1-score   support

           0       0.96      0.74      0.83        95
           1       0.50      0.89      0.64        28

    accuracy                           0.77       123
   macro avg       0.73      0.81      0.74       123
weighted avg       0.85      0.77      0.79       123





loss :  0.42386916279792786
tp :  25.0
fp :  25.0
tn :  70.0
fn :  3.0
accuracy :  0.772357702255249
precision :  0.5
recall :  0.8928571343421936
auc :  0.8962405920028687
prc :  0.737336277961731


In [31]:
kd_data = np.array(pred)
print(kd_data.shape)

kd_data = np.transpose(kd_data)
print(kd_data.shape)

(3, 409)
(409, 3)


## KD Ensemble model 생성 
* 교차검증 실시

In [33]:
from sklearn.model_selection import StratifiedKFold,KFold 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [34]:
kfold = StratifiedKFold(n_splits=5) 
cnt_iter=0
cv_accuracy=[]
AUC=[]

In [36]:
em_y = np.array(pre_df['cancer'])

In [37]:
#, kernel_regularizer=l2(0.02)
model = Sequential()
model.add(Dense(32,activation="relu",  input_dim=3))
model.add(Dropout(0.01))
model.add(Dense(16,activation="relu", kernel_regularizer=L2(0.01)))
model.add(Dense(10,activation="relu", kernel_regularizer=L2(0.01)))
model.add(Dense(1, activation='sigmoid'))
    
model.compile(optimizer=RMSprop(learning_rate=0.005),loss='binary_crossentropy',metrics=METRICS)

In [38]:
for train, test in kfold.split(kd_data,em_y):
    cnt_iter+=1
    
    X_train, X_test = kd_data[train], kd_data[test]
    y_train, y_test = em_y[train], em_y[test]

    model.fit(X_train,y_train,epochs=100,batch_size=64,verbose=0,callbacks=[early_stopping])#,validation_split=0.2,class_weight=class_dict)
    pred = model.predict_classes(X_test)

    acc = np.round(accuracy_score(y_test,pred),4)
    print(acc)
    
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    

    fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)) #곡선 그리기

    score = metrics.auc(fpr, tpr) #면적 구하기
    AUC.append(score)
    print(confusion_matrix(y_test,pred,labels=[1,0]))
    print(classification_report(y_test,pred))
    print('\n AUC score : {0}'.format(score))
    print('\n#{0} 교차 검증 정확도 : {1}, 학습 데이터 크기 : {2}, 검증 데이터 크기 : {3}'.format(cnt_iter,acc,train_size,test_size))
    cv_accuracy.append(acc)

Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
0.8902
[[15  3]
 [ 6 58]]
              precision    recall  f1-score   support

           0       0.95      0.91      0.93        64
           1       0.71      0.83      0.77        18

    accuracy                           0.89        82
   macro avg       0.83      0.87      0.85        82
weighted avg       0.90      0.89      0.89        82


 AUC score : 0.9375

#1 교차 검증 정확도 : 0.8902, 학습 데이터 크기 : 327, 검증 데이터 크기 : 82




Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
0.7805
[[ 0 18]
 [ 0 64]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.88        64
           1       0.00      0.00      0.00        18

    accuracy                           0.78        82
   macro avg       0.39      0.50      0.44        82
weighted avg       0.61      0.78      0.68        82


 AUC score : 0.8051215277777778

#2 교차 검증 정확도 : 0.7805, 학습 데이터 크기 : 327, 검증 데이터 크기 : 82


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
0.8415
[[16  2]
 [11 53]]
              precision    recall  f1-score   support

           0       0.96      0.83      0.89        64
           1       0.59      0.89      0.71        18

    accuracy                           0.84        82
   macro avg       0.78      0.86      0.80        82
weighted avg       0.88      0.84      0.85        82


 AUC score : 0.8936631944444444

#3 교차 검증 정확도 : 0.8415, 학습 데이터 크기 : 327, 검증 데이터 크기 : 82




Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
0.8171
[[15  3]
 [12 52]]
              precision    recall  f1-score   support

           0       0.95      0.81      0.87        64
           1       0.56      0.83      0.67        18

    accuracy                           0.82        82
   macro avg       0.75      0.82      0.77        82
weighted avg       0.86      0.82      0.83        82


 AUC score : 0.8602430555555556

#4 교차 검증 정확도 : 0.8171, 학습 데이터 크기 : 327, 검증 데이터 크기 : 82




Restoring model weights from the end of the best epoch.
Epoch 00028: early stopping
0.8519
[[16  1]
 [11 53]]
              precision    recall  f1-score   support

           0       0.98      0.83      0.90        64
           1       0.59      0.94      0.73        17

    accuracy                           0.85        81
   macro avg       0.79      0.88      0.81        81
weighted avg       0.90      0.85      0.86        81


 AUC score : 0.9016544117647058

#5 교차 검증 정확도 : 0.8519, 학습 데이터 크기 : 328, 검증 데이터 크기 : 81




In [39]:
# 평균 AUC

print(sum(cv_accuracy)/5)

0.8362399999999999
