In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the feature tables for the training set and the external set.
x = pd.read_csv("/content/X_train.csv")
test_x = pd.read_csv("/content/X_external.csv")

# Load the label files and keep only the 'BBclass' column of each.
y = pd.read_csv("/content/y_train.csv")['BBclass']
test_y = pd.read_csv("/content/y_external.csv")['BBclass']

In [None]:
# Replace '[', ']' and '<' in column names with '_' to prevent errors later on.
# (The original note spoke of commas and inequality signs, but the pattern
# below only ever matched square brackets and '<'.)
import re

# None of the matched characters has a case, so no IGNORECASE flag is needed.
regex = re.compile(r"\[|\]|<")


def _sanitize_columns(columns):
    """Return the column labels with every '[', ']' or '<' replaced by '_'.

    Labels that contain none of these characters are returned unchanged
    (including non-string labels). A non-string label that does contain one
    is converted with ``str`` first, so the substitution cannot raise
    ``TypeError`` as it did when the raw label was passed to ``regex.sub``.
    """
    cleaned = []
    for label in columns:
        text = str(label)
        cleaned.append(regex.sub("_", text) if regex.search(text) else label)
    return cleaned


# Apply the same cleaning to the training and the external feature tables.
x.columns = _sanitize_columns(x.columns.values)
test_x.columns = _sanitize_columns(test_x.columns.values)


In [None]:
# Stratified 10-fold split of the training data.
folds = 10
skfold = StratifiedKFold(n_splits=folds)
cv_accuracy = []

# StratifiedKFold.split() must be given the label data as well as the features.
# NOTE(review): all folds are generated, but only the index arrays of the LAST
# fold are kept -- earlier folds are discarded and cv_accuracy is never filled
# in this cell. Everything below therefore runs on a single train/validation
# split, not on a full cross-validation.
train_index, test_index = list(skfold.split(x, y))[-1]

# Use the returned positional indices to pull out the training and validation rows.
x_train, x_test = x.values[train_index], x.values[test_index]
y_train, y_test = y.values[train_index], y.values[test_index]

In [None]:
# Number of training labels in the retained fold.
y_train.shape[0]

6446

In [None]:
# Convert the training and validation arrays to nested Python lists.
x_train_list, x_test_list = x_train.tolist(), x_test.tolist()

In [None]:
# Bring every sample to a fixed length of 200 with pad_sequences.
max_len = 200
tranx, trantestx = (
    pad_sequences(rows, maxlen=max_len)
    for rows in (x_train_list, x_test_list)
)

In [None]:
# Shape of the padded training matrix.
np.shape(tranx)

(6446, 200)

In [None]:
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

# Metrics that Keras itself can track during fit()/evaluate().
# The previous list also held the string 'matthews_correlation' (not a metric
# identifier Keras provides) and the scikit-learn functions f1_score and
# balanced_accuracy_score (not Keras metrics), so passing it to
# model.compile() would have failed. MCC, F1 and balanced accuracy are
# computed with scikit-learn after prediction instead.
Metrics = [
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.AUC(name='auc'),
]

In [None]:
# --- Hyperparameters ---
# NOTE(review): vocab_size is set to the number of training samples. An
# Embedding layer's input_dim is meant to be (largest integer index + 1);
# confirm that no value in the padded inputs reaches or exceeds this number.
vocab_size = len(x_train_list)
embedding_dim = 256
dropout_ratio = 0.3
num_filters = 256
kernel_size = 3
hidden_units = 128

# --- 1D-CNN binary classifier ---
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))

# A rising validation loss is a sign of overfitting, so EarlyStopping halts
# training once val_loss has failed to improve for 3 epochs (patience = 3).
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)

# ModelCheckpoint writes the model to 'best_model.h5' only when the validation
# accuracy improves. The model is compiled with metrics=['accuracy'], so the
# logged key is 'val_accuracy'; monitoring 'val_acc' (as before) never matched
# a logged value and therefore never saved a checkpoint.
mc = ModelCheckpoint('best_model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
history = model.fit(tranx, y_train, epochs = 20, validation_data = (trantestx, y_test), callbacks=[es, mc])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping


In [None]:
# Reshape the external feature table the same way as the training data:
# nested Python lists first, then padding to max_len.
test_x_list = test_x.to_numpy().tolist()
trantest_x_list = pad_sequences(sequences=test_x_list, maxlen=max_len)

In [None]:
# Evaluate on the external set. evaluate() returns the loss followed by the
# compiled metrics, so index 1 is the accuracy.
external_scores = model.evaluate(trantest_x_list, test_y)
print("\n 테스트 정확도: %.4f" % external_scores[1])


 테스트 정확도: 0.7027


In [None]:
# Sigmoid outputs of the model for the padded validation fold.
pred = model.predict(x=trantestx)
pred

array([[1.84992850e-02],
       [1.74483150e-01],
       [5.43053746e-02],
       [2.25057602e-02],
       [5.26402473e-01],
       [2.25173891e-01],
       [2.71172106e-01],
       [3.72798741e-02],
       [4.51425135e-01],
       [7.16543674e-01],
       [3.02318633e-01],
       [1.92173421e-02],
       [5.91879487e-02],
       [3.90552878e-02],
       [3.62055242e-01],
       [7.03190863e-02],
       [1.96879506e-02],
       [6.67564750e-01],
       [1.57228261e-01],
       [1.65104866e-04],
       [1.99884385e-01],
       [3.96439850e-01],
       [2.36672163e-03],
       [4.68114018e-03],
       [1.62584424e-01],
       [7.18827844e-02],
       [7.92309642e-03],
       [8.80926847e-03],
       [4.81241494e-01],
       [1.05706155e-02],
       [9.29945707e-03],
       [2.20164657e-03],
       [1.66727901e-02],
       [5.44881225e-02],
       [3.95013392e-01],
       [2.81645358e-02],
       [2.49679476e-01],
       [4.94956583e-01],
       [4.95384634e-02],
       [2.40665674e-02],


In [None]:
# Turn the predicted scores into hard 0/1 labels using a 0.5 cut-off.
pred1 = (pred >= 0.5).astype(int)

In [None]:
# Metrics on the validation fold.
# Flatten the (n, 1) prediction arrays so scikit-learn receives 1-D inputs.
val_labels = np.ravel(pred1)   # thresholded 0/1 predictions
val_scores = np.ravel(pred)    # raw sigmoid outputs

accuracy = accuracy_score(y_test, val_labels)
f1 = f1_score(y_test, val_labels)
matthews = matthews_corrcoef(y_test, val_labels)
balance = balanced_accuracy_score(y_test, val_labels)
# ROC AUC is a ranking metric and must be computed from the continuous scores.
# Feeding it the thresholded labels (as before) collapses it to the balanced
# accuracy, which is why both numbers were printed as identical.
roc_auc = roc_auc_score(y_test, val_scores)
print('정확도: {0:.4f}, f1: {1:.4f}, matt: {2:.4f},\
          balance: {3:.4f},ROC_AUC:{4:.4f}'.format(accuracy, f1,matthews, balance,roc_auc))

정확도: 0.8939, f1: 0.9291, matt: 0.7198,          balance: 0.8721,ROC_AUC:0.8721


In [None]:
# Sigmoid outputs of the model for the padded external set.
pred2 = model.predict(x=trantest_x_list)
pred2

array([[0.95374936],
       [0.84757614],
       [0.98338664],
       [0.7121794 ],
       [0.37098518],
       [0.56558526],
       [0.77249134],
       [0.51039314],
       [0.5105405 ],
       [0.7341955 ],
       [0.9558568 ],
       [0.47483587],
       [0.8743099 ],
       [0.95000416],
       [0.41079336],
       [0.8302753 ],
       [0.54199755],
       [0.28223038],
       [0.18495786],
       [0.7599881 ],
       [0.9474293 ],
       [0.95368594],
       [0.77114147],
       [0.94128215],
       [0.9064222 ],
       [0.9313022 ],
       [0.87729293],
       [0.8410353 ],
       [0.9225879 ],
       [0.3729921 ],
       [0.7126559 ],
       [0.9659846 ],
       [0.57906485],
       [0.39108172],
       [0.3712148 ],
       [0.77950263],
       [0.7962779 ],
       [0.7879473 ],
       [0.7340961 ],
       [0.91227704],
       [0.44661218],
       [0.4176841 ],
       [0.7618849 ],
       [0.70109   ],
       [0.7053762 ],
       [0.6906985 ],
       [0.7165437 ],
       [0.945

In [None]:
# Metrics on the external set.
# Hard 0/1 labels from the predicted scores using a 0.5 cut-off.
pred3 = np.where(pred2 >= 0.5, 1, 0)

# Flatten the (n, 1) prediction arrays so scikit-learn receives 1-D inputs.
ext_labels = np.ravel(pred3)   # thresholded 0/1 predictions
ext_scores = np.ravel(pred2)   # raw sigmoid outputs

accuracy = accuracy_score(test_y, ext_labels)
f1 = f1_score(test_y, ext_labels)
matthews = matthews_corrcoef(test_y, ext_labels)
balance = balanced_accuracy_score(test_y, ext_labels)
# ROC AUC is a ranking metric and must be computed from the continuous scores.
# Feeding it the thresholded labels (as before) collapses it to the balanced
# accuracy, which is why both numbers were printed as identical.
roc_auc = roc_auc_score(test_y, ext_scores)
print('정확도: {0:.4f}, f1: {1:.4f}, matt: {2:.4f},\
          balance: {3:.4f},ROC_AUC:{4:.4f}'.format(accuracy, f1,matthews, balance,roc_auc))

정확도: 0.7027, f1: 0.7660, matt: 0.4346,          balance: 0.6901,ROC_AUC:0.6901
