In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
train = pd.read_csv('../../datasets/all_data_filled_train_data.csv')
test = pd.read_csv('../../datasets/all_data_test_data.csv')

In [4]:
# train_life_cycle = train[['도입기', '성장기', '성숙기', '쇠퇴기']]
# test_life_cycle = test[['도입기', '성장기', '성숙기', '쇠퇴기']]

In [3]:
selected_features = ['CASH FLOW 대 부채비율', 'CASH FLOW 대 총자본비율', 'CASH FLOW 대 매출액비율', '차입금의존도', '순운전자본비율',
                     '자기자본구성비율', '경영자본순이익률', '총자본사업이익률', '총자본영업이익률', '금융비용부담률', 
                     '매출액증가율', '이윤분배율', '총자본회전률', '영업년수', 
                     '도입기', '성장기', '성숙기', '쇠퇴기']

In [4]:
x_train = train[selected_features]
x_test = test[selected_features]

y_train = train['부실판단']
y_test = test['부실판단']

In [5]:
x_test = x_test.fillna(test['영업년수'].median())

In [8]:
# 모델 생성
logit_model = LogisticRegression()

# Cross Validation
cv_accuracy = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='roc_auc')

print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 결과=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# 모델 학습 및 평가
logit_model.fit(x_train, y_train)
y_pred = logit_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 스코어: {f1:.3f}')
print(f'ROC AUC 스코어: {roc_auc:.3f}')


CV_Accuracy_Scores: [0.70091324 0.81023065 0.83238182 0.82050697 0.71157799]
CV_Precision_Scores: [0.68691589 0.80919854 0.80074411 0.81880109 0.67353051]
CV_Recall_Scores: [0.73835616 0.8117862  0.88487894 0.82328767 0.82146119]
CV_F1_Scores: [0.71170775 0.81049031 0.84071181 0.82103825 0.74017692]
CV_ROC/AUC: [0.76411293 0.88872173 0.90875194 0.90097999 0.76688882]

CV_Accuracy_mean: 0.775
CV_Precision_mean: 0.758
CV_Recall_mean: 0.816
CV_F1_스코어_mean: 0.785
CV_ROC_AUC+스코어_mean: 0.846

Accuracy: 0.747
Precision: 0.644
Recall: 0.720
F1 스코어: 0.679
ROC AUC 스코어: 0.741


In [6]:
# Random Forest 모델 생성 및 학습
rf_model = RandomForestClassifier(random_state=42, n_estimators=29, min_samples_split=7, min_samples_leaf=8, max_depth=7)

# # Cross Validation
# cv_accuracy = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='roc_auc')

# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 결과=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')


rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_rf:.3f}')
print(f'Precision: {precision_rf:.3f}')
print(f'Recall: {recall_rf:.3f}')
print(f'F1 스코어: {f1_rf:.3f}')
print(f'ROC AUC 스코어: {roc_auc_rf:.3f}')


Accuracy: 0.835
Precision: 0.742
Recall: 0.856
F1 스코어: 0.795
ROC AUC 스코어: 0.840


In [10]:
# AdaBoost 모델 생성 및 학습
adaboost_model = AdaBoostClassifier()

# Cross Validation
cv_accuracy = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

adaboost_model.fit(x_train, y_train)
y_pred_adaboost = adaboost_model.predict(x_test)

accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
precision_adaboost = precision_score(y_test, y_pred_adaboost)
recall_adaboost = recall_score(y_test, y_pred_adaboost)
f1_adaboost = f1_score(y_test, y_pred_adaboost)
roc_auc_adaboost = roc_auc_score(y_test, y_pred_adaboost)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_adaboost:.3f}')
print(f'Precision: {precision_adaboost:.3f}')
print(f'Recall: {recall_adaboost:.3f}')
print(f'F1 스코어: {f1_adaboost:.3f}')
print(f'ROC AUC 스코어: {roc_auc_adaboost:.3f}')

CV_Accuracy_Scores: [0.71347032 0.85316282 0.88193652 0.88650377 0.7218543 ]
CV_Precision_Scores: [0.68828031 0.83261618 0.88069217 0.88600091 0.68381241]
CV_Recall_Scores: [0.7803653  0.88396528 0.88350845 0.88721461 0.82557078]
CV_F1_Scores: [0.73143591 0.85752271 0.88209806 0.88660735 0.74803475]
CV_ROC/AUC: [0.79298107 0.92331239 0.9446504  0.95237072 0.78994798]

CV_Accuracy_mean: 0.811
CV_Precision_mean: 0.794
CV_Recall_mean: 0.852
CV_F1_스코어_mean: 0.821
CV_ROC_AUC+스코어_mean: 0.881

Accuracy: 0.788
Precision: 0.692
Recall: 0.775
F1 스코어: 0.732
ROC AUC 스코어: 0.785


In [11]:
xgboost_model = XGBClassifier()

xgboost_model.fit(x_train, y_train)
y_pred_xgboost = xgboost_model.predict(x_test)

accuracy_xgboost = accuracy_score(y_test, y_pred_xgboost)
precision_xgboost = precision_score(y_test, y_pred_xgboost)
recall_xgboost = recall_score(y_test, y_pred_xgboost)
f1_xgboost = f1_score(y_test, y_pred_xgboost)
roc_auc_xgboost = roc_auc_score(y_test, y_pred_xgboost)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_xgboost:.3f}')
print(f'Precision: {precision_xgboost:.3f}')
print(f'Recall: {recall_xgboost:.3f}')
print(f'F1 스코어: {f1_xgboost:.3f}')
print(f'ROC AUC 스코어: {roc_auc_xgboost:.3f}')


Accuracy: 0.875
Precision: 0.807
Recall: 0.873
F1 스코어: 0.838
ROC AUC 스코어: 0.874


In [12]:
# Bagging 모델 생성 및 학습
bagging_model = BaggingClassifier()

cv_accuracy = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

bagging_model.fit(x_train, y_train)
y_pred_bagging = bagging_model.predict(x_test)

accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)
roc_auc_bagging = roc_auc_score(y_test, y_pred_bagging)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_bagging:.3f}')
print(f'Precision: {precision_bagging:.3f}')
print(f'Recall: {recall_bagging:.3f}')
print(f'F1 스코어: {f1_bagging:.3f}')
print(f'ROC AUC 스코어: {roc_auc_bagging:.3f}')

CV_Accuracy_Scores: [0.70639269 0.84425668 0.90888331 0.92943594 0.70792418]
CV_Precision_Scores: [0.7288049  0.85036496 0.91232877 0.91543624 0.69201359]
CV_Recall_Scores: [0.67762557 0.84833257 0.91411603 0.94018265 0.73196347]
CV_F1_Scores: [0.69005011 0.84005563 0.91332117 0.92352941 0.71867294]
CV_ROC/AUC: [0.79112383 0.91669295 0.96420542 0.96801108 0.78183925]

CV_Accuracy_mean: 0.819
CV_Precision_mean: 0.820
CV_Recall_mean: 0.822
CV_F1_스코어_mean: 0.817
CV_ROC_AUC+스코어_mean: 0.884

Accuracy: 0.924
Precision: 0.899
Recall: 0.897
F1 스코어: 0.898
ROC AUC 스코어: 0.918


In [13]:
# from sklearn.svm import SVC

# # SVM 모델 생성 및 학습
# svm_model = SVC(kernel='linear')

# cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

# print('=======교차검증 결과=======')
# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 평균값=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# svm_model.fit(x_train, y_train)
# y_pred_svm = svm_model.predict(x_test)

# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# precision_svm = precision_score(y_test, y_pred_svm)
# recall_svm = recall_score(y_test, y_pred_svm)
# f1_svm = f1_score(y_test, y_pred_svm)
# roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

# print(f'\n=======Test 데이터 평가======')
# print(f'Accuracy: {accuracy_svm:.3f}')
# print(f'Precision: {precision_svm:.3f}')
# print(f'Recall: {recall_svm:.3f}')
# print(f'F1 스코어: {f1_svm:.3f}')
# print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')

In [14]:
# from sklearn.svm import SVC

# # SVM 모델 생성 및 학습
# svm_model = SVC(kernel='rbf')

# cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

# print('=======교차검증 결과=======')
# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 평균값=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# svm_model.fit(x_train, y_train)
# y_pred_svm = svm_model.predict(x_test)

# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# precision_svm = precision_score(y_test, y_pred_svm)
# recall_svm = recall_score(y_test, y_pred_svm)
# f1_svm = f1_score(y_test, y_pred_svm)
# roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

# print(f'\n=======Test 데이터 평가======')
# print(f'Accuracy: {accuracy_svm:.3f}')
# print(f'Precision: {precision_svm:.3f}')
# print(f'Recall: {recall_svm:.3f}')
# print(f'F1 스코어: {f1_svm:.3f}')
# print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')


In [15]:
from lightgbm import LGBMClassifier

# LGBM 모델 생성 및 학습
lgbm_model = LGBMClassifier()

cv_accuracy = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

lgbm_model.fit(x_train, y_train)
y_pred_lgbm = lgbm_model.predict(x_test)

accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
precision_lgbm = precision_score(y_test, y_pred_lgbm)
recall_lgbm = recall_score(y_test, y_pred_lgbm)
f1_lgbm = f1_score(y_test, y_pred_lgbm)
roc_auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_lgbm:.3f}')
print(f'Precision: {precision_lgbm:.3f}')
print(f'Recall: {recall_lgbm:.3f}')
print(f'F1 스코어: {f1_lgbm:.3f}')
print(f'ROC AUC 스코어: {roc_auc_lgbm:.3f}')


[LightGBM] [Info] Number of positive: 8758, number of negative: 8758
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3575
[LightGBM] [Info] Number of data points in the train set: 17516, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 8759, number of negative: 8758
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3578
[LightGBM] [Info] Number of data points in the train set: 17517, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500029 -> initscore=0.000114
[LightGBM] [Info] Start training from score 0.000114
[LightGBM] [Info] Number of positive: 8759, number of negative: 8758
[Lig