In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
train = pd.read_csv('../../datasets/listed_filled_train_data.csv')
test = pd.read_csv('../../datasets/listed_test_data.csv')

In [3]:
# train_life_cycle = train[['도입기', '성장기', '성숙기', '쇠퇴기']]
# test_life_cycle = test[['도입기', '성장기', '성숙기', '쇠퇴기']]

In [3]:
selected_features = ['CASH FLOW 대 부채비율', '당좌비율', '순운전자본비율', '자기자본구성비율','경영자본순이익률',
                     '총자본영업이익률', '매출액영업이익률', '금융비용부담률', '이윤분배율', '유형자산회전율',
                     '상장년수', 'PCR', '쭈피처', '도입기', '성장기', '성숙기', '쇠퇴기']

In [4]:
x_train = train[selected_features]
x_test = test[selected_features]

y_train = train['부실판단']
y_test = test['부실판단']

In [None]:
# 모델 생성
logit_model = LogisticRegression()

# Cross Validation
cv_accuracy = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='roc_auc')

print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 결과=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# 모델 학습 및 평가
logit_model.fit(x_train, y_train)
y_pred = logit_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 스코어: {f1:.3f}')
print(f'ROC AUC 스코어: {roc_auc:.3f}')


CV_Accuracy_Scores: [0.71153846 0.54901961 0.68627451 0.52941176 0.62745098]
CV_Precision_Scores: [0.64102564 1.         0.90909091 0.52       0.70588235]
CV_Recall_Scores: [0.96153846 0.08       0.4        1.         0.46153846]
CV_F1_Scores: [0.76923077 0.14814815 0.55555556 0.68421053 0.55813953]
CV_ROC/AUC: [0.78254438 0.63384615 0.87076923 0.61076923 0.75076923]

CV_Accuracy_mean: 0.621
CV_Precision_mean: 0.755
CV_Recall_mean: 0.581
CV_F1_스코어_mean: 0.543
CV_ROC_AUC+스코어_mean: 0.730

Accuracy: 0.797
Precision: 0.828
Recall: 0.762
F1 스코어: 0.794
ROC AUC 스코어: 0.798


In [6]:
# Random Forest 모델 생성 및 학습
rf_model = RandomForestClassifier(random_state=42, n_estimators=29, min_samples_split=7, min_samples_leaf=8, max_depth=7)

# # Cross Validation
# cv_accuracy = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='roc_auc')

# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 결과=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')


rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_rf:.3f}')
print(f'Precision: {precision_rf:.3f}')
print(f'Recall: {recall_rf:.3f}')
print(f'F1 스코어: {f1_rf:.3f}')
print(f'ROC AUC 스코어: {roc_auc_rf:.3f}')


Accuracy: 0.827
Precision: 0.807
Recall: 0.871
F1 스코어: 0.838
ROC AUC 스코어: 0.826


In [None]:
# AdaBoost 모델 생성 및 학습
adaboost_model = AdaBoostClassifier()

# Cross Validation
cv_accuracy = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

adaboost_model.fit(x_train, y_train)
y_pred_adaboost = adaboost_model.predict(x_test)

accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
precision_adaboost = precision_score(y_test, y_pred_adaboost)
recall_adaboost = recall_score(y_test, y_pred_adaboost)
f1_adaboost = f1_score(y_test, y_pred_adaboost)
roc_auc_adaboost = roc_auc_score(y_test, y_pred_adaboost)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_adaboost:.3f}')
print(f'Precision: {precision_adaboost:.3f}')
print(f'Recall: {recall_adaboost:.3f}')
print(f'F1 스코어: {f1_adaboost:.3f}')
print(f'ROC AUC 스코어: {roc_auc_adaboost:.3f}')

CV_Accuracy_Scores: [0.69230769 0.62745098 0.80392157 0.78431373 0.74509804]
CV_Precision_Scores: [0.65625    0.66666667 0.8        0.85714286 0.84210526]
CV_Recall_Scores: [0.80769231 0.48       0.84       0.69230769 0.61538462]
CV_F1_Scores: [0.72413793 0.55813953 0.82352941 0.76595745 0.71111111]
CV_ROC/AUC: [0.75295858 0.69076923 0.90769231 0.83384615 0.80615385]

CV_Accuracy_mean: 0.731
CV_Precision_mean: 0.764
CV_Recall_mean: 0.687
CV_F1_스코어_mean: 0.717
CV_ROC_AUC+스코어_mean: 0.798

Accuracy: 0.802
Precision: 0.830
Recall: 0.772
F1 스코어: 0.800
ROC AUC 스코어: 0.803


In [None]:
xgboost_model = XGBClassifier()

xgboost_model.fit(x_train, y_train)
y_pred_xgboost = xgboost_model.predict(x_test)

accuracy_xgboost = accuracy_score(y_test, y_pred_xgboost)
precision_xgboost = precision_score(y_test, y_pred_xgboost)
recall_xgboost = recall_score(y_test, y_pred_xgboost)
f1_xgboost = f1_score(y_test, y_pred_xgboost)
roc_auc_xgboost = roc_auc_score(y_test, y_pred_xgboost)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_xgboost:.3f}')
print(f'Precision: {precision_xgboost:.3f}')
print(f'Recall: {recall_xgboost:.3f}')
print(f'F1 스코어: {f1_xgboost:.3f}')
print(f'ROC AUC 스코어: {roc_auc_xgboost:.3f}')


Accuracy: 0.787
Precision: 0.831
Recall: 0.733
F1 스코어: 0.779
ROC AUC 스코어: 0.788


In [None]:
# Bagging 모델 생성 및 학습
bagging_model = BaggingClassifier()

cv_accuracy = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

bagging_model.fit(x_train, y_train)
y_pred_bagging = bagging_model.predict(x_test)

accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)
roc_auc_bagging = roc_auc_score(y_test, y_pred_bagging)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_bagging:.3f}')
print(f'Precision: {precision_bagging:.3f}')
print(f'Recall: {recall_bagging:.3f}')
print(f'F1 스코어: {f1_bagging:.3f}')
print(f'ROC AUC 스코어: {roc_auc_bagging:.3f}')

CV_Accuracy_Scores: [0.67307692 0.64705882 0.80392157 0.88235294 0.80392157]
CV_Precision_Scores: [0.66666667 0.71428571 0.84       0.9        0.91666667]
CV_Recall_Scores: [0.92307692 0.48       0.76       0.73076923 0.84615385]
CV_F1_Scores: [0.7        0.63636364 0.74418605 0.83333333 0.8       ]
CV_ROC/AUC: [0.7943787  0.74153846 0.82923077 0.91230769 0.90769231]

CV_Accuracy_mean: 0.762
CV_Precision_mean: 0.808
CV_Recall_mean: 0.748
CV_F1_스코어_mean: 0.743
CV_ROC_AUC+스코어_mean: 0.837

Accuracy: 0.787
Precision: 0.824
Recall: 0.743
F1 스코어: 0.781
ROC AUC 스코어: 0.788


In [None]:
# from sklearn.svm import SVC

# # SVM 모델 생성 및 학습
# svm_model = SVC(kernel='linear')

# cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

# print('=======교차검증 결과=======')
# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 평균값=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# svm_model.fit(x_train, y_train)
# y_pred_svm = svm_model.predict(x_test)

# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# precision_svm = precision_score(y_test, y_pred_svm)
# recall_svm = recall_score(y_test, y_pred_svm)
# f1_svm = f1_score(y_test, y_pred_svm)
# roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

# print(f'\n=======Test 데이터 평가======')
# print(f'Accuracy: {accuracy_svm:.3f}')
# print(f'Precision: {precision_svm:.3f}')
# print(f'Recall: {recall_svm:.3f}')
# print(f'F1 스코어: {f1_svm:.3f}')
# print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')

In [None]:
# from sklearn.svm import SVC

# # SVM 모델 생성 및 학습
# svm_model = SVC(kernel='rbf')

# cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
# cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
# cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
# cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
# cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

# print('=======교차검증 결과=======')
# print("CV_Accuracy_Scores:", cv_accuracy)
# print("CV_Precision_Scores:", cv_precision)
# print("CV_Recall_Scores:", cv_recall)
# print("CV_F1_Scores:", cv_f1)
# print("CV_ROC/AUC:", cv_roc_auc)

# print('\n=======교차검증 평균값=======')
# print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
# print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
# print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
# print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
# print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# svm_model.fit(x_train, y_train)
# y_pred_svm = svm_model.predict(x_test)

# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# precision_svm = precision_score(y_test, y_pred_svm)
# recall_svm = recall_score(y_test, y_pred_svm)
# f1_svm = f1_score(y_test, y_pred_svm)
# roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

# print(f'\n=======Test 데이터 평가======')
# print(f'Accuracy: {accuracy_svm:.3f}')
# print(f'Precision: {precision_svm:.3f}')
# print(f'Recall: {recall_svm:.3f}')
# print(f'F1 스코어: {f1_svm:.3f}')
# print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')


In [None]:
from lightgbm import LGBMClassifier

# LGBM 모델 생성 및 학습
lgbm_model = LGBMClassifier()

cv_accuracy = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

lgbm_model.fit(x_train, y_train)
y_pred_lgbm = lgbm_model.predict(x_test)

accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
precision_lgbm = precision_score(y_test, y_pred_lgbm)
recall_lgbm = recall_score(y_test, y_pred_lgbm)
f1_lgbm = f1_score(y_test, y_pred_lgbm)
roc_auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_lgbm:.3f}')
print(f'Precision: {precision_lgbm:.3f}')
print(f'Recall: {recall_lgbm:.3f}')
print(f'F1 스코어: {f1_lgbm:.3f}')
print(f'ROC AUC 스코어: {roc_auc_lgbm:.3f}')


[LightGBM] [Info] Number of positive: 102, number of negative: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 732
[LightGBM] [Info] Number of data points in the train set: 204, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 103, number of negative: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 740
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502439 -> initscore=0.009756
[LightGBM] [Info] Start training from score 0.009756
[LightGBM] [Info] Number of positive: 103, number of negative: 102
[LightGBM] [Info

#### 딥러닝

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# import tensorflow as tf

In [None]:
# model = Sequential()
# model.add(Dense(256, input_dim = 12, activation = 'relu'))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation = 'relu'))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation = 'sigmoid'))

In [None]:
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# history = model.fit(x_train, y_train, epochs=200, batch_size=5)

# print("\n Accuracy: %.4f" % (model.evaluate(x_test, y_test)[1]))

In [None]:
# # 모델을 사용하여 테스트 데이터에 대한 예측 수행
# y_pred = model.predict(x_test)
# y_pred = binarize(y_pred, threshold=0.5)  # 예측값을 0.5 임계값을 기준으로 이진 분류로 변환

# # 정확도(accuracy) 계산
# accuracy = accuracy_score(y_test, y_pred)
# print(f"accuracy: {accuracy:.4f}")

# # F1 점수(f1 score) 계산
# f1 = f1_score(y_test, y_pred)
# print(f"F1-Score: {f1:.4f}")

# # 재현율(recall) 계산
# recall = recall_score(y_test, y_pred)
# print(f"recall: {recall:.4f}")

# # 정밀도(precision) 계산
# precision = precision_score(y_test, y_pred)
# print(f"precision: {precision:.4f}")