In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats.mstats import winsorize
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from matplotlib import font_manager, rc
import warnings

# 한글 폰트 설정
font_path = "C:/Windows/Fonts/malgun.ttf"  # 한글 폰트 파일 경로
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

# 경고 메시지 무시
warnings.filterwarnings('ignore')

In [7]:
# DNN 모델 생성 함수
def create_dnn_model(input_dim):
    model = Sequential([
        Dense(len(final_features), activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['recall'])
    return model

### 최종피쳐 모델링

In [8]:
final_features =['차입금의존도', '3개년가중평균매출액', '자기자본구성비율', '배당성향', '자기자본순이익률', '자본금순이익률',
                 '기업순이익률', '평균배당률', '금융비용 대 총비용비율', '자본금회전률','매출액영업이익률', '매출액순이익률', 
                 '인건비(백만원)', '사내유보율', '자기자본증가율', '당좌비율', '자기자본회전률']

X_train = pd.read_csv('../datasets/비상장_train.csv', index_col = 0)[final_features]
y_train = pd.read_csv('../datasets/비상장_train.csv', index_col = 0)['label']
X_test = pd.read_csv('../datasets/비상장_test.csv', index_col = 0)[final_features]
y_test = pd.read_csv('../datasets/비상장_test.csv', index_col = 0)['label']

# 모델 설정 및 하이퍼파라미터
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(verbosity=0),
    'LightGBM': LGBMClassifier(verbose=1),
    'CatBoost': CatBoostClassifier(verbose=0)
}

param_grids = {
    'LogisticRegression': {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2']},
    'DecisionTree': {'max_depth': [10, 20, 30], 'min_samples_leaf': [3, 5, 7]},
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [5, 10, 20], 'min_samples_leaf': [3, 5, 8]},
    'XGBoost': {'n_estimators': [100, 150, 200], 'learning_rate': [0.01, 0.05], 'max_depth': [3, 4, 10]},
    'LightGBM': {'max_depth': [5, 7, 9], 'learning_rate': [0.01, 0.05], 'num_leaves': [10, 20, 31]},
    'CatBoost': {'iterations': [200, 250, 300], 'learning_rate': [0.01, 0.05], 'depth': [5, 7, 9]}
}


results = {}

grid_results = {}

results_deep = {}

# 각 모델에 대해 K-fold 교차 검증을 수행하고 결과를 저장합니다.
for model_name, model in models.items():
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())  # y_train을 array로 변환
    best_model = grid_search.best_estimator_
    
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # 결과 저장
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'mean_cv_score': grid_search.best_score_,
        'Train': {
            'Accuracy': accuracy_score(y_train, y_train_pred),
            'Precision': precision_score(y_train, y_train_pred),
            'Recall': recall_score(y_train, y_train_pred),
            'F1': f1_score(y_train, y_train_pred)
        },
        'Test': {
            'Accuracy': accuracy_score(y_test, y_test_pred),
            'Precision': precision_score(y_test, y_test_pred),
            'Recall': recall_score(y_test, y_test_pred),
            'F1': f1_score(y_test, y_test_pred)
        }
    }


# DNN 모델 초기화 및 훈련
dnn_model = create_dnn_model(X_train.shape[1])
dnn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# DNN 모델 성능 평가
dnn_train_preds = (dnn_model.predict(X_train) > 0.5).astype(int)
dnn_test_preds = (dnn_model.predict(X_test) > 0.5).astype(int)

dnn_train_accuracy = accuracy_score(y_train, dnn_train_preds)
dnn_train_precision = precision_score(y_train, dnn_train_preds)
dnn_train_recall = recall_score(y_train, dnn_train_preds)
dnn_train_f1 = f1_score(y_train, dnn_train_preds)
dnn_train_confusion = confusion_matrix(y_train, dnn_train_preds)

dnn_test_accuracy = accuracy_score(y_test, dnn_test_preds)
dnn_test_precision = precision_score(y_test, dnn_test_preds)
dnn_test_recall = recall_score(y_test, dnn_test_preds)
dnn_test_f1 = f1_score(y_test, dnn_test_preds)
dnn_test_confusion = confusion_matrix(y_test, dnn_test_preds)

# DNN 결과 추가
results_deep['DNN'] = {
    'Train': {'Accuracy': dnn_train_accuracy, 'Precision': dnn_train_precision, 'Recall': dnn_train_recall, 'F1': dnn_train_f1, 'Confusion Matrix': dnn_train_confusion},
    'Test': {'Accuracy': dnn_test_accuracy, 'Precision': dnn_test_precision, 'Recall': dnn_test_recall, 'F1': dnn_test_f1, 'Confusion Matrix': dnn_test_confusion}
}

# 결과 출력
for model_name, model_results in results.items():
    print(f'\n{model_name} Results:')
    print(f'Best Hyperparameters: {model_results["best_params"]}')
    print(f'Mean CV Score: {model_results["mean_cv_score"]:.4f}')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')

                
# 하이퍼 파라미터 결과 출력
for model_name, model_results in grid_results.items():
    best_params = model_results['best_params']  # 최적 파라미터 추출
    mean_cv_score = model_results['mean_cv_score']
    print(f'{model_name} 최적 하이퍼파라미터:')
    print(best_params)
    print(f'{model_name} 교차 검증 정확도:')
    print(f'{mean_cv_score:.4f}')
    print()  # 출력을 구분하기 위한 빈 줄 추가

# DNN 결과 출력    

for model_name, model_results in results_deep.items():
    print(f'\n{model_name} Results:')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')

[LightGBM] [Info] Number of positive: 9326, number of negative: 81360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 90686, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102838 -> initscore=-2.166078
[LightGBM] [Info] Start training from score -2.166078
[1m2834/2834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 850us/step
[1m1215/1215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 942us/step

LogisticRegression Results:
Best Hyperparameters: {'C': 1, 'penalty': 'l2'}
Mean CV Score: 0.2858
  Train Metrics:
    Accuracy: 0.9132
    Precision: 0.6868
    Recall: 0.2862
    F1 Score: 0.4040
  Test Metrics:
    Accuracy: 0.9099
    Precision: 0.6292
    Recall: 0.3015
    F1 Score: 0.4076

RandomForest Results:
Best Hyperparameters: {'max_