In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats.mstats import winsorize
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import warnings

# 경고 메시지 무시
warnings.filterwarnings('ignore')

In [2]:
# DNN 모델 생성 함수
def create_dnn_model(input_dim):
    model = Sequential([
        Dense(len(final_features), activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dense(62, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['recall'])
    return model

### 최종피쳐 모델링

In [3]:
final_features = ['재고자산증가율', 
                  '매출액정상영업이익률', '매출액순이익률', '자기자본순이익률', '자본금순이익률', '기업순이익률', '금융비용 대 총비용비율', 
                  '차입금의존도', 
                  '자본금회전률', 
                  '설비투자효율', 
                  '부채대비영업활동현금흐름', 
                  'PER']

X_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)[final_features]
y_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)['label']
X_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)[final_features]
y_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)['label']

# 모델 설정 및 하이퍼파라미터
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(), 
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(verbosity=0),
    'LightGBM': LGBMClassifier(verbose=1),
    'CatBoost': CatBoostClassifier(verbose=0)
}

param_grids = {
    'LogisticRegression': {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2']},
    'DecisionTree': {'max_depth': [10, 20, 30], 'min_samples_leaf': [3, 5, 7]},
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [5, 10, 20], 'min_samples_leaf': [3, 5, 8]},
    'XGBoost': {'n_estimators': [100, 150, 200], 'learning_rate': [0.01, 0.05], 'max_depth': [3, 4, 10]},
    'LightGBM': {'max_depth': [5, 7, 9], 'learning_rate': [0.01, 0.05], 'num_leaves': [10, 20, 31]},
    'CatBoost': {'iterations': [200, 250, 300], 'learning_rate': [0.01, 0.05], 'depth': [5, 7, 9]}
}

results = {}

grid_results = {}

results_deep = {}

# 각 모델에 대해 K-fold 교차 검증을 수행하고 결과를 저장합니다.
for model_name, model in models.items():
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())  # y_train을 array로 변환
    best_model = grid_search.best_estimator_
    
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # 결과 저장
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'mean_cv_score': grid_search.best_score_,
        'Train': {
            'Accuracy': accuracy_score(y_train, y_train_pred),
            'Precision': precision_score(y_train, y_train_pred),
            'Recall': recall_score(y_train, y_train_pred),
            'F1': f1_score(y_train, y_train_pred)
        },
        'Test': {
            'Accuracy': accuracy_score(y_test, y_test_pred),
            'Precision': precision_score(y_test, y_test_pred),
            'Recall': recall_score(y_test, y_test_pred),
            'F1': f1_score(y_test, y_test_pred)
        }
    }

# DNN 모델 초기화 및 훈련
dnn_model = create_dnn_model(X_train.shape[1])
dnn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# DNN 모델 성능 평가
dnn_train_preds = (dnn_model.predict(X_train) > 0.5).astype(int)
dnn_test_preds = (dnn_model.predict(X_test) > 0.5).astype(int)

dnn_train_accuracy = accuracy_score(y_train, dnn_train_preds)
dnn_train_precision = precision_score(y_train, dnn_train_preds)
dnn_train_recall = recall_score(y_train, dnn_train_preds)
dnn_train_f1 = f1_score(y_train, dnn_train_preds)
dnn_train_confusion = confusion_matrix(y_train, dnn_train_preds)

dnn_test_accuracy = accuracy_score(y_test, dnn_test_preds)
dnn_test_precision = precision_score(y_test, dnn_test_preds)
dnn_test_recall = recall_score(y_test, dnn_test_preds)
dnn_test_f1 = f1_score(y_test, dnn_test_preds)
dnn_test_confusion = confusion_matrix(y_test, dnn_test_preds)

# DNN 결과 추가
results_deep['DNN'] = {
    'Train': {'Accuracy': dnn_train_accuracy, 'Precision': dnn_train_precision, 'Recall': dnn_train_recall, 'F1': dnn_train_f1, 'Confusion Matrix': dnn_train_confusion},
    'Test': {'Accuracy': dnn_test_accuracy, 'Precision': dnn_test_precision, 'Recall': dnn_test_recall, 'F1': dnn_test_f1, 'Confusion Matrix': dnn_test_confusion}
}

# 결과 출력
for model_name, model_results in results.items():
    print(f'\n{model_name} Results:')
    print(f'Best Hyperparameters: {model_results["best_params"]}')
    print(f'Mean CV Score: {model_results["mean_cv_score"]:.4f}')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')
               
# 하이퍼 파라미터 결과 출력
for model_name, model_results in grid_results.items():
    best_params = model_results['best_params']  # 최적 파라미터 추출
    mean_cv_score = model_results['mean_cv_score']
    print(f'{model_name} 최적 하이퍼파라미터:')
    print(best_params)
    print(f'{model_name} 교차 검증 정확도:')
    print(f'{mean_cv_score:.4f}')
    print()  # 출력을 구분하기 위한 빈 줄 추가

# DNN 결과 출력    

for model_name, model_results in results_deep.items():
    print(f'\n{model_name} Results:')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')

[LightGBM] [Info] Number of positive: 521, number of negative: 4450
[LightGBM] [Info] Number of positive: 494, number of negative: 4477
[LightGBM] [Info] Number of positive: 515, number of negative: 4455
[LightGBM] [Info] Number of positive: 508, number of negative: 4462
[LightGBM] [Info] Number of positive: 514, number of negative: 4456
[LightGBM] [Info] Number of positive: 514, number of negative: 4456
[LightGBM] [Info] Number of positive: 515, number of negative: 4455
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wi

In [4]:
final_features = ['재고자산증가율', 
                  '매출액정상영업이익률', '매출액순이익률', '자기자본순이익률', '자본금순이익률', '기업순이익률', '금융비용 대 총비용비율', 
                  '차입금의존도', 
                  '자본금회전률', 
                  '설비투자효율', 
                  '부채대비영업활동현금흐름', 
                  'PER']

X_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)[final_features]
y_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)['label']
X_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)[final_features]
y_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)['label']

# 모델 설정 및 하이퍼파라미터
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(), 
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(verbosity=0),
    'LightGBM': LGBMClassifier(verbose=1),
    'CatBoost': CatBoostClassifier(verbose=0)
}

param_grids = {
    'LogisticRegression': {'C': [5, 10, 15], 'penalty': ['l1', 'l2']},
    'DecisionTree': {'max_depth': [10, 20, 30], 'min_samples_leaf': [3, 5, 7]},
    'RandomForest': {'n_estimators': [30, 50, 70], 'max_depth': [15, 20, 25], 'min_samples_leaf': [2, 3, 4]},
    'XGBoost': {'n_estimators': [75, 100, 125], 'learning_rate': [0.025, 0.05, 0.075], 'max_depth': [5, 10, 15]},
    'LightGBM': {'max_depth': [6, 7, 8], 'learning_rate': [0.025, 0.05, 0.075], 'num_leaves': [15, 20, 25]},
    'CatBoost': {'iterations': [275, 300, 325], 'learning_rate': [0.025, 0.05, 0.075], 'depth': [6, 7, 8]}
}

results = {}

grid_results = {}

results_deep = {}

# 각 모델에 대해 K-fold 교차 검증을 수행하고 결과를 저장합니다.
for model_name, model in models.items():
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())  # y_train을 array로 변환
    best_model = grid_search.best_estimator_
    
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # 결과 저장
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'mean_cv_score': grid_search.best_score_,
        'Train': {
            'Accuracy': accuracy_score(y_train, y_train_pred),
            'Precision': precision_score(y_train, y_train_pred),
            'Recall': recall_score(y_train, y_train_pred),
            'F1': f1_score(y_train, y_train_pred)
        },
        'Test': {
            'Accuracy': accuracy_score(y_test, y_test_pred),
            'Precision': precision_score(y_test, y_test_pred),
            'Recall': recall_score(y_test, y_test_pred),
            'F1': f1_score(y_test, y_test_pred)
        }
    }

# DNN 모델 초기화 및 훈련
dnn_model = create_dnn_model(X_train.shape[1])
dnn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# DNN 모델 성능 평가
dnn_train_preds = (dnn_model.predict(X_train) > 0.5).astype(int)
dnn_test_preds = (dnn_model.predict(X_test) > 0.5).astype(int)

dnn_train_accuracy = accuracy_score(y_train, dnn_train_preds)
dnn_train_precision = precision_score(y_train, dnn_train_preds)
dnn_train_recall = recall_score(y_train, dnn_train_preds)
dnn_train_f1 = f1_score(y_train, dnn_train_preds)
dnn_train_confusion = confusion_matrix(y_train, dnn_train_preds)

dnn_test_accuracy = accuracy_score(y_test, dnn_test_preds)
dnn_test_precision = precision_score(y_test, dnn_test_preds)
dnn_test_recall = recall_score(y_test, dnn_test_preds)
dnn_test_f1 = f1_score(y_test, dnn_test_preds)
dnn_test_confusion = confusion_matrix(y_test, dnn_test_preds)

# DNN 결과 추가
results_deep['DNN'] = {
    'Train': {'Accuracy': dnn_train_accuracy, 'Precision': dnn_train_precision, 'Recall': dnn_train_recall, 'F1': dnn_train_f1, 'Confusion Matrix': dnn_train_confusion},
    'Test': {'Accuracy': dnn_test_accuracy, 'Precision': dnn_test_precision, 'Recall': dnn_test_recall, 'F1': dnn_test_f1, 'Confusion Matrix': dnn_test_confusion}
}

# 결과 출력
for model_name, model_results in results.items():
    print(f'\n{model_name} Results:')
    print(f'Best Hyperparameters: {model_results["best_params"]}')
    print(f'Mean CV Score: {model_results["mean_cv_score"]:.4f}')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')
               
# 하이퍼 파라미터 결과 출력
for model_name, model_results in grid_results.items():
    best_params = model_results['best_params']  # 최적 파라미터 추출
    mean_cv_score = model_results['mean_cv_score']
    print(f'{model_name} 최적 하이퍼파라미터:')
    print(best_params)
    print(f'{model_name} 교차 검증 정확도:')
    print(f'{mean_cv_score:.4f}')
    print()  # 출력을 구분하기 위한 빈 줄 추가

# DNN 결과 출력    

for model_name, model_results in results_deep.items():
    print(f'\n{model_name} Results:')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')

[LightGBM] [Info] Number of positive: 508, number of negative: 4462
[LightGBM] [Info] Number of positive: 515, number of negative: 4455
[LightGBM] [Info] Number of positive: 515, number of negative: 4455
[LightGBM] [Info] Number of positive: 494, number of negative: 4477
[LightGBM] [Info] Number of positive: 521, number of negative: 4450
[LightGBM] [Info] Number of positive: 514, number of negative: 4456[LightGBM] [Info] Number of positive: 514, number of negative: 4456

[LightGBM] [Info] Number of positive: 508, number of negative: 4462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002534 seconds.
You can set `force_col_wise=true` to remove the overhead.[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002528 seconds.
You can set `force_col_wise=t

In [5]:
final_features = ['매출액증가율', '재고자산증가율', 
                  '매출액정상영업이익률', '매출액순이익률', '자기자본순이익률', '자본금순이익률', '기업순이익률', '금융비용 대 총비용비율', 
                  '자기자본구성비율', '당좌비율', '차입금의존도', 
                  '자본금회전률', 
                  '설비투자효율', 
                  '부채대비영업활동현금흐름', 
                  'PER']

X_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)[final_features]
y_train = pd.read_csv('../datasets/상장_train.csv', index_col = 0)['label']
X_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)[final_features]
y_test = pd.read_csv('../datasets/상장_test.csv', index_col = 0)['label']

# 모델 설정 및 하이퍼파라미터
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(), 
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(verbosity=0),
    'LightGBM': LGBMClassifier(verbose=1),
    'CatBoost': CatBoostClassifier(verbose=0)
}

param_grids = {
    'LogisticRegression': {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2']},
    'DecisionTree': {'max_depth': [10, 20, 30], 'min_samples_leaf': [3, 5, 7]},
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [5, 10, 20], 'min_samples_leaf': [3, 5, 8]},
    'XGBoost': {'n_estimators': [100, 150, 200], 'learning_rate': [0.01, 0.05], 'max_depth': [3, 4, 10]},
    'LightGBM': {'max_depth': [5, 7, 9], 'learning_rate': [0.01, 0.05], 'num_leaves': [10, 20, 31]},
    'CatBoost': {'iterations': [200, 250, 300], 'learning_rate': [0.01, 0.05], 'depth': [5, 7, 9]}
}

results = {}

grid_results = {}

results_deep = {}

# 각 모델에 대해 K-fold 교차 검증을 수행하고 결과를 저장합니다.
for model_name, model in models.items():
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())  # y_train을 array로 변환
    best_model = grid_search.best_estimator_
    
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # 결과 저장
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'mean_cv_score': grid_search.best_score_,
        'Train': {
            'Accuracy': accuracy_score(y_train, y_train_pred),
            'Precision': precision_score(y_train, y_train_pred),
            'Recall': recall_score(y_train, y_train_pred),
            'F1': f1_score(y_train, y_train_pred)
        },
        'Test': {
            'Accuracy': accuracy_score(y_test, y_test_pred),
            'Precision': precision_score(y_test, y_test_pred),
            'Recall': recall_score(y_test, y_test_pred),
            'F1': f1_score(y_test, y_test_pred)
        }
    }

# DNN 모델 초기화 및 훈련
dnn_model = create_dnn_model(X_train.shape[1])
dnn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

# DNN 모델 성능 평가
dnn_train_preds = (dnn_model.predict(X_train) > 0.5).astype(int)
dnn_test_preds = (dnn_model.predict(X_test) > 0.5).astype(int)

dnn_train_accuracy = accuracy_score(y_train, dnn_train_preds)
dnn_train_precision = precision_score(y_train, dnn_train_preds)
dnn_train_recall = recall_score(y_train, dnn_train_preds)
dnn_train_f1 = f1_score(y_train, dnn_train_preds)
dnn_train_confusion = confusion_matrix(y_train, dnn_train_preds)

dnn_test_accuracy = accuracy_score(y_test, dnn_test_preds)
dnn_test_precision = precision_score(y_test, dnn_test_preds)
dnn_test_recall = recall_score(y_test, dnn_test_preds)
dnn_test_f1 = f1_score(y_test, dnn_test_preds)
dnn_test_confusion = confusion_matrix(y_test, dnn_test_preds)

# DNN 결과 추가
results_deep['DNN'] = {
    'Train': {'Accuracy': dnn_train_accuracy, 'Precision': dnn_train_precision, 'Recall': dnn_train_recall, 'F1': dnn_train_f1, 'Confusion Matrix': dnn_train_confusion},
    'Test': {'Accuracy': dnn_test_accuracy, 'Precision': dnn_test_precision, 'Recall': dnn_test_recall, 'F1': dnn_test_f1, 'Confusion Matrix': dnn_test_confusion}
}

# 결과 출력
for model_name, model_results in results.items():
    print(f'\n{model_name} Results:')
    print(f'Best Hyperparameters: {model_results["best_params"]}')
    print(f'Mean CV Score: {model_results["mean_cv_score"]:.4f}')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')
               
# 하이퍼 파라미터 결과 출력
for model_name, model_results in grid_results.items():
    best_params = model_results['best_params']  # 최적 파라미터 추출
    mean_cv_score = model_results['mean_cv_score']
    print(f'{model_name} 최적 하이퍼파라미터:')
    print(best_params)
    print(f'{model_name} 교차 검증 정확도:')
    print(f'{mean_cv_score:.4f}')
    print()  # 출력을 구분하기 위한 빈 줄 추가

# DNN 결과 출력    

for model_name, model_results in results_deep.items():
    print(f'\n{model_name} Results:')
    for dataset in ['Train', 'Test']:
        print(f'  {dataset} Metrics:')
        metrics = model_results[dataset]
        print(f'    Accuracy: {metrics["Accuracy"]:.4f}')
        print(f'    Precision: {metrics["Precision"]:.4f}')
        print(f'    Recall: {metrics["Recall"]:.4f}')
        print(f'    F1 Score: {metrics["F1"]:.4f}')

[LightGBM] [Info] Number of positive: 515, number of negative: 4455
[LightGBM] [Info] Number of positive: 521, number of negative: 4450
[LightGBM] [Info] Number of positive: 508, number of negative: 4462
[LightGBM] [Info] Number of positive: 494, number of negative: 4477[LightGBM] [Info] Number of positive: 514, number of negative: 4456

[LightGBM] [Info] Number of positive: 515, number of negative: 4455[LightGBM] [Info] Number of positive: 508, number of negative: 4462
[LightGBM] [Info] Number of positive: 514, number of negative: 4456

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

[Lig