In [14]:
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [15]:
# Columns stripped from both splits before modelling. 'index' is the column
# reset_index() emits for the index loaded via index_col=0 (assumes that index
# is unnamed in the workbook -- TODO confirm); the remaining three are the
# company name, exchange code and fiscal year identifiers.
dropped_columns = ['index', '회사명', '거래소코드', '회계년도']

# Load each split and apply the identical clean-up chain to both.
df_train = (
    pd.read_excel('./train.xlsx', index_col=0)
    .reset_index()
    .drop(columns=dropped_columns)
)
df_test = (
    pd.read_excel('./test.xlsx', index_col=0)
    .reset_index()
    .drop(columns=dropped_columns)
)

In [16]:
# Full pool of candidate predictor columns (52 names), one per line so that
# additions and removals show up as single-line diffs.
# NOTE(review): no visible cell reads this list; the modelling cell indexes the
# frames with a hand-written ten-column subset of it.
selected_columns = [
    '유동비율(%)',
    '당좌비율(%)',
    '비유동비율(%)',
    '부채비율(%)',
    '유동부채비율(%)',
    '차입금의존도(%)',
    '이익잉여금비율(%)',
    '순운전자본비율(%)',
    '비유동장기적합률(%)',
    '이자보상배율(이자비용)(배)',
    '외화포지션(배)',
    '총자본순이익률(%)',
    '자기자본순이익률(%)',
    '경영자본순이익률(%)',
    '매출액순이익률(%)',
    '총자본정상영업이익률(%)',
    '자기자본정상영업이익률(%)',
    '매출액정상영업이익률(%)',
    '금융비용부담률(%)',
    '총자산회전율(배)',
    '매출채권회전률(배)',
    '당좌자산회전률(배)',
    '재고자산회전률(배)',
    '유형자산회전율(배)',
    '매입채무회전률(배)',
    '자기자본회전률(배)',
    '경영자본회전률(배)',
    '유동자산증가율(%)',
    '매출액증가율(%)',
    '정상영업이익증가율(%)',
    '순이익증가율(%)',
    'spread(%)',
    'PPI(2015기준)',
    '실질GDP성장률(%)',
    'EV/EBITDA(배)',
    '52주베타(배)',
    'PER',
    'PBR',
    'PCR',
    'CASH FLOW 대 부채비율(%)',
    '총자본투자효율(%)',
    'log_평균총자산',
    'K2_score',
    '최대주주특수관계인_지분율(%)',
    '최대주주변경횟수(회)',
    '감사법인변경횟수(회)',
    '업력(년)',
    '평균근속연수(년)',
    '직원수증감율(%)',
    '기간제비율(%)',
    'R&D집중도(%)',
    '수출기업여부(dummy)',
]

In [17]:
# The ten predictors used by every model. Declared once so the train and test
# matrices always carry the same columns in the same order.
feature_columns = [
    '차입금의존도(%)',
    '순운전자본비율(%)',
    '총자본순이익률(%)',
    '자기자본정상영업이익률(%)',
    '금융비용부담률(%)',
    '총자산회전율(배)',
    'log_평균총자산',
    'K2_score',
    '최대주주특수관계인_지분율(%)',
    '감사법인변경횟수(회)',
]
# Name of the label column shared by both splits.
target_column = 'Target'

# Feature matrix / label vector for each split.
X_train, y_train = df_train[feature_columns], df_train[target_column]
X_test, y_test = df_test[feature_columns], df_test[target_column]

In [18]:
# One default-configured instance of each candidate feature scaler. Which of
# them actually take part in the sweep is decided by `scaling_methods`.
scaler_minmax, scaler_standard, scaler_robust = (
    MinMaxScaler(),
    StandardScaler(),
    RobustScaler(),
)

In [19]:
# Base estimators compared in the sweep. Each one receives the same fixed seed
# and otherwise keeps its library defaults.
model_seed = 42

dt_classifier = DecisionTreeClassifier(random_state=model_seed)
rf_classifier = RandomForestClassifier(random_state=model_seed)
xgb_classifier = XGBClassifier(random_state=model_seed)
lgbm_classifier = LGBMClassifier(random_state=model_seed)
logistic_regression = LogisticRegression(random_state=model_seed)

In [20]:
# (display name, estimator) pairs iterated by the sweep. The display name is
# also the key used to look up the estimator's grid in `param_grids`.
classifiers = [
    ("Decision Tree", dt_classifier),
    ("Random Forest", rf_classifier),
    ("XGBoost", xgb_classifier),
    ("LightGBM", lgbm_classifier),
    ("Logistic Regression", logistic_regression),
]

In [21]:
# (display name, scaler) pairs iterated by the sweep. Only min-max scaling is
# active; the other two scalers are instantiated but deliberately left out.
# Uncomment an entry to include it in the sweep.
scaling_methods = [
    ("MinMaxScaler", scaler_minmax),
    # ("StandardScaler", scaler_standard),
    # ("RobustScaler", scaler_robust),
]


In [22]:
# Hyper-parameter search spaces, keyed by the display names used in
# `classifiers`. Every parameter carries the `classifier__` prefix so that
# GridSearchCV routes it to the pipeline step named 'classifier'.
param_grids = {
    'Decision Tree': {
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2'],
        # LogisticRegression's default 'lbfgs' solver does not support
        # penalty='l1', so every l1 candidate would fail to fit and be scored
        # as NaN. 'liblinear' handles both 'l1' and 'l2', which makes the whole
        # grid valid.
        'classifier__solver': ['liblinear']
    }
}


In [23]:
# Sweep every (scaler, classifier) pair: tune the pipeline on the training set
# with 5-fold cross-validated F1, then print a classification report for the
# best pipeline on the test set.
for scaler_name, scaler in scaling_methods:
    print(f"Scaling method: {scaler_name}")

    for clf_name, clf in classifiers:
        # Scale -> oversample with SMOTE -> classify. SMOTE lives inside the
        # imblearn pipeline so it is part of what each CV fold fits.
        pipeline = ImbPipeline(steps=[
            ('scaler', scaler),
            ('smote', SMOTE(random_state=42, sampling_strategy=1)),
            ('classifier', clf),
        ])

        # Classifiers without a registered grid are skipped without a message.
        if clf_name not in param_grids:
            continue

        param_grid = param_grids[clf_name]
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring='f1',
            cv=5,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        print(f"Best parameters found for {clf_name}:")
        print(grid_search.best_params_)
        print("\nBest F1 score found:")
        print(grid_search.best_score_)

        # Predict and evaluate on the test set with the tuned pipeline.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        report = classification_report(y_true=y_test, y_pred=y_pred)
        print(report)
        print("=" * 50)

Scaling method: MinMaxScaler
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters found for Decision Tree:
{'classifier__max_depth': 10, 'classifier__min_samples_split': 5}

Best F1 score found:
0.4595728202048008
              precision    recall  f1-score   support

           0       0.97      0.83      0.89      3283
           1       0.22      0.63      0.32       244

    accuracy                           0.82      3527
   macro avg       0.59      0.73      0.61      3527
weighted avg       0.92      0.82      0.85      3527

Fitting 5 folds for each of 27 candidates, totalling 135 fits
