In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings

In [2]:
data = pd.read_csv("data.csv")

In [3]:
data['Bankrupt?'].value_counts() #불균형 데이터

0    6599
1     220
Name: Bankrupt?, dtype: int64

In [5]:
#over sampling - SMOTE

from imblearn.over_sampling import *

X = data.iloc[:,1:]
y = data.iloc[:,0]

X_res, y_res = SMOTE(random_state=50).fit_resample(X,y)

print('the shape of X_res: ', X_res.shape)
print('the shape of y_res: ', y_res.shape)

print('counts of label 1: ', sum(y_res==1))
print('counts of label 0: ', sum(y_res==0))

the shape of X_res:  (13198, 95)
the shape of y_res:  (13198,)
counts of label 1:  6599
counts of label 0:  6599


In [6]:
#2) 피쳐 스케일링

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_sc = scaler.fit_transform(X_res)
X_res_scaled = pd.DataFrame(X_sc, columns=X.columns)
X_res_scaled.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,-1.207936,-0.93872,-1.258199,-0.11667,-0.117637,0.024488,-0.007381,-0.004163,-0.072419,-0.018256,...,-0.914078,-0.074778,-0.094828,-0.116814,-0.126592,0.090532,-0.092783,-0.123352,0.0,-0.428297
1,0.010306,0.301795,0.088997,0.451067,0.450975,0.022007,0.045342,0.04537,0.037382,0.038315,...,0.317506,-0.074778,-0.012923,0.45127,0.147421,-0.07447,16.550281,0.473038,0.0,-0.335235
2,-0.486625,-0.125374,-0.450401,-0.117136,-0.123706,0.012542,-0.059112,-0.046687,-0.146173,-0.094343,...,-0.006316,-0.074778,0.007139,-0.11707,0.074943,0.090185,-0.095994,-0.156852,0.0,-0.428181
3,-0.827632,-0.645821,-0.627168,-1.275452,-1.278203,-0.004291,0.001155,0.011587,0.012418,0.00955,...,-0.557576,-0.074778,-0.089555,-1.275568,0.027822,-0.129631,-0.086069,-0.063633,0.0,-0.266647
4,0.019814,0.304172,0.156584,-0.289602,-0.290835,0.024908,0.043833,0.045672,0.027554,0.043099,...,0.313091,-0.074778,-0.026812,-0.28962,0.1475,-0.212897,-0.222079,1.00291,0.0,-0.019062


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res_scaled, y_res, test_size = 0.3, random_state=1)

In [11]:
from sklearn import neighbors  #KNN
from sklearn.svm import SVC    #SVC
from sklearn.linear_model import LogisticRegression  #LogisticRegression
from sklearn.tree import DecisionTreeClassifier  #DecisionTree
from sklearn.naive_bayes import GaussianNB   #가우시안
from sklearn.ensemble import RandomForestClassifier #RandomForest
from sklearn.ensemble import AdaBoostClassifier   #Adaboost
from sklearn.ensemble import GradientBoostingClassifier   #GradientBoost
from xgboost import XGBClassifier   #XGBoost
from lightgbm import LGBMClassifier  #Light GBM
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

knn = neighbors.KNeighborsClassifier()
lr = LogisticRegression()
tree = DecisionTreeClassifier()
GNB = GaussianNB()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
gb = GradientBoostingClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()

for model in [knn, lr, tree, GNB, rf, ada, gb, xgb, lgbm]:
    model.fit(X_train, y_train)
    pred_model = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred_model)
    
    print('{0}: {1: .4f}'.format(model, accuracy))

KNeighborsClassifier():  0.9389
LogisticRegression():  0.9000
DecisionTreeClassifier():  0.9442
GaussianNB():  0.6705
RandomForestClassifier():  0.9747
AdaBoostClassifier():  0.9321
GradientBoostingClassifier():  0.9518
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None):  0.9826
LGBMClassifier():  0.9818


In [13]:
#1) Randomforest

from sklearn.model_selection import GridSearchCV

params = {'n_estimators':[100],
         'min_samples_leaf':[1,2,4],
         'min_samples_split':[2,4,8]}

rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid=params, cv=2)
grid_rf.fit(X_train, y_train)

print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
0.9641697337085949


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

pred_rf = grid_rf.predict(X_test)
accuracy = accuracy_score(y_test, pred_rf)
precision = precision_score(y_test, pred_rf)
recall = recall_score(y_test, pred_rf)
f1 = f1_score(y_test, pred_rf)
                          
print('정확도: {0: .4f}, 정밀도: {1: .4f}, 재현율: {2: .4f}, f1 score: {3: .4f} '.format(accuracy, precision, recall, f1))

정확도:  0.9753, 정밀도:  0.9564, 재현율:  0.9954, f1 score:  0.9755 


In [17]:
#2) XGBoost

params = {'n_estimators':[500],
         'learning_rate':[0.1, 0.3,0.5],
         'max_depth':[4,6,8]}

xgb = XGBClassifier()
grid_xgb = GridSearchCV(xgb, param_grid=params, cv=2)
grid_xgb.fit(X_train, y_train)

print(grid_xgb.best_params_)
print(grid_xgb.best_score_)

{'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 500}
0.9768348127300281


In [18]:
pred_xgb = grid_xgb.predict(X_test)
accuracy = accuracy_score(y_test, pred_xgb)
precision = precision_score(y_test, pred_xgb)
recall = recall_score(y_test, pred_xgb)
f1 = f1_score(y_test, pred_xgb)
                          
print('정확도: {0: .4f}, 정밀도: {1: .4f}, 재현율: {2: .4f}, f1 score: {3: .4f} '.format(accuracy, precision, recall, f1))

정확도:  0.9841, 정밀도:  0.9707, 재현율:  0.9980, f1 score:  0.9842 


In [21]:
#2) LGBM

grid_lgbm = LGBMClassifier(learning_rate=0.3, max_depth=6, n_estimators=500)
grid_lgbm.fit(X_train, y_train)

LGBMClassifier(learning_rate=0.3, max_depth=6, n_estimators=500)

In [22]:
pred_lgbm = grid_lgbm.predict(X_test)
accuracy = accuracy_score(y_test, pred_lgbm)
precision = precision_score(y_test, pred_lgbm)
recall = recall_score(y_test, pred_lgbm)
f1 = f1_score(y_test, pred_lgbm)
      
print('정확도: {0: .4f}, 정밀도: {1: .4f}, 재현율: {2: .4f}, f1 score: {3: .4f} '.format(accuracy, precision, recall, f1))

정확도:  0.9902, 정밀도:  0.9805, 재현율:  1.0000, f1 score:  0.9902 
