In [13]:
import time

import numpy as np
import pandas as pd
from sklearn import svm, datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, get_scorer, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import imb_modeling

Load dataset

In [2]:
# Read the three SVD feature tables; in every file the first CSV column is
# used as the DataFrame index.
svd8, svd_z5, svd_z9 = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/svd_8.csv', "data/svd_z_5.csv", "data/svd_z_9.csv")
]

In [3]:
# (rows, columns) of each loaded table, in load order
tuple(frame.shape for frame in (svd8, svd_z5, svd_z9))

((6819, 9), (6819, 6), (6819, 10))

In [4]:
# Preview the first rows of svd8: the axis1..axis8 feature columns plus the Bankrupt label
svd8.head()

Unnamed: 0,axis1,axis2,axis3,axis4,axis5,axis6,axis7,axis8,Bankrupt
0,5995932000.0,2565263000.0,1998551000.0,-1556894000.0,-2872628000.0,-1131120000.0,-619952800.0,-3087006000.0,1
1,10464120000.0,-2025294000.0,7646973000.0,2906900000.0,-1599878000.0,-1508080000.0,-205418800.0,-2352258000.0,1
2,5855594000.0,-2207877000.0,-2404231000.0,-1810148000.0,-2697782000.0,-1499563000.0,1818762000.0,-390834900.0,1
3,10717620000.0,1341745000.0,6819712000.0,743015300.0,-4009937000.0,2112051000.0,7794637000.0,4507580000.0,1
4,9244816000.0,7462440000.0,-1875104000.0,3030693000.0,-2042418000.0,-325425500.0,-359603400.0,2267177000.0,1


Hyperparameter search grids

In [5]:
# Search grid for the support-vector classifier: kernel type and the
# regularization parameter C
SVM_params = dict(
    kernel=('linear', 'rbf'),
    C=[1, 10],
)

# Parameters for Decision tree
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 10),
    # An integer min_samples_split must be >= 2 in scikit-learn; a value of 1
    # is rejected when the tree is fitted, so the search starts at 2.
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(1, 5),
}

# Parameters for Random Forest
rf_params = {
    # Number of trees in random forest: 200, 400, ..., 2000
    'n_estimators': list(range(200, 2001, 200)),
    # Number of features to consider at every split. 'auto' used to be an alias
    # of 'sqrt' for classifiers (so the old grid fitted the same setting twice)
    # and is no longer accepted by recent scikit-learn releases; 'log2' is
    # offered as the distinct alternative instead.
    'max_features': ['sqrt', 'log2'],
    # Maximum number of levels in tree: 10, 20, ..., 110
    'max_depth': list(range(10, 111, 10)),
    # Minimum number of samples required to split a node
    'min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'min_samples_leaf': [1, 2, 4],
    # Method of selecting samples for training each tree
    'bootstrap': [True, False],
}

# Parameters for gradient boosting
gb_params = {
    # "deviance" was renamed to "log_loss" (the old name is rejected by recent
    # scikit-learn releases; "log_loss" needs scikit-learn >= 1.1).
    "loss": ["log_loss"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    # Floats are interpreted by scikit-learn as fractions of the training set.
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    # "mae" is no longer a valid split criterion for gradient boosting; the
    # supported options are "friedman_mse" and "squared_error".
    "criterion": ["friedman_mse", "squared_error"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10],
}

# XGBoost: base estimator (binary logistic objective, 4 threads, fixed seed)
# followed by the grid of values to search over.
xgb_estimator = XGBClassifier(objective='binary:logistic', nthread=4, seed=421)

xgb_params = dict(
    max_depth=range(2, 10, 1),
    n_estimators=range(60, 220, 40),
    min_child_weight=[1, 3, 5],
    learning_rate=[0.1, 0.01, 0.05],
)

In [6]:
# (estimator, parameter grid) pairs, consumed in this order by the modeling loop.
# The tree-based estimators are randomized, so they are seeded to make reruns
# reproducible; 421 is the same seed already given to xgb_estimator.
MODEL_SEED = 421

models = [
    (svm.SVC(), SVM_params),
    (DecisionTreeClassifier(random_state=MODEL_SEED), dt_params),
    (RandomForestClassifier(random_state=MODEL_SEED), rf_params),
    (GradientBoostingClassifier(random_state=MODEL_SEED), gb_params),
    (xgb_estimator, xgb_params),
]

In [7]:
# Scorers keyed by metric name. Accuracy, precision and recall are computed
# from hard class predictions. ROC AUC has to rank samples by a continuous
# score, so it uses scikit-learn's predefined 'roc_auc' scorer (which reads
# decision_function / predict_proba) instead of make_scorer(roc_auc_score),
# which would feed the metric the 0/1 output of predict().
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': get_scorer('roc_auc'),
}

Modeling

In [14]:
# Fit every (estimator, parameter grid) pair on the svd8 table and collect
# whatever imb_modeling.imb_pipe_fit returns, in the same order as `models`.
#
# NOTE(review): the recorded run of this cell was interrupted while fitting the
# first model (SVC). The svd8 features are on the order of 1e9, and an unscaled
# SVC can be extremely slow to converge — confirm that imb_pipe_fit scales
# its inputs. The random-forest and gradient-boosting grids are also very large
# if the search is exhaustive — TODO confirm the search strategy.

# The feature/target split is the same for every model, so build it once.
X_svd8 = svd8.drop(columns='Bankrupt')
y_svd8 = svd8['Bankrupt']

res = []
for model, param in models:
    # perf_counter is a monotonic clock meant for measuring elapsed time
    start = time.perf_counter()
    print(f'Training: {model}')
    # Keys get a "model__" prefix — presumably the estimator is a pipeline step
    # named "model" inside imb_pipe_fit; verify against imb_modeling.
    key_modified_params = {f"model__{key}": val for key, val in param.items()}
    output = imb_modeling.imb_pipe_fit(model, key_modified_params, X=X_svd8, y=y_svd8)
    res.append(output)
    end = time.perf_counter()
    print(output)
    print(f'Time: {end - start}')

Training: SVC()


KeyboardInterrupt: 