# Pharmacy claim approval(model selection)
Jeeuhn Kim

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score,recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [19]:
#loading data
claim = pd.read_csv('dim_claims.csv')
claim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335576 entries, 0 to 1335575
Data columns (total 5 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   dim_claim_id             1335576 non-null  int64  
 1   bin                      1335576 non-null  int64  
 2   drug                     1335576 non-null  object 
 3   reject_code              555951 non-null   float64
 4   pharmacy_claim_approved  1335576 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 50.9+ MB


In [20]:
# change bin numbers to letters to simplify notation

claim.loc[claim.bin ==  417380, 'bin'] = 'x'
claim.loc[claim.bin ==  417614, 'bin'] = 'y'
claim.loc[claim.bin ==  417740, 'bin'] = 'z'
claim.loc[claim.bin ==  999001, 'bin'] = 'w'

In [21]:
# one hot encoding of categorical variable 'drug'
for i in pd.get_dummies(claim.drug).columns :
    claim.loc[:,i] = pd.get_dummies(claim.drug).loc[:,i].copy()

In [22]:
# one hot encoding of categorical variable 'bin'
for i in pd.get_dummies(claim.bin).columns :
    claim.loc[:,i] = pd.get_dummies(claim.bin).loc[:, i].copy()

In [23]:
# add interaction terms
for i in pd.get_dummies(claim.drug).columns :
    for j in pd.get_dummies(claim.bin) : 
        claim.loc[:,i+str(j)] = pd.get_dummies(claim.drug).loc[:,i].copy() * pd.get_dummies(claim.bin).loc[:,j].copy()


In [24]:
claim.columns

Index(['dim_claim_id', 'bin', 'drug', 'reject_code', 'pharmacy_claim_approved',
       'A', 'B', 'C', 'w', 'x', 'y', 'z', 'Aw', 'Ax', 'Ay', 'Az', 'Bw', 'Bx',
       'By', 'Bz', 'Cw', 'Cx', 'Cy', 'Cz'],
      dtype='object')

In [25]:
# make a new dataframe that only involves one hot encoded features
X = claim.iloc[:, 5:12].copy()
# make a new dataframe that only involves target variable
y = claim.iloc[:, 4].copy()
# make a new dataframe that only one hot encoded features and interaction terms
X_inter = claim.iloc[:, 5:].copy()

In [26]:
# split data into training set and test set with test set size =.2 stratified by the target classes
X_train, X_test, X_inter_train, X_inter_test, y_train, y_test = train_test_split(X.copy(),X_inter.copy(), y.copy(),
                                                                                 shuffle = True,
                                                                                 random_state = 623,
                                                                                 test_size = .2,
                                                                                 stratify = claim.pharmacy_claim_approved)

In [27]:
n = np.empty(9)
n = np.nan
cv = {'accuracy score': n}
# make a dataframe that stores KPIs
kpis = pd.DataFrame(data=cv, 
                       index=['Base','Log', 'LDA','SVM','DT','RF','ET','Ada','XGB'])
# make a dataframe that stores tuned hyperparameters
paras = pd.DataFrame(data=cv, 
                       index=['Base','Log', 'LDA','SVM','DT','RF','ET','Ada','XGB'])

In [28]:
# make kfold object
kfold = StratifiedKFold(n_splits = 5,
                        shuffle = True,
                        random_state = 623)

# Baseline model

In [29]:
# the baseline model predicts that all pharmacy claims will be approved
acses = np.zeros(5)
pres = np.zeros(5)
tprs = np.zeros(5)
fprs = np.zeros(5)
f1s = np.zeros(5)
aucs = np.zeros(5)
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    
    pred = np.ones((len(y_ho)))
    acses[i] = accuracy_score(y_ho.values, pred)
    conf_mat = confusion_matrix(y_ho, pred)
    pres[i] = precision_score(y_ho, pred)
    tprs[i] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
    fprs[i] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
    f1s[i] = f1_score(y_ho, pred)
    aucs[i] = roc_auc_score(y_ho, pred)
    i = i+1

In [30]:
# storing KPIs
kpis.loc['Base', 'accuracy_score'] = round(acses.mean(),3)
kpis.loc['Base', 'precision'] = round(pres.mean(),3)
kpis.loc['Base', 'true_positive_rate'] = round(tprs.mean(),3)
kpis.loc['Base', 'false_positive_rate'] = round(fprs.mean(),3)
kpis.loc['Base', 'f1'] = round(f1s.mean(),3)
kpis.loc['Base', 'roc_auc'] = round(aucs.mean(),3)

# Logistic regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
# cross validation for various regularization constants and
# various penalty norms
Cs =[.001, .1, 1, 10, 100]
penalties = ['none', 'l2']

acses = np.zeros((5, len(Cs),len(penalties)))
pres = np.zeros((5, len(Cs),len(penalties)))
tprs = np.zeros((5, len(Cs),len(penalties)))
fprs = np.zeros((5, len(Cs),len(penalties)))
f1s = np.zeros((5, len(Cs),len(penalties)))
aucs = np.zeros((5, len(Cs),len(penalties)))


i=0
for train_index, test_index in kfold.split(X_inter_train, y_train) :
    j=0 
    for C in Cs :
        k=0
        for penalty in penalties :
            X_tt = X_inter_train.iloc[train_index, :].copy()
            X_ho = X_inter_train.iloc[test_index, :].copy()
            y_tt = y_train.iloc[train_index].copy()
            y_ho = y_train.iloc[test_index].copy()
            
            log_reg = LogisticRegression(penalty = penalty, C=C, max_iter = 100)
            log_reg.fit(X_tt.values, y_tt.values)
            pred = log_reg.predict(X_ho.values)
            pred_proba = log_reg.predict_proba(X_ho.values)[:, 1]
            acses[i,j,k] = accuracy_score(y_ho.values, pred)
            conf_mat = confusion_matrix(y_ho, pred)
            pres[i,j,k] = precision_score(y_ho, pred)
            tprs[i,j,k] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
            fprs[i,j,k] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
            f1s[i,j,k] = f1_score(y_ho, pred)
            aucs[i,j,k] = roc_auc_score(y_ho, pred_proba)
            k=k+1
        j=j+1      
    i=i+1        

In [33]:
# accuracy_score
cv = np.mean(acses, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'accuracy_score'] = round(cv_max,3)
paras.loc['Log', 'accuracy_score'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])





# precision
cv = np.mean(pres, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'precision'] = round(cv_max,3)
paras.loc['Log', 'precision'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])





# true_positive_rate
cv = np.mean(tprs, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'true_positive_rate'] = round(cv_max,3)
paras.loc['Log', 'true_positive_rate'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])







# false_positive_rate
cv = np.mean(fprs, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'false_positive_rate'] = round(cv_max,3)
paras.loc['Log', 'false_positive_rate'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])








# f1_score
cv = np.mean(f1s, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'f1'] = round(cv_max,3)
paras.loc['Log', 'f1'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])







# roc_auc
cv = np.mean(aucs, axis = 0)
cv_max = np.max(np.max(cv, axis = 0),axis = 0)
penalty_index = np.argwhere(cv == cv_max)[0][0]
C_index = np.argwhere(cv == cv_max)[0][1]
kpis.loc['Log', 'roc_auc'] = round(cv_max,3)
paras.loc['Log', 'roc_auc'] = 'penalty='+ penalties[penalty_index] +', C='+ str(Cs[C_index])



# Linear discriminant analysis

In [34]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [35]:
# cross validation process
acses = np.zeros((5,1))
pres = np.zeros((5,1))
tprs = np.zeros((5,1))
fprs = np.zeros((5,1))
f1s = np.zeros((5,1))
aucs = np.zeros((5,1))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()       
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_tt.values, y_tt.values)
    pred = lda.predict(X_ho.values)
    pred_proba = lda.predict_proba(X_ho.values)[:, 1]
    acses[i,0] = accuracy_score(y_ho.values, pred)
    conf_mat = confusion_matrix(y_ho, pred)
    pres[i] = precision_score(y_ho, pred)
    tprs[i] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
    fprs[i] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
    f1s[i] = f1_score(y_ho, pred)
    aucs = roc_auc_score(y_ho, pred_proba)
    i=i+1

In [36]:
# store KPIs
kpis.loc['LDA', 'accuracy_score'] = round(acses.mean(),3)
kpis.loc['LDA', 'precision'] = round(pres.mean())
kpis.loc['LDA','true_positive_rate'] = round(tprs.mean(),3)
kpis.loc['LDA', 'false_positive_rate'] = round(fprs.mean(),3)
kpis.loc['LDA', 'f1'] = round(f1s.mean(), 3)
kpis.loc['LDA', 'roc_auc'] = round(aucs.mean(), 3)

# Support vector machine

In [37]:
from sklearn.svm import SVC

In [38]:
# cross validation for various margins in support vector machine
margins = [.1, 1, 10, 100]
acses = np.zeros((5,len(margins)))
pres = np.zeros((5,len(margins)))
tprs = np.zeros((5,len(margins)))
fprs = np.zeros((5,len(margins)))
f1s = np.zeros((5,len(margins)))
aucs = np.zeros((5,len(margins)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for margin in margins : 
        svc = SVC(C = margin, max_iter = 10, probability = True)
        svc.fit(X_tt.values, y_tt.values)
        pred_proba = svc.decision_function(X_ho)
        pred_proba = (pred_proba - pred_proba.min()) / (pred_proba.max() - pred_proba.min())
        acses[i,j] = accuracy_score(y_ho.values, 1*(pred_proba >= .5))
        conf_mat = confusion_matrix(y_ho, 1*(pred_proba >= .5))
        pres[i,j] = precision_score(y_ho, 1*(pred_proba >= .5))
        tprs[i,j] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
        fprs[i,j] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
        f1s[i,j] = f1_score(y_ho, 1*(pred_proba >= .5))
        aucs[i,j] = roc_auc_score(y_ho, pred_proba)
        j=j+1
    i=i+1

In [39]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'accuracy_score'] = round(max_cv,3)
paras.loc['SVM', 'accuracy_score'] = 'margin='+ str(margins[margin_index])





# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'precision'] = round(max_cv,3)
paras.loc['SVM', 'precision'] = 'margin='+ str(margins[margin_index])






# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'true_positive_rate'] = round(max_cv,3)
paras.loc['SVM', 'true_positive_rate'] = 'margin='+ str(margins[margin_index])





# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'false_positive_rate'] = round(max_cv,3)
paras.loc['SVM', 'false_positive_rate'] = 'margin='+ str(margins[margin_index])






# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'f1'] = round(max_cv,3)
paras.loc['SVM', 'f1'] = 'margin='+ str(margins[margin_index])





# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(cv, axis = 0)
margin_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['SVM', 'roc_auc'] = round(max_cv,3)
paras.loc['SVM', 'roc_auc'] = 'margin='+ str(margins[margin_index])

# Decision tree

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
# cross validation for various max_depths 
max_depths = np.linspace(1, 32, 32, endpoint=True)
acses = np.zeros((5,len(max_depths)))
pres = np.zeros((5,len(max_depths)))
tprs = np.zeros((5,len(max_depths)))
fprs = np.zeros((5,len(max_depths)))
f1s = np.zeros((5,len(max_depths)))
aucs = np.zeros((5,len(max_depths)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for max_depth in max_depths :
        dt = DecisionTreeClassifier(max_depth = max_depth)
        dt.fit(X_tt.values, y_tt.values)
        pred = dt.predict(X_ho.values)
        pred_proba = dt.predict_proba(X_ho.values)[:,1]
        acses[i,j] = accuracy_score(y_ho.values, pred)
        conf_mat = confusion_matrix(y_ho, pred)
        pres[i,j] = precision_score(y_ho, pred)
        tprs[i,j] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
        fprs[i,j] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
        f1s[i,j] = f1_score(y_ho, pred)
        aucs[i,j] = roc_auc_score(y_ho, pred_proba)
        j=j+1
    i=i+1

In [42]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'accuracy_score'] = round(max_cv,3)
paras.loc['DT', 'accuracy_score'] = 'max_depth='+ str(max_depths[depth_index])





# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'precision'] = round(max_cv,3)
paras.loc['DT', 'precision'] = 'max_depth='+ str(max_depths[depth_index])




# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'true_positive_rate'] = round(max_cv,3)
paras.loc['DT', 'true_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])





# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'false_positive_rate'] = round(max_cv,3)
paras.loc['DT', 'false_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])





# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'f1'] = round(max_cv,3)
paras.loc['DT', 'f1'] = 'max_depth='+ str(max_depths[depth_index])





# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['DT', 'roc_auc'] = round(max_cv,3)
paras.loc['DT', 'roc_auc'] = 'max_depth='+ str(max_depths[depth_index])

# Random forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
# cross validation for various max_depths
max_depths = [3 ,4, 5, 6]
acses = np.zeros((5,len(max_depths)))
pres = np.zeros((5,len(max_depths)))
tprs = np.zeros((5,len(max_depths)))
fprs = np.zeros((5,len(max_depths)))
f1s = np.zeros((5,len(max_depths)))
aucs = np.zeros((5,len(max_depths)))
drug_score = np.zeros((5,len(max_depths)))
bin_score = np.zeros((5,len(max_depths)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for max_depth in max_depths :
        rf = RandomForestClassifier(n_estimators = 100,
                                    max_depth = max_depth,
                                    random_state = 623)
        rf.fit(X_tt.values,y_tt.values)
        pred = rf.predict(X_ho.values)
        pred_proba = rf.predict_proba(X_ho.values)[:,1]
        acses[i,j] = accuracy_score(y_ho.values, pred)
        conf_mat = confusion_matrix(y_ho, pred)
        pres[i,j] = precision_score(y_ho, pred)
        tprs[i,j] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
        fprs[i,j] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
        f1s[i,j] = f1_score(y_ho.values, pred)
        aucs[i,j] = roc_auc_score(y_ho.values, pred_proba)
        drug_score[i,j] = rf.feature_importances_[:3].sum()
        bin_score[i,j] = rf.feature_importances_[3:].sum()
        j=j+1
    i=i+1        

In [45]:
# random forest feature importances
for i in range(len(max_depths)) :
    score_df = pd.DataFrame({'feature':
                             ['drug', 'bin'],
                             'importance_score': 
                             [np.mean(drug_score,axis = 0)[i], np.mean(bin_score,axis=0)[i]]})
    score_df.sort_values('importance_score',ascending=False)
    print('max_depth='+ str(max_depths[i]))
    print(score_df)

max_depth=3
  feature  importance_score
0    drug          0.237752
1     bin          0.762248
max_depth=4
  feature  importance_score
0    drug          0.259754
1     bin          0.740246
max_depth=5
  feature  importance_score
0    drug          0.275184
1     bin          0.724816
max_depth=6
  feature  importance_score
0    drug          0.275184
1     bin          0.724816


In [46]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'accuracy_score'] = round(max_cv,3)
paras.loc['RF', 'accuracy_score'] = 'max_depth='+ str(max_depths[depth_index])





# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'precision'] = round(max_cv,3)
paras.loc['RF', 'precision'] = 'max_depth='+ str(max_depths[depth_index])






# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'true_positive_rate'] = round(max_cv,3)
paras.loc['RF', 'true_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])





# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'false_positive_rate'] = round(max_cv,3)
paras.loc['RF', 'false_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])




# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'f1'] = round(max_cv,3)
paras.loc['RF', 'f1'] = 'max_depth='+ str(max_depths[depth_index])





# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['RF', 'roc_auc'] = round(max_cv,3)
paras.loc['RF', 'roc_auc'] = 'max_depth='+ str(max_depths[depth_index])

# ExtraTrees

In [47]:
from sklearn.ensemble import ExtraTreesClassifier

In [48]:
# cross validation for various max_depths
max_depths = [3, 4, 5, 6]
acses = np.zeros((5,len(max_depths)))
pres = np.zeros((5,len(max_depths)))
tprs = np.zeros((5,len(max_depths)))
fprs = np.zeros((5,len(max_depths)))
f1s = np.zeros((5,len(max_depths)))
aucs = np.zeros((5,len(max_depths)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) :
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for max_depth in max_depths :
        et = ExtraTreesClassifier(n_estimators = 100,
                                  max_depth = max_depth,
                                  random_state = 623)
        et.fit(X_tt.values, y_tt.values)
        pred = et.predict(X_ho.values)
        pred_proba = et.predict_proba(X_ho.values)[:,1]
        acses[i,j] = accuracy_score(y_ho.values, pred)
        conf_mat = confusion_matrix(y_ho, pred)
        pres[i,j] = precision_score(y_ho, pred)
        tprs[i,j] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
        fprs[i,j] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
        f1s[i,j] = f1_score(y_ho, pred)
        aucs[i,j] = roc_auc_score(y_ho, pred_proba)
        j=j+1
    i=i+1

In [49]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'accuracy_score'] = round(max_cv,3)
paras.loc['ET', 'accuracy_score'] = 'max_depth='+ str(max_depths[depth_index])





# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'precision'] = round(max_cv,3)
paras.loc['ET', 'precision'] = 'max_depth='+ str(max_depths[depth_index])








# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'true_positive_rate'] = round(max_cv,3)
paras.loc['ET', 'true_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])







# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'false_positive_rate'] = round(max_cv,3)
paras.loc['ET', 'false_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])




# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'f1'] = round(max_cv,3)
paras.loc['ET', 'f1'] = 'max_depth='+ str(max_depths[depth_index])





# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(cv, axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['ET', 'roc_auc'] = round(max_cv,3)
paras.loc['ET', 'roc_auc'] = 'max_depth='+ str(max_depths[depth_index])

# AdaBoost

In [50]:
from sklearn.ensemble import AdaBoostClassifier

In [51]:
# cross validation for various max_depths
max_depths = [3, 4, 5]
acses = np.zeros((5,len(max_depths)))
pres = np.zeros((5,len(max_depths)))
tprs = np.zeros((5,len(max_depths)))
fprs = np.zeros((5,len(max_depths)))
f1s = np.zeros((5,len(max_depths)))
aucs = np.zeros((5,len(max_depths)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for max_depth in max_depths :
        ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth = max_depth),
                                 n_estimators = 100,
                                 random_state = 623)
        ada.fit(X_tt.values, y_tt.values)
        pred = ada.predict(X_ho.values)
        pred_proba = ada.predict_proba(X_ho.values)[:,1]
        acses[i,j] = accuracy_score(y_ho.values, pred)
        conf_mat = confusion_matrix(y_ho, pred)
        pres[i,j] = precision_score(y_ho, pred)
        tprs[i,j] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
        fprs[i,j] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
        f1s[i,j] = f1_score(y_ho, pred)
        aucs[i,j] = roc_auc_score(y_ho, pred_proba)
        j=j+1
    i=i+1

In [52]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'accuracy_score'] = round(max_cv,3)
paras.loc['Ada', 'accuracy_score'] = 'max_depth='+ str(max_depths[depth_index]) 




# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'precision'] = round(max_cv,3)
paras.loc['Ada', 'precision'] = 'max_depth='+ str(max_depths[depth_index])




# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'true_positive_rate'] = round(max_cv,3)
paras.loc['Ada', 'true_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])


                                                           


# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'false_positive_rate'] = round(max_cv,3)
paras.loc['Ada', 'false_positive_rate'] = 'max_depth='+ str(max_depths[depth_index])





# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'f1'] = round(max_cv,3)
paras.loc['Ada', 'f1'] = 'max_depth='+ str(max_depths[depth_index]) 




# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
depth_index = np.argwhere(cv == max_cv)[0][0]
kpis.loc['Ada', 'roc_auc'] = round(max_cv,3)
paras.loc['Ada', 'roc_auc'] = 'max_depth='+ str(max_depths[depth_index])

# XGBoost

In [53]:
from xgboost import XGBClassifier

In [54]:
# cross validation for various learning_rates and max_depths
learning_rates = [0.01, 0.1, 1, 10, 100]
max_depths = [3, 4, 5]
acses = np.zeros((5,len(learning_rates), len(max_depths)))
pres = np.zeros((5,len(learning_rates), len(max_depths)))
tprs = np.zeros((5,len(learning_rates), len(max_depths)))
fprs = np.zeros((5,len(learning_rates), len(max_depths)))
f1s = np.zeros((5,len(learning_rates), len(max_depths)))
aucs = np.zeros((5,len(learning_rates), len(max_depths)))
i=0
for train_index, test_index in kfold.split(X_train, y_train) : 
    X_tt = X_train.iloc[train_index, :].copy()
    X_ho = X_train.iloc[test_index, :].copy()
    y_tt = y_train.iloc[train_index].copy()
    y_ho = y_train.iloc[test_index].copy()
    j=0
    for learning_rate in learning_rates :
        k=0
        for max_depth in max_depths :
            xgb = XGBClassifier(learning_rate = learning_rate,
                                    max_depth = max_depth,
                                    use_label_encoder=False,
                                    n_estimators = 100,
                                    random_state = 623,
                                    eval_metric = 'logloss')
            xgb.fit(X_tt.values, y_tt.values,
                    early_stopping_rounds = 3,
                    eval_set = [(X_ho.values, y_ho.values)])
            pred = xgb.predict(X_ho.values)
            pred_proba = xgb.predict_proba(X_ho.values)[:,1]
            acses[i,j,k] = accuracy_score(y_ho.values, pred)
            conf_mat = confusion_matrix(y_ho, pred)
            pres[i,j,k] = precision_score(y_ho, pred)
            tprs[i,j,k] = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
            fprs[i,j,k] = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
            f1s[i,j,k] = f1_score(y_ho, pred)
            aucs[i,j,k] = roc_auc_score(y_ho, pred_proba)
            k=k+1
        j=j+1
    i=i+1        

[0]	validation_0-logloss:0.68708
[1]	validation_0-logloss:0.68101
[2]	validation_0-logloss:0.67495
[3]	validation_0-logloss:0.66900
[4]	validation_0-logloss:0.66323
[5]	validation_0-logloss:0.65774
[6]	validation_0-logloss:0.65231
[7]	validation_0-logloss:0.64670
[8]	validation_0-logloss:0.64142
[9]	validation_0-logloss:0.63613
[10]	validation_0-logloss:0.63104
[11]	validation_0-logloss:0.62617
[12]	validation_0-logloss:0.62120
[13]	validation_0-logloss:0.61648
[14]	validation_0-logloss:0.61154
[15]	validation_0-logloss:0.60697
[16]	validation_0-logloss:0.60239
[17]	validation_0-logloss:0.59775
[18]	validation_0-logloss:0.59340
[19]	validation_0-logloss:0.58925
[20]	validation_0-logloss:0.58495
[21]	validation_0-logloss:0.58071
[22]	validation_0-logloss:0.57653
[23]	validation_0-logloss:0.57255
[24]	validation_0-logloss:0.56854
[25]	validation_0-logloss:0.56469
[26]	validation_0-logloss:0.56092
[27]	validation_0-logloss:0.55689
[28]	validation_0-logloss:0.55317
[29]	validation_0-loglos

[42]	validation_0-logloss:0.46444
[43]	validation_0-logloss:0.46086
[44]	validation_0-logloss:0.45734
[45]	validation_0-logloss:0.45394
[46]	validation_0-logloss:0.45051
[47]	validation_0-logloss:0.44714
[48]	validation_0-logloss:0.44392
[49]	validation_0-logloss:0.44064
[50]	validation_0-logloss:0.43748
[51]	validation_0-logloss:0.43429
[52]	validation_0-logloss:0.43110
[53]	validation_0-logloss:0.42815
[54]	validation_0-logloss:0.42515
[55]	validation_0-logloss:0.42220
[56]	validation_0-logloss:0.41923
[57]	validation_0-logloss:0.41641
[58]	validation_0-logloss:0.41358
[59]	validation_0-logloss:0.41074
[60]	validation_0-logloss:0.40806
[61]	validation_0-logloss:0.40528
[62]	validation_0-logloss:0.40258
[63]	validation_0-logloss:0.40002
[64]	validation_0-logloss:0.39738
[65]	validation_0-logloss:0.39484
[66]	validation_0-logloss:0.39238
[67]	validation_0-logloss:0.38987
[68]	validation_0-logloss:0.38744
[69]	validation_0-logloss:0.38504
[70]	validation_0-logloss:0.38267
[71]	validatio

[84]	validation_0-logloss:0.21186
[85]	validation_0-logloss:0.21177
[86]	validation_0-logloss:0.21167
[87]	validation_0-logloss:0.21161
[88]	validation_0-logloss:0.21155
[89]	validation_0-logloss:0.21147
[90]	validation_0-logloss:0.21143
[91]	validation_0-logloss:0.21138
[92]	validation_0-logloss:0.21136
[93]	validation_0-logloss:0.21131
[94]	validation_0-logloss:0.21130
[95]	validation_0-logloss:0.21125
[96]	validation_0-logloss:0.21125
[97]	validation_0-logloss:0.21122
[98]	validation_0-logloss:0.21121
[99]	validation_0-logloss:0.21117
[0]	validation_0-logloss:0.62021
[1]	validation_0-logloss:0.56071
[2]	validation_0-logloss:0.51136
[3]	validation_0-logloss:0.46990
[4]	validation_0-logloss:0.43506
[5]	validation_0-logloss:0.40530
[6]	validation_0-logloss:0.37982
[7]	validation_0-logloss:0.35807
[8]	validation_0-logloss:0.33918
[9]	validation_0-logloss:0.32298
[10]	validation_0-logloss:0.30889
[11]	validation_0-logloss:0.29668
[12]	validation_0-logloss:0.28608
[13]	validation_0-loglos

[56]	validation_0-logloss:0.47019
[57]	validation_0-logloss:0.46779
[58]	validation_0-logloss:0.46558
[59]	validation_0-logloss:0.46319
[60]	validation_0-logloss:0.46095
[61]	validation_0-logloss:0.45882
[62]	validation_0-logloss:0.45669
[63]	validation_0-logloss:0.45458
[64]	validation_0-logloss:0.45190
[65]	validation_0-logloss:0.44915
[66]	validation_0-logloss:0.44712
[67]	validation_0-logloss:0.44455
[68]	validation_0-logloss:0.44203
[69]	validation_0-logloss:0.44011
[70]	validation_0-logloss:0.43763
[71]	validation_0-logloss:0.43524
[72]	validation_0-logloss:0.43329
[73]	validation_0-logloss:0.43085
[74]	validation_0-logloss:0.42858
[75]	validation_0-logloss:0.42685
[76]	validation_0-logloss:0.42448
[77]	validation_0-logloss:0.42223
[78]	validation_0-logloss:0.42055
[79]	validation_0-logloss:0.41833
[80]	validation_0-logloss:0.41626
[81]	validation_0-logloss:0.41451
[82]	validation_0-logloss:0.41245
[83]	validation_0-logloss:0.41029
[84]	validation_0-logloss:0.40873
[85]	validatio

[98]	validation_0-logloss:0.32820
[99]	validation_0-logloss:0.32660
[0]	validation_0-logloss:0.63374
[1]	validation_0-logloss:0.58545
[2]	validation_0-logloss:0.54513
[3]	validation_0-logloss:0.51151
[4]	validation_0-logloss:0.48317
[5]	validation_0-logloss:0.45866
[6]	validation_0-logloss:0.43782
[7]	validation_0-logloss:0.41432
[8]	validation_0-logloss:0.39437
[9]	validation_0-logloss:0.38010
[10]	validation_0-logloss:0.36436
[11]	validation_0-logloss:0.35321
[12]	validation_0-logloss:0.34071
[13]	validation_0-logloss:0.32993
[14]	validation_0-logloss:0.32177
[15]	validation_0-logloss:0.31313
[16]	validation_0-logloss:0.30552
[17]	validation_0-logloss:0.29948
[18]	validation_0-logloss:0.29329
[19]	validation_0-logloss:0.28830
[20]	validation_0-logloss:0.28322
[21]	validation_0-logloss:0.27874
[22]	validation_0-logloss:0.27480
[23]	validation_0-logloss:0.27146
[24]	validation_0-logloss:0.26775
[25]	validation_0-logloss:0.26445
[26]	validation_0-logloss:0.26159
[27]	validation_0-loglos

[40]	validation_0-logloss:0.21251
[41]	validation_0-logloss:0.21226
[42]	validation_0-logloss:0.21198
[43]	validation_0-logloss:0.21171
[44]	validation_0-logloss:0.21153
[45]	validation_0-logloss:0.21134
[46]	validation_0-logloss:0.21116
[47]	validation_0-logloss:0.21099
[48]	validation_0-logloss:0.21087
[49]	validation_0-logloss:0.21074
[50]	validation_0-logloss:0.21064
[51]	validation_0-logloss:0.21050
[52]	validation_0-logloss:0.21045
[53]	validation_0-logloss:0.21035
[54]	validation_0-logloss:0.21025
[55]	validation_0-logloss:0.21022
[56]	validation_0-logloss:0.21014
[57]	validation_0-logloss:0.21008
[58]	validation_0-logloss:0.21002
[59]	validation_0-logloss:0.20998
[60]	validation_0-logloss:0.20995
[61]	validation_0-logloss:0.20993
[62]	validation_0-logloss:0.20992
[63]	validation_0-logloss:0.20985
[64]	validation_0-logloss:0.20982
[65]	validation_0-logloss:0.20977
[66]	validation_0-logloss:0.20975
[67]	validation_0-logloss:0.20974
[68]	validation_0-logloss:0.20975
[69]	validatio

[13]	validation_0-logloss:0.61517
[14]	validation_0-logloss:0.61017
[15]	validation_0-logloss:0.60550
[16]	validation_0-logloss:0.60087
[17]	validation_0-logloss:0.59609
[18]	validation_0-logloss:0.59167
[19]	validation_0-logloss:0.58747
[20]	validation_0-logloss:0.58308
[21]	validation_0-logloss:0.57878
[22]	validation_0-logloss:0.57453
[23]	validation_0-logloss:0.57049
[24]	validation_0-logloss:0.56639
[25]	validation_0-logloss:0.56245
[26]	validation_0-logloss:0.55863
[27]	validation_0-logloss:0.55456
[28]	validation_0-logloss:0.55079
[29]	validation_0-logloss:0.54715
[30]	validation_0-logloss:0.54346
[31]	validation_0-logloss:0.53983
[32]	validation_0-logloss:0.53631
[33]	validation_0-logloss:0.53302
[34]	validation_0-logloss:0.52955
[35]	validation_0-logloss:0.52616
[36]	validation_0-logloss:0.52290
[37]	validation_0-logloss:0.51964
[38]	validation_0-logloss:0.51643
[39]	validation_0-logloss:0.51324
[40]	validation_0-logloss:0.51018
[41]	validation_0-logloss:0.50717
[42]	validatio

[55]	validation_0-logloss:0.21470
[56]	validation_0-logloss:0.21444
[57]	validation_0-logloss:0.21421
[58]	validation_0-logloss:0.21400
[59]	validation_0-logloss:0.21383
[60]	validation_0-logloss:0.21369
[61]	validation_0-logloss:0.21354
[62]	validation_0-logloss:0.21342
[63]	validation_0-logloss:0.21332
[64]	validation_0-logloss:0.21322
[65]	validation_0-logloss:0.21290
[66]	validation_0-logloss:0.21259
[67]	validation_0-logloss:0.21232
[68]	validation_0-logloss:0.21208
[69]	validation_0-logloss:0.21188
[70]	validation_0-logloss:0.21167
[71]	validation_0-logloss:0.21152
[72]	validation_0-logloss:0.21136
[73]	validation_0-logloss:0.21122
[74]	validation_0-logloss:0.21111
[75]	validation_0-logloss:0.21100
[76]	validation_0-logloss:0.21088
[77]	validation_0-logloss:0.21079
[78]	validation_0-logloss:0.21073
[79]	validation_0-logloss:0.21065
[80]	validation_0-logloss:0.21060
[81]	validation_0-logloss:0.21054
[82]	validation_0-logloss:0.21050
[83]	validation_0-logloss:0.21044
[84]	validatio

[10]	validation_0-logloss:0.20982
[11]	validation_0-logloss:0.20970
[12]	validation_0-logloss:0.20972
[13]	validation_0-logloss:0.20966
[14]	validation_0-logloss:0.20965
[15]	validation_0-logloss:0.20964
[16]	validation_0-logloss:0.20964
[17]	validation_0-logloss:0.20964
[18]	validation_0-logloss:0.20964
[19]	validation_0-logloss:0.20964
[20]	validation_0-logloss:0.20964
[0]	validation_0-logloss:0.34194
[1]	validation_0-logloss:0.30958
[2]	validation_0-logloss:0.30208
[3]	validation_0-logloss:0.29954
[4]	validation_0-logloss:0.29862
[5]	validation_0-logloss:0.29833
[6]	validation_0-logloss:0.29820
[7]	validation_0-logloss:0.29811
[8]	validation_0-logloss:0.29811
[9]	validation_0-logloss:0.29811
[10]	validation_0-logloss:0.29811
[11]	validation_0-logloss:0.29810
[12]	validation_0-logloss:0.22063
[13]	validation_0-logloss:0.21297
[14]	validation_0-logloss:0.21081
[15]	validation_0-logloss:0.21007
[16]	validation_0-logloss:0.20980
[17]	validation_0-logloss:0.20970
[18]	validation_0-loglos

[74]	validation_0-logloss:0.42879
[75]	validation_0-logloss:0.42705
[76]	validation_0-logloss:0.42530
[77]	validation_0-logloss:0.42352
[78]	validation_0-logloss:0.42185
[79]	validation_0-logloss:0.42009
[80]	validation_0-logloss:0.41844
[81]	validation_0-logloss:0.41675
[82]	validation_0-logloss:0.41511
[83]	validation_0-logloss:0.41349
[84]	validation_0-logloss:0.41203
[85]	validation_0-logloss:0.41040
[86]	validation_0-logloss:0.40884
[87]	validation_0-logloss:0.40736
[88]	validation_0-logloss:0.40585
[89]	validation_0-logloss:0.40433
[90]	validation_0-logloss:0.40301
[91]	validation_0-logloss:0.40154
[92]	validation_0-logloss:0.40006
[93]	validation_0-logloss:0.39875
[94]	validation_0-logloss:0.39729
[95]	validation_0-logloss:0.39599
[96]	validation_0-logloss:0.39471
[97]	validation_0-logloss:0.39342
[98]	validation_0-logloss:0.39213
[99]	validation_0-logloss:0.39086
[0]	validation_0-logloss:0.68569
[1]	validation_0-logloss:0.67819
[2]	validation_0-logloss:0.67078
[3]	validation_0-

[16]	validation_0-logloss:0.32330
[17]	validation_0-logloss:0.31148
[18]	validation_0-logloss:0.30162
[19]	validation_0-logloss:0.29320
[20]	validation_0-logloss:0.28602
[21]	validation_0-logloss:0.27996
[22]	validation_0-logloss:0.27467
[23]	validation_0-logloss:0.27004
[24]	validation_0-logloss:0.26606
[25]	validation_0-logloss:0.26258
[26]	validation_0-logloss:0.25954
[27]	validation_0-logloss:0.25692
[28]	validation_0-logloss:0.25453
[29]	validation_0-logloss:0.25250
[30]	validation_0-logloss:0.25070
[31]	validation_0-logloss:0.24907
[32]	validation_0-logloss:0.24771
[33]	validation_0-logloss:0.24644
[34]	validation_0-logloss:0.24529
[35]	validation_0-logloss:0.24435
[36]	validation_0-logloss:0.24350
[37]	validation_0-logloss:0.24270
[38]	validation_0-logloss:0.24201
[39]	validation_0-logloss:0.24137
[40]	validation_0-logloss:0.24084
[41]	validation_0-logloss:0.24035
[42]	validation_0-logloss:0.23988
[43]	validation_0-logloss:0.23635
[44]	validation_0-logloss:0.23438
[45]	validatio

[2]	validation_0-logloss:15.33648
[0]	validation_0-logloss:1.02822
[1]	validation_0-logloss:1.02822
[2]	validation_0-logloss:1.02822
[3]	validation_0-logloss:1.02822
[0]	validation_0-logloss:3.81230
[1]	validation_0-logloss:3.81230
[2]	validation_0-logloss:3.81230
[3]	validation_0-logloss:3.81230
[0]	validation_0-logloss:4.44915
[1]	validation_0-logloss:4.44915
[2]	validation_0-logloss:4.44915
[3]	validation_0-logloss:4.44915
[0]	validation_0-logloss:2.37603
[1]	validation_0-logloss:2.37603
[2]	validation_0-logloss:2.37603
[0]	validation_0-logloss:0.68709
[1]	validation_0-logloss:0.68103
[2]	validation_0-logloss:0.67498
[3]	validation_0-logloss:0.66904
[4]	validation_0-logloss:0.66329
[5]	validation_0-logloss:0.65780
[6]	validation_0-logloss:0.65238
[7]	validation_0-logloss:0.64678
[8]	validation_0-logloss:0.64150
[9]	validation_0-logloss:0.63626
[10]	validation_0-logloss:0.63118
[11]	validation_0-logloss:0.62632
[12]	validation_0-logloss:0.62134
[13]	validation_0-logloss:0.61662
[14]	

[27]	validation_0-logloss:0.52577
[28]	validation_0-logloss:0.52121
[29]	validation_0-logloss:0.51677
[30]	validation_0-logloss:0.51234
[31]	validation_0-logloss:0.50796
[32]	validation_0-logloss:0.50372
[33]	validation_0-logloss:0.49968
[34]	validation_0-logloss:0.49550
[35]	validation_0-logloss:0.49142
[36]	validation_0-logloss:0.48742
[37]	validation_0-logloss:0.48352
[38]	validation_0-logloss:0.47962
[39]	validation_0-logloss:0.47574
[40]	validation_0-logloss:0.47207
[41]	validation_0-logloss:0.46840
[42]	validation_0-logloss:0.46466
[43]	validation_0-logloss:0.46112
[44]	validation_0-logloss:0.45758
[45]	validation_0-logloss:0.45419
[46]	validation_0-logloss:0.45076
[47]	validation_0-logloss:0.44743
[48]	validation_0-logloss:0.44417
[49]	validation_0-logloss:0.44097
[50]	validation_0-logloss:0.43780
[51]	validation_0-logloss:0.43458
[52]	validation_0-logloss:0.43150
[53]	validation_0-logloss:0.42849
[54]	validation_0-logloss:0.42545
[55]	validation_0-logloss:0.42253
[56]	validatio

[77]	validation_0-logloss:0.21346
[78]	validation_0-logloss:0.21342
[79]	validation_0-logloss:0.21336
[80]	validation_0-logloss:0.21328
[81]	validation_0-logloss:0.21314
[82]	validation_0-logloss:0.21304
[83]	validation_0-logloss:0.21294
[84]	validation_0-logloss:0.21283
[85]	validation_0-logloss:0.21277
[86]	validation_0-logloss:0.21269
[87]	validation_0-logloss:0.21263
[88]	validation_0-logloss:0.21262
[89]	validation_0-logloss:0.21255
[90]	validation_0-logloss:0.21255
[91]	validation_0-logloss:0.21249
[92]	validation_0-logloss:0.21248
[93]	validation_0-logloss:0.21243
[94]	validation_0-logloss:0.21241
[95]	validation_0-logloss:0.21237
[96]	validation_0-logloss:0.21236
[97]	validation_0-logloss:0.21233
[98]	validation_0-logloss:0.21233
[99]	validation_0-logloss:0.21230
[0]	validation_0-logloss:0.62030
[1]	validation_0-logloss:0.56086
[2]	validation_0-logloss:0.51160
[3]	validation_0-logloss:0.47017
[4]	validation_0-logloss:0.43537
[5]	validation_0-logloss:0.40565
[6]	validation_0-log

In [55]:
# accuracy_score
cv = np.mean(acses, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'accuracy_score'] = round(max_cv,3)
paras.loc['XGB', 'accuracy_score'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index]) 





# precision
cv = np.mean(pres, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'precision'] = round(max_cv,3)
paras.loc['XGB', 'precision'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index])





# true_positive_rate
cv = np.mean(tprs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'true_positive_rate'] = round(max_cv,3)
paras.loc['XGB', 'true_positive_rate'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index])



# false_positive_rate
cv = np.mean(fprs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'false_positive_rate'] = round(max_cv,3)
paras.loc['XGB', 'false_positive_rate'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index])



# f1
cv = np.mean(f1s, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'f1'] = round(max_cv,3)
paras.loc['XGB', 'f1'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index])



# roc_auc
cv = np.mean(aucs, axis = 0)
max_cv = np.max(np.max(cv, axis = 0),axis = 0)
rate_index = np.argwhere(cv == max_cv)[0][0]
depth_index = np.argwhere(cv == max_cv)[0][1]
kpis.loc['XGB', 'roc_auc'] = round(max_cv,3)
paras.loc['XGB', 'roc_auc'] = 'learning_rate = ' + str(learning_rates[rate_index]) + 'max_depth = '+ str(max_depths[depth_index])

In [56]:
kpis

Unnamed: 0,accuracy score,accuracy_score,precision,true_positive_rate,false_positive_rate,f1,roc_auc
Base,,0.584,0.584,1.0,1.0,0.737,0.5
Log,,0.935,0.9,1.0,0.155,0.948,0.922
LDA,,0.762,1.0,0.739,0.206,0.784,0.852
SVM,,0.624,0.665,0.858,0.704,0.717,0.623
DT,,0.935,0.9,1.0,0.221,0.948,0.922
RF,,0.935,0.9,1.0,0.155,0.948,0.922
ET,,0.935,0.9,1.0,0.155,0.948,0.922
Ada,,0.935,0.9,1.0,0.155,0.948,0.922
XGB,,0.935,0.9,1.0,0.221,0.948,0.922


In [57]:
paras

Unnamed: 0,accuracy score,accuracy_score,precision,true_positive_rate,false_positive_rate,f1,roc_auc
Base,,,,,,,
Log,,"penalty=none, C=0.001","penalty=none, C=0.001","penalty=none, C=0.001","penalty=none, C=0.001","penalty=none, C=0.001","penalty=none, C=0.1"
LDA,,,,,,,
SVM,,margin=10,margin=1,margin=10,margin=10,margin=10,margin=10
DT,,max_depth=5.0,max_depth=5.0,max_depth=5.0,max_depth=4.0,max_depth=5.0,max_depth=5.0
RF,,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=5
ET,,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=4
Ada,,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=3,max_depth=3
XGB,,learning_rate = 0.01max_depth = 5,learning_rate = 0.01max_depth = 5,learning_rate = 0.01max_depth = 5,learning_rate = 0.01max_depth = 4,learning_rate = 0.01max_depth = 5,learning_rate = 10max_depth = 5


# Test set verification of the selected model(Decision Tree)

In [58]:
dt = DecisionTreeClassifier(max_depth = 5)
dt.fit(X_train.values, y_train.values)
pred = dt.predict(X_test.values)
pred_proba = dt.predict_proba(X_test.values)[:,1]
acs = accuracy_score(y_test.values, pred)
conf_mat = confusion_matrix(y_test, pred)
pre = precision_score(y_test, pred)
tpr = conf_mat[1,1]/(conf_mat[1,0] + conf_mat[1,1])
fpr = conf_mat[0,1]/(conf_mat[0,0] + conf_mat[0,1])
f1 = f1_score(y_test, pred)
auc = roc_auc_score(y_test, pred_proba)

In [59]:
print('accuracy_score = ' + str(acs))
print('precision = ' + str(pre))
print('true_positive_rate = ' + str(tpr))
print('false_positive_rate = ' + str(fpr))
print('f1 = ' + str(f1))
print('roc_auc = ' + str(auc))

accuracy_score = 0.9352041809550907
precision = 0.9000883203546668
true_positive_rate = 1.0
false_positive_rate = 0.15566008040219081
f1 = 0.9474173497226256
roc_auc = 0.9223688747588463
