In [1]:
import os
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix, make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier


random_state=7

In [2]:
df = pd.read_csv("../data/trainDRUG.csv", encoding='utf8')

In [3]:
X = df.drop(['再犯註記'], axis=1)
y = df['再犯註記'].values

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
print('Dimensions: \n x_train:{} \n x_test{} \n y_train{} \n y_test{}'.format(x_train.shape, x_test.shape, y_train.shape, y_test.shape))

Dimensions: 
 x_train:(52321, 15) 
 x_test(13081, 15) 
 y_train(52321,) 
 y_test(13081,)


In [5]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

In [6]:
scorers = {
    'Accuracy': 'accuracy', 
    'roc_auc': 'roc_auc', 
    'Sensitivity':'recall', 
    'precision':'precision',
    'tp': make_scorer(tp), 
    'tn': make_scorer(tn),
    'fp': make_scorer(fp), 
    'fn': make_scorer(fn)
} 

In [10]:
classifier_name = 'Easy Ensemble'

In [11]:
start_time = time.time()

In [12]:
clf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=3,criterion='entropy', class_weight='balanced', random_state=random_state)

In [13]:
scores = cross_validate(clf, X, y, scoring=scorers, cv=5)

In [14]:
print(time.time()-start_time)

351.0397081375122


In [15]:
Sensitivity = round(scores['test_tp'].mean() / (scores['test_tp'].mean() + scores['test_fn'].mean()),3)*100   #TP/(TP+FN) also recall
Specificity = round(scores['test_tn'].mean() / (scores['test_tn'].mean() + scores['test_fp'].mean()),3)*100    #TN/(TN+FP)
PPV = round(scores['test_tp'].mean() / (scores['test_tp'].mean() + scores['test_fp'].mean()),3)*100           #PPV = tp/(tp+fp) also precision
NPV = round(scores['test_tn'].mean() / (scores['test_fn'].mean() + scores['test_tn'].mean()),3)*100           #TN(FN+TN)

In [16]:
scores_Acc = scores['test_Accuracy']                                                                                                                                    
print(f"{classifier_name} Acc: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std() * 2)) 

Easy Ensemble Acc: 0.83 (+/- 0.02)


In [17]:
scores_AUC = scores['test_roc_auc']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std() * 2))

Easy Ensemble AUC: 0.75 (+/- 0.02)


In [18]:
scores_sensitivity = scores['test_Sensitivity']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} Recall: %0.2f (+/- %0.2f)" % (scores_sensitivity.mean(), scores_sensitivity.std() * 2)) 

Easy Ensemble Recall: 0.28 (+/- 0.06)


In [19]:
scores_precision = scores['test_precision']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} Precision: %0.2f (+/- %0.2f)" % (scores_precision.mean(), scores_precision.std() * 2)) 

Easy Ensemble Precision: 0.54 (+/- 0.11)


In [20]:
print(f"{classifier_name} Sensitivity = ", Sensitivity, "%")
print(f"{classifier_name} Specificity = ", Specificity, "%")
print(f"{classifier_name} PPV = ", PPV, "%")  
print(f"{classifier_name} NPV = ", NPV, "%")

Easy Ensemble Sensitivity =  28.1 %
Easy Ensemble Specificity =  94.69999999999999 %
Easy Ensemble PPV =  52.800000000000004 %
Easy Ensemble NPV =  86.1 %


In [21]:
st = time.time()
clf = EasyEnsembleClassifier(n_estimators=20)
scores = cross_validate(clf, X, y, scoring=scorers, cv=5)          

Sensitivity = round(scores['test_tp'].mean() / (scores['test_tp'].mean() + scores['test_fn'].mean()),3)*100   #TP/(TP+FN) also recall
Specificity = round(scores['test_tn'].mean() / (scores['test_tn'].mean() + scores['test_fp'].mean()),3)*100    #TN/(TN+FP)
PPV = round(scores['test_tp'].mean() / (scores['test_tp'].mean() + scores['test_fp'].mean()),3)*100           #PPV = tp/(tp+fp) also precision
NPV = round(scores['test_tn'].mean() / (scores['test_fn'].mean() + scores['test_tn'].mean()),3)*100           #TN(FN+TN)

scores_Acc = scores['test_Accuracy']                                                                                                                                    
print(f"{classifier_name} Acc: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std() * 2))                                                                                                    
scores_AUC = scores['test_roc_auc']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std() * 2))      
scores_sensitivity = scores['test_Sensitivity']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} Recall: %0.2f (+/- %0.2f)" % (scores_sensitivity.mean(), scores_sensitivity.std() * 2)) 
scores_precision = scores['test_precision']                                                                     #Only works with binary classes, not multiclass                  
print(f"{classifier_name} Precision: %0.2f (+/- %0.2f)" % (scores_precision.mean(), scores_precision.std() * 2))                          
print(f"{classifier_name} Sensitivity = ", Sensitivity, "%")
print(f"{classifier_name} Specificity = ", Specificity, "%")
print(f"{classifier_name} PPV = ", PPV, "%")  
print(f"{classifier_name} NPV = ", NPV, "%")

print("CV Runtime:", time.time()-st)

Easy Ensemble Acc: 0.67 (+/- 0.08)
Easy Ensemble AUC: 0.75 (+/- 0.02)
Easy Ensemble Recall: 0.70 (+/- 0.09)
Easy Ensemble Precision: 0.31 (+/- 0.05)
Easy Ensemble Sensitivity =  70.39999999999999 %
Easy Ensemble Specificity =  65.8 %
Easy Ensemble PPV =  30.5 %
Easy Ensemble NPV =  91.2 %
CV Runtime: 28.85393214225769
