## Aim: selecting threshold for binary classification

In [113]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

In [97]:
x,y = make_classification(n_samples=4000,weights=[1,1],random_state=50) 

In [98]:
# 4000 Records and 20 features
x.shape

(4000, 20)

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

### Random Forest

In [119]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(x_train,y_train)
y_train_pred = rf_model.predict_proba(x_train)
print('ROc AUC for y_train_pred : {}'.format(roc_auc_score(y_train,y_train_pred[:,1])))  #y_train_pred[:1] ==> this is the probability of getting 1's
 
y_test_pred = rf_model.predict_proba(x_test)
print('ROc AUC for y_test_pred : {}'.format(roc_auc_score(y_test,y_test_pred[:,1])))    #y_test_pred[:1] ==> this is the probability of getting 1's

print('Accuracy (when the threshold is default i.e 0.5)  :{}'.format(accuracy_score(y_test,rf_model.predict(x_test))))

ROc AUC for y_train_pred : 1.0
ROc AUC for y_test_pred : 0.9859925378732106
Accuracy (when the threshold is default i.e 0.5)  :0.9541666666666667


## Adaboost

In [120]:
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier()
ab_model.fit(x_train,y_train) 
y_train_pred = ab_model.predict_proba(x_train) 
print('ROc AUC for y_train_pred : {}'.format(roc_auc_score(y_train,y_train_pred[:,1])))  #y_train_pred[:1] ==> this is the probability of getting 1's
 
y_test_pred = rf_model.predict_proba(x_test)
print('ROc AUC for y_test_pred : {}'.format(roc_auc_score(y_test,y_test_pred[:,1])))    #y_test_pred[:1] ==> this is the probability of getting 1's

print('Accuracy (when the threshold is default i.e 0.5)  :{}'.format(accuracy_score(y_test,ab_model.predict(x_test)))) 

ROc AUC for y_train_pred : 0.9947699369208625
ROc AUC for y_test_pred : 0.9859925378732106
Accuracy (when the threshold is default i.e 0.5)  :0.9466666666666667


## KNN

In [121]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(x_train,y_train)
y_train_pred = knn_model.predict_proba(x_train)
print('ROc AUC for y_train_pred : {}'.format(roc_auc_score(y_train,y_train_pred[:,1])))  #y_train_pred[:1] ==> this is the probability of getting 1's
 
y_test_pred = rf_model.predict_proba(x_test)
print('ROc AUC for y_test_pred : {}'.format(roc_auc_score(y_test,y_test_pred[:,1])))    #y_test_pred[:1] ==> this is the probability of getting 1's

print('Accuracy (when the threshold is default i.e 0.5)  :{}'.format(accuracy_score(y_test,knn_model.predict(x_test)))) 

ROc AUC for y_train_pred : 0.9836756008794242
ROc AUC for y_test_pred : 0.9859925378732106
Accuracy (when the threshold is default i.e 0.5)  :0.9066666666666666


## Logistic Regression

In [122]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
y_train_pred = lr_model.predict_proba(x_train)
print('ROc AUC for y_train_pred : {}'.format(roc_auc_score(y_train,y_train_pred[:,1])))  #y_train_pred[:1] ==> this is the probability of getting 1's
 
y_test_pred = rf_model.predict_proba(x_test)
print('ROc AUC for y_test_pred : {}'.format(roc_auc_score(y_test,y_test_pred[:,1])))    #y_test_pred[:1] ==> this is the probability of getting 1's

print('Accuracy (when the threshold is default i.e 0.5)  :{}'.format(accuracy_score(y_test,lr_model.predict(x_test)))) 

ROc AUC for y_train_pred : 0.979797124268911
ROc AUC for y_test_pred : 0.9859925378732106
Accuracy (when the threshold is default i.e 0.5)  :0.935


In [104]:

pred=[];
for model in [lr_model,knn_model,ab_model,rf_model]:
    p = model.predict_proba(x_test)[:,1]
    pred.append(pd.Series(p))
pred = pd.concat(pred,axis=1)
pred

Unnamed: 0,0,1,2,3
0,0.985523,1.0,0.530741,1.00
1,0.868646,0.8,0.523189,0.95
2,0.703473,0.6,0.514142,0.93
3,0.867040,1.0,0.502910,0.72
4,0.836601,0.4,0.516093,0.94
...,...,...,...,...
1195,0.994208,0.8,0.562660,0.98
1196,0.015433,0.4,0.485764,0.12
1197,0.999873,1.0,0.683534,1.00
1198,0.964842,1.0,0.531247,1.00


In [105]:
## final prediction
final_prediction = pred.mean(axis=1)
## the values are the probability of getting 1's
final_prediction

0       0.879066
1       0.785459
2       0.686904
3       0.772487
4       0.673173
          ...   
1195    0.834217
1196    0.255299
1197    0.920852
1198    0.874022
1199    0.870401
Length: 1200, dtype: float64

In [106]:
roc_auc_score(y_test,final_prediction)

0.9799833866276243

In [107]:
from sklearn.metrics import roc_curve
fpr,tpr,threshold = roc_curve(y_test,final_prediction)

In [108]:
threshold

array([1.92420316, 0.92420316, 0.88741003, 0.88740261, 0.88117969,
       0.88103503, 0.86687076, 0.86666785, 0.83079562, 0.83029137,
       0.81433971, 0.81319953, 0.78290946, 0.78272685, 0.78224334,
       0.78117324, 0.77591936, 0.77504498, 0.77272094, 0.77248739,
       0.65860593, 0.65591079, 0.633138  , 0.62436473, 0.60450801,
       0.60248453, 0.58937022, 0.58320043, 0.57304392, 0.56411826,
       0.55887324, 0.53237355, 0.53147436, 0.52735914, 0.52070428,
       0.51587223, 0.51349043, 0.51222678, 0.50906699, 0.50460331,
       0.50226396, 0.5012792 , 0.49894582, 0.4930814 , 0.49042768,
       0.48754382, 0.48677073, 0.48088175, 0.45581421, 0.45545156,
       0.45309671, 0.44964645, 0.44504651, 0.42342944, 0.41986178,
       0.40697501, 0.4022466 , 0.40101136, 0.39946191, 0.38871144,
       0.38702867, 0.37826913, 0.36468649, 0.36207051, 0.36078017,
       0.34966733, 0.34959264, 0.34510126, 0.3396566 , 0.33917871,
       0.33710808, 0.32874567, 0.3286048 , 0.30650398, 0.30554

In [109]:
from sklearn.metrics import accuracy_score
accuracy=[];
for thres in threshold:
    y_pred = np.where(final_prediction>thres ,1,0) 
    accuracy.append(accuracy_score(y_test,y_pred,normalize=True))

In [124]:
T = pd.concat([pd.Series(threshold),pd.Series(accuracy)],axis=1) 
T.columns=['Threshold','Accuracy']
T[T.Accuracy==T.Accuracy.max()]

Unnamed: 0,Threshold,Accuracy
29,0.564118,0.945833
30,0.558873,0.945833
34,0.520704,0.945833
35,0.515872,0.945833
49,0.455452,0.945833
50,0.453097,0.945833
