In [18]:
# import dependencies

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# read data
x_train = pd.read_csv("data/processed_data/x_train.csv")
x_val = pd.read_csv("data/processed_data/x_val.csv")
y_train = pd.read_csv("data/processed_data/y_train.csv")
y_val = pd.read_csv("data/processed_data/y_val.csv")
x_test = pd.read_csv("data/processed_data/x_test.csv")

Logistic regression on pre-processed data

In [3]:
# Logistic regression

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()
lr_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
                   param_distributions={'C': array([1.000e-02, 1.020e+00, 2.030e+00, 3.040e+00, 4.050e+00, 5.060e+00,
       6.070e+00, 7.080e+00, 8.090e+00, 9.100e+00, 1.011e+01, 1.112e+01,
       1.213e+01, 1.314e+01, 1.415e+01, 1.516e+01, 1.617e+01, 1.718e+01,
       1.819e+01, 1.920e+01, 2.021e+01, 2.122e+01, 2.223e+01, 2.324e+01,
       2.425e+01, 2....
       7.273e+01, 7.374e+01, 7.475e+01, 7.576e+01, 7.677e+01, 7.778e+01,
       7.879e+01, 7.980e+01, 8.081e+01, 8.182e+01, 8.283e+01, 8.384e+01,
       8.485e+01, 8.586e+01, 8.687e+01, 8.788e+01, 8.889e+01, 8.990e+01,
       9.091e+01, 9.192e+01, 9.293e+01, 9.394e+01, 9.495e+01, 9.596e+01,
       9.697e+01, 9.798e+01, 9.899e+01, 1.000e+02]),
                                        'class_weight': ['balanced', None],
                                        'penalty': ['l1', 'l2']},
                   scoring='roc_auc', verbose=20)

In [4]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.6687016769740979
confusion_matrix: 
 [[9720 6207]
 [1344 2627]]
classification_report: 
               precision    recall  f1-score   support

           0       0.88      0.61      0.72     15927
           1       0.30      0.66      0.41      3971

    accuracy                           0.62     19898
   macro avg       0.59      0.64      0.57     19898
weighted avg       0.76      0.62      0.66     19898



In [5]:
y_train.value_counts()

target_numeric
0                 63433
1                 16159
dtype: int64

Balancing the train dataset by downsampling the majority class

In [6]:
# combine x and y train datasets
train_data = pd.concat([x_train, y_train], axis=1)

# Separate majority and minority classes
df_majority = train_data[train_data.target_numeric==0]
df_minority = train_data[train_data.target_numeric==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement 
                                 n_samples=16159,   # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
# df_downsampled.target_numeric.value_counts()

# Splitting df_downsampled into x and y
target ='target_numeric'
x_train = df_downsampled.drop([target],1)
y_train = df_downsampled[target]

Logistic regression on balanced train dataset

In [7]:
# Logistic regression on balanced dataset

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# Model development

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()
lr_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0921s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1862s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jo

RandomizedSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
                   param_distributions={'C': array([1.000e-02, 1.020e+00, 2.030e+00, 3.040e+00, 4.050e+00, 5.060e+00,
       6.070e+00, 7.080e+00, 8.090e+00, 9.100e+00, 1.011e+01, 1.112e+01,
       1.213e+01, 1.314e+01, 1.415e+01, 1.516e+01, 1.617e+01, 1.718e+01,
       1.819e+01, 1.920e+01, 2.021e+01, 2.122e+01, 2.223e+01, 2.324e+01,
       2.425e+01, 2....
       7.273e+01, 7.374e+01, 7.475e+01, 7.576e+01, 7.677e+01, 7.778e+01,
       7.879e+01, 7.980e+01, 8.081e+01, 8.182e+01, 8.283e+01, 8.384e+01,
       8.485e+01, 8.586e+01, 8.687e+01, 8.788e+01, 8.889e+01, 8.990e+01,
       9.091e+01, 9.192e+01, 9.293e+01, 9.394e+01, 9.495e+01, 9.596e+01,
       9.697e+01, 9.798e+01, 9.899e+01, 1.000e+02]),
                                        'class_weight': ['balanced', None],
                                        'penalty': ['l1', 'l2']},
                   scoring='roc_auc', verbose=20)

In [8]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.6678406754994936
confusion_matrix: 
 [[9297 6630]
 [1233 2738]]
classification_report: 
               precision    recall  f1-score   support

           0       0.88      0.58      0.70     15927
           1       0.29      0.69      0.41      3971

    accuracy                           0.60     19898
   macro avg       0.59      0.64      0.56     19898
weighted avg       0.77      0.60      0.64     19898



In [10]:
# RandomForest classifier

# model development

params = {'n_estimators':[100, 500],
          'min_samples_split':[2, 5, None],
          'min_samples_leaf':[1,10,100]}

model = RandomForestClassifier()
rf_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'min_samples_leaf': [1, 100, 100],
                                        'min_samples_split': [2, 5, None],
                                        'n_estimators': [100, 500]},
                   scoring='roc_auc', verbose=20)

In [11]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = rf_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7224646455370533
confusion_matrix: 
 [[9099 6828]
 [ 882 3089]]
classification_report: 
               precision    recall  f1-score   support

           0       0.91      0.57      0.70     15927
           1       0.31      0.78      0.44      3971

    accuracy                           0.61     19898
   macro avg       0.61      0.67      0.57     19898
weighted avg       0.79      0.61      0.65     19898



In [15]:
# GradientBoostingClassifier classifier

# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.1, 0.01],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [2, 5],
         'tol': [0.001, 0.0001]}

model = GradientBoostingClassifier()
gb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

RandomizedSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.01],
                                        'min_samples_leaf': [1, 100, 100],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [100, 500],
                                        'tol': [0.001, 0.0001]},
                   scoring='roc_auc', verbose=20)

In [16]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = gb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7305387538653163
confusion_matrix: 
 [[8840 7087]
 [ 790 3181]]
classification_report: 
               precision    recall  f1-score   support

           0       0.92      0.56      0.69     15927
           1       0.31      0.80      0.45      3971

    accuracy                           0.60     19898
   macro avg       0.61      0.68      0.57     19898
weighted avg       0.80      0.60      0.64     19898



In [19]:
# Stacking Classifier

estimators = [('rf', rf_model),
              ('gb', gb_model)]
final_estimator = lr_model
              
stacking_model = StackingClassifier(estimators = estimators, 
                                  final_estimator = final_estimator)
stacking_model.fit(x_train,
                   y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0180s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0590s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jo

StackingClassifier(estimators=[('rf',
                                RandomizedSearchCV(cv=10,
                                                   estimator=RandomForestClassifier(),
                                                   n_jobs=-1,
                                                   param_distributions={'min_samples_leaf': [1,
                                                                                             100,
                                                                                             100],
                                                                        'min_samples_split': [2,
                                                                                              5,
                                                                                              None],
                                                                        'n_estimators': [100,
                                                                         

In [20]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = stacking_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = stacking_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7307894253808498
confusion_matrix: 
 [[9264 6663]
 [ 918 3053]]
classification_report: 
               precision    recall  f1-score   support

           0       0.91      0.58      0.71     15927
           1       0.31      0.77      0.45      3971

    accuracy                           0.62     19898
   macro avg       0.61      0.68      0.58     19898
weighted avg       0.79      0.62      0.66     19898



In [21]:
from sklearn.naive_bayes import GaussianNB

# model development

params = {'var_smoothing':[1e-08, 1e-09]}

model = GaussianNB()
nb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
nb_model.fit(x_train,
             y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    8.2s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:    8.3s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  20 

RandomizedSearchCV(cv=10, estimator=GaussianNB(), n_jobs=-1,
                   param_distributions={'var_smoothing': [1e-08, 1e-09]},
                   scoring='roc_auc', verbose=20)

In [22]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = nb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = nb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.5885477759844134
confusion_matrix: 
 [[8277 7650]
 [1351 2620]]
classification_report: 
               precision    recall  f1-score   support

           0       0.86      0.52      0.65     15927
           1       0.26      0.66      0.37      3971

    accuracy                           0.55     19898
   macro avg       0.56      0.59      0.51     19898
weighted avg       0.74      0.55      0.59     19898



In [None]:
# predicted_train = gb_model.predict_proba(x_train)[:,1]


submissions=pd.DataFrame({'Junk':gb_model.predict_proba(x_test)[:,1]})
submissions.to_csv('submission_3.csv',index=False)