import dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import fbeta_score
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC

read data


In [2]:
x_train = pd.read_csv("../"data/processed_data/x_train.csv")
x_val = pd.read_csv("../data/processed_data/x_val.csv")
y_train = pd.read_csv("../data/processed_data/y_train.csv")
y_val = pd.read_csv("../data/processed_data/y_val.csv")
x_test = pd.read_csv("../data/processed_data/x_test.csv")
y_test = pd.read_csv("../data/processed_data/y_test.csv")

In [3]:
# x_train = np.load("../data/processed_data/x_train.npy")
# x_val = np.load("../data/processed_data/x_val.npy")
# y_train = np.load("../data/processed_data/y_train.npy")
# y_val = np.load("../data/processed_data/y_val.npy")
# x_test = np.load("../data/processed_data/x_test.npy")
# y_test = np.load("../data/processed_data/y_test.npy")

In [4]:
x_val.shape

(15199, 102)

Balancing the train dataset by upsampling the majority class

In [5]:
x_train.shape

(50663, 102)

# RandomForest classifier


In [6]:
# model development

params = {'n_estimators':[100, 500, 1000],
          'min_samples_split':[2, 5, None],
          'min_samples_leaf':[1,10,100]}

model = RandomForestClassifier()
rf_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')

rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 25.3min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 35.9min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [2, 5, None],
                         'n_estimators': [100, 500, 1000]},
             scoring='f1', verbose=5)

In [7]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = rf_model.predict_proba(x_val)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                    beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.9957184108334165
roc_auc_score: 1.0
confusion_matrix: 
 [[7027  172]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      7199
           1       0.98      1.00      0.99      8000

    accuracy                           0.99     15199
   macro avg       0.99      0.99      0.99     15199
weighted avg       0.99      0.99      0.99     15199



In [8]:
predictions=(rf_model.predict_proba(x_test)[:,1]>cutoff_optimum).astype(int)
fbeta=fbeta_score(y_test, 
                 predictions,
                    beta=2)
cm = confusion_matrix(y_test, 
                 predictions)
class_report = classification_report(y_test, 
                 predictions)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_test, predictions)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

fbeta: 0.9960494974728402
roc_auc_score: 0.9889825016202203
confusion_matrix: 
 [[3018   68]
 [   0 3429]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      3086
           1       0.98      1.00      0.99      3429

    accuracy                           0.99      6515
   macro avg       0.99      0.99      0.99      6515
weighted avg       0.99      0.99      0.99      6515



In [9]:
pd.Series(val_classes).value_counts()

1    8172
0    7027
dtype: int64

# GradientBoosting Classifier


In [10]:
# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.01, 0.001],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [5, 10],
         'tol': [0.001, 0.0001]}


model = GradientBoostingClassifier()
gb_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 37.8min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 69.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 110.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 120.4min finished


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.001],
                         'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 500], 'tol': [0.001, 0.0001]},
             scoring='f1', verbose=5)

In [11]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = gb_model.predict_proba(x_val)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[7199    0]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7199
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     15199
   macro avg       1.00      1.00      1.00     15199
weighted avg       1.00      1.00      1.00     15199



In [12]:
pd.Series(val_classes).value_counts()

1    8000
0    7199
dtype: int64

# Logistic regression

In [13]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()

lr_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')

lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | el

GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([ 0.01      ,  0.11090909,  0.21181818,  0.31272727,  0.41363636,
        0.51454545,  0.61545455,  0.71636364,  0.81727273,  0.91818182,
        1.01909091,  1.12      ,  1.22090909,  1.32181818,  1.42272727,
        1.52363636,  1.62454545,  1.72545455,  1.82636364,  1.92727273,
        2.02818182,  2.12909091,  2.23      ,  2.33090909,  2.43181818,
        2.53272727...
        7.07363636,  7.17454545,  7.27545455,  7.37636364,  7.47727273,
        7.57818182,  7.67909091,  7.78      ,  7.88090909,  7.98181818,
        8.08272727,  8.18363636,  8.28454545,  8.38545455,  8.48636364,
        8.58727273,  8.68818182,  8.78909091,  8.89      ,  8.99090909,
        9.09181818,  9.19272727,  9.29363636,  9.39454545,  9.49545455,
        9.59636364,  9.69727273,  9.79818182,  9.89909091, 10.        ]),
                         'class_weight': ['balanced', None],
                         'pena

In [14]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.9999750006249843
roc_auc_score: 1.0
confusion_matrix: 
 [[7198    1]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7199
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     15199
   macro avg       1.00      1.00      1.00     15199
weighted avg       1.00      1.00      1.00     15199



In [15]:
predictions=(lr_model.predict_proba(x_test)[:,1]>cutoff_optimum).astype(int)
fbeta=fbeta_score(y_test, 
                 predictions,
                    beta=2)
cm = confusion_matrix(y_test, 
                 predictions)
class_report = classification_report(y_test, 
                 predictions)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_test, predictions)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

fbeta: 0.8474618160249122
roc_auc_score: 0.5
confusion_matrix: 
 [[   0 3086]
 [   0 3429]]
classification_report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      3086
           1       0.53      1.00      0.69      3429

    accuracy                           0.53      6515
   macro avg       0.26      0.50      0.34      6515
weighted avg       0.28      0.53      0.36      6515



  _warn_prf(average, modifier, msg_start, len(result))


# SVM SGD Classifier

In [16]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'penalty':['l1','l2'],
          'tol':[0.001, 0.0001],
          'alpha': [0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = SGDClassifier(loss='hinge')

svm_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')
svm_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.2min finished


GridSearchCV(cv=10, estimator=SGDClassifier(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 1e-05],
                         'max_iter': [500, 1000, 1500], 'penalty': ['l1', 'l2'],
                         'tol': [0.001, 0.0001]},
             scoring='f1', verbose=5)

In [17]:
# selecting the cut-off for determining hardclasses

# prediction on validation dataset
train_classes = svm_model.predict(x_train_scaled)

fbeta=fbeta_score(y_train, 
                 train_classes,
                  beta=2)
cm = confusion_matrix(y_train, 
                     train_classes)
class_report = classification_report(y_train, 
                                    train_classes)

print("**********Train***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_train, train_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

# prediction on validation dataset

val_classes = svm_model.predict(x_val_scaled)

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("**********Validation***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, val_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


**********Train***********
fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[23998     0]
 [    0 26665]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23998
           1       1.00      1.00      1.00     26665

    accuracy                           1.00     50663
   macro avg       1.00      1.00      1.00     50663
weighted avg       1.00      1.00      1.00     50663

**********Validation***********
fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[7199    0]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7199
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     15199
   macro avg       1.00      1.00      1.00     15199
weighted avg       1.00      1.00      1.00     15199



# Linear Support Vector Classification

In [18]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'tol':[0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = LinearSVC()

svc_svm_model = GridSearchCV(model,
                            cv=10,
                            param_grid=params,
                            n_jobs=-1,
                            verbose=5,
                            scoring='f1')
svc_svm_model.fit(x_train_scaled, 
                  y_train.values.ravel())

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 16.3min finished


GridSearchCV(cv=10, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'max_iter': [500, 1000, 1500], 'tol': [0.0001, 1e-05]},
             scoring='f1', verbose=5)

In [19]:
# selecting the cut-off for determining hardclasses

# prediction on validation dataset
train_classes = svc_svm_model.predict(x_train_scaled)

fbeta=fbeta_score(y_train, 
                 train_classes,
                  beta=2)
cm = confusion_matrix(y_train, 
                     train_classes)
class_report = classification_report(y_train, 
                                    train_classes)

print("**********Train***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_train, train_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

# prediction on validation dataset

val_classes = svc_svm_model.predict(x_val_scaled)

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("**********Validation***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, val_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


**********Train***********
fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[23998     0]
 [    0 26665]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23998
           1       1.00      1.00      1.00     26665

    accuracy                           1.00     50663
   macro avg       1.00      1.00      1.00     50663
weighted avg       1.00      1.00      1.00     50663

**********Validation***********
fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[7199    0]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7199
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     15199
   macro avg       1.00      1.00      1.00     15199
weighted avg       1.00      1.00      1.00     15199



# Stacking Classifier


In [20]:
estimators = [('gb', gb_model),
             ('lr', lr_model)]
final_estimator = lr_model
              
stacking_model = StackingClassifier(estimators = estimators, 
                                  final_estimator = final_estimator)
stacking_model.fit(x_train,
                   y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 69.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 110.3min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 120.4min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 31.3min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 36.4min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | el

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 83.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 91.2min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 84.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 91.4min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 83.8min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 91.1min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 84.3min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 91.7min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 59.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 104.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 116.6min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 43.0min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | el

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 38.5min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 45.0min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | el

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 47.3min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 52.8min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increa

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 22.6min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 36.0min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | el

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 2520 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3456 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  2.2min finished


StackingClassifier(estimators=[('gb',
                                GridSearchCV(cv=10,
                                             estimator=GradientBoostingClassifier(),
                                             n_jobs=-1,
                                             param_grid={'learning_rate': [0.01,
                                                                           0.001],
                                                         'min_samples_leaf': [1,
                                                                              10,
                                                                              100],
                                                         'min_samples_split': [5,
                                                                               10],
                                                         'n_estimators': [100,
                                                                          500],
                                 

In [21]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = stacking_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = stacking_model.predict_proba(x_val_scaled)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 1.0
roc_auc_score: 1.0
confusion_matrix: 
 [[7199    0]
 [   0 8000]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7199
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     15199
   macro avg       1.00      1.00      1.00     15199
weighted avg       1.00      1.00      1.00     15199



In [22]:
# pd.Series(y_val).value_counts()
# pd.Series(val_classes).value_counts()

Final predictions using stacking model based on fbeta score

In [23]:
predictions=(stacking_model.predict_proba(x_test)[:,1]>cutoff_optimum).astype(int)

In [24]:
# pd.Series(predictions).value_counts()

In [26]:
# submissions=pd.DataFrame({'V86':predictions})
# submissions.to_csv('output/prediction_submission_2.csv',index=False)

In [45]:
def save_pickle_model(model_parameters, model_path, file_opts):
    # Save the model to disk
    with open(model_path, file_opts) as pickle_out:
        pickle.dump(model_parameters, pickle_out)

def load_pickle_model(model_path, file_opts):
    # Load the model from disk
    with open(model_path, file_opts) as pickle_in:
        return pickle.load(pickle_in)

In [54]:
import os

output_path = "../output"
model_path = os.path.join(output_path, "model", "artifact")
if not os.path.exists(model_path):
    os.makedirs(model_path)  
    
# save models

save_pickle_model(rf_model.best_estimator_, os.path.join(
    model_path, "rf_model.pickle"), "wb") 
save_pickle_model(gb_model.best_estimator_, os.path.join(
    model_path, "gb_model.pickle"), "wb") 
save_pickle_model(lr_model.best_estimator_, os.path.join(
    model_path, "lr_model.pickle"), "wb") 
save_pickle_model(svm_model.best_estimator_, os.path.join(
    model_path, "svm_model.pickle"), "wb") 
save_pickle_model(svc_svm_model.best_estimator_, os.path.join(
    model_path, "svc_svm_model.pickle"), "wb") 
save_pickle_model(stacking_model, os.path.join(
    model_path, "stacking_model.pickle"), "wb") 


AttributeError: 'RandomForestClassifier' object has no attribute 'best_estimator_'

In [55]:
# load models

            
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")







