import dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import fbeta_score
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC

read data


In [3]:
x_train = pd.read_csv("data/processed_data/x_train.csv")
x_val = pd.read_csv("data/processed_data/x_val.csv")
y_train = pd.read_csv("data/processed_data/y_train.csv")
y_val = pd.read_csv("data/processed_data/y_val.csv")
x_test = pd.read_csv("data/processed_data/x_test.csv")

Balancing the train dataset by upsampling the majority class

In [3]:
y_train.value_counts()

CARAVAN Number of mobile home policies
0.0                                       4390
1.0                                        267
dtype: int64

In [4]:
# combine x and y train datasets
train_data = pd.concat([x_train, y_train], axis=1)
# Separate majority and minority classes
df_majority = train_data[train_data["CARAVAN Number of mobile home policies"]==0]
df_minority = train_data[train_data["CARAVAN Number of mobile home policies"]==1]
 
# Downsample majority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample without replacement 
                                 n_samples=1000,   # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
 
# Display new class counts
df_upsampled["CARAVAN Number of mobile home policies"].value_counts()

# Splitting df_downsampled into x and y
target ='CARAVAN Number of mobile home policies'
x_train = df_upsampled.drop([target],1)
y_train = df_upsampled[target]

# RandomForest classifier


In [5]:
# model development

params = {'n_estimators':[100, 500, 1000],
          'min_samples_split':[2, 5, None],
          'min_samples_leaf':[1,10,100]}

model = RandomForestClassifier()
rf_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')

rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  7.5min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [2, 5, None],
                         'n_estimators': [100, 500, 1000]},
             scoring='f1', verbose=5)

In [6]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = rf_model.predict_proba(x_val)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.3104359313077939
roc_auc_score: 0.6479203680925698
confusion_matrix: 
 [[698 386]
 [ 34  47]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.95      0.64      0.77      1084
         1.0       0.11      0.58      0.18        81

    accuracy                           0.64      1165
   macro avg       0.53      0.61      0.48      1165
weighted avg       0.89      0.64      0.73      1165



In [7]:
pd.Series(val_classes).value_counts()

0    732
1    433
dtype: int64

# GradientBoosting Classifier


In [8]:

# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.01, 0.001],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [5, 10],
         'tol': [0.001, 0.0001]}


model = GradientBoostingClassifier()
gb_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 13.9min finished


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.001],
                         'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 500], 'tol': [0.001, 0.0001]},
             scoring='f1', verbose=5)

In [9]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = gb_model.predict_proba(x_val)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.3895274584929757
roc_auc_score: 0.7776638877499886
confusion_matrix: 
 [[686 398]
 [ 20  61]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.97      0.63      0.77      1084
         1.0       0.13      0.75      0.23        81

    accuracy                           0.64      1165
   macro avg       0.55      0.69      0.50      1165
weighted avg       0.91      0.64      0.73      1165



In [10]:
pd.Series(val_classes).value_counts()

0    706
1    459
dtype: int64

# Logistic regression

In [11]:

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()

lr_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')

lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 984 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 1236 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1632 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2100 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2640 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3252 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3936 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  4.0min finished


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([ 0.01      ,  0.11090909,  0.21181818,  0.31272727,  0.41363636,
        0.51454545,  0.61545455,  0.71636364,  0.81727273,  0.91818182,
        1.01909091,  1.12      ,  1.22090909,  1.32181818,  1.42272727,
        1.52363636,  1.62454545,  1.72545455,  1.82636364,  1.92727273,
        2.02818182,  2.12909091,  2.23      ,  2.33090909,  2.43181818,
        2.53272727...
        7.07363636,  7.17454545,  7.27545455,  7.37636364,  7.47727273,
        7.57818182,  7.67909091,  7.78      ,  7.88090909,  7.98181818,
        8.08272727,  8.18363636,  8.28454545,  8.38545455,  8.48636364,
        8.58727273,  8.68818182,  8.78909091,  8.89      ,  8.99090909,
        9.09181818,  9.19272727,  9.29363636,  9.39454545,  9.49545455,
        9.59636364,  9.69727273,  9.79818182,  9.89909091, 10.        ]),
                         'class_weight': ['balanced', None],
                         'pena

In [12]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.3931924882629109
roc_auc_score: 0.7650733451778963
confusion_matrix: 
 [[623 461]
 [ 14  67]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.98      0.57      0.72      1084
         1.0       0.13      0.83      0.22        81

    accuracy                           0.59      1165
   macro avg       0.55      0.70      0.47      1165
weighted avg       0.92      0.59      0.69      1165



# SVM SGD Classifier

In [28]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'penalty':['l1','l2'],
          'tol':[0.001, 0.0001],
          'alpha': [0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = SGDClassifier(loss='hinge')

svm_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='f1')
svm_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   38.7s finished


GridSearchCV(cv=10, estimator=SGDClassifier(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 1e-05],
                         'max_iter': [500, 1000, 1500], 'penalty': ['l1', 'l2'],
                         'tol': [0.001, 0.0001]},
             scoring='f1', verbose=5)

In [29]:
# selecting the cut-off for determining hardclasses

# prediction on validation dataset
train_classes = svm_model.predict(x_train_scaled)

fbeta=fbeta_score(y_train, 
                 train_classes,
                  beta=2)
cm = confusion_matrix(y_train, 
                     train_classes)
class_report = classification_report(y_train, 
                                    train_classes)

print("**********Train***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_train, train_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

# prediction on validation dataset

val_classes = svm_model.predict(x_val_scaled)

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("**********Validation***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, val_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


**********Train***********
fbeta: 0.17458100558659218
roc_auc_score: 0.558371298405467
confusion_matrix: 
 [[4244  146]
 [ 850  150]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.83      0.97      0.89      4390
         1.0       0.51      0.15      0.23      1000

    accuracy                           0.82      5390
   macro avg       0.67      0.56      0.56      5390
weighted avg       0.77      0.82      0.77      5390

**********Validation***********
fbeta: 0.08356545961002786
roc_auc_score: 0.5236606532731993
confusion_matrix: 
 [[1055   29]
 [  75    6]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      1084
         1.0       0.17      0.07      0.10        81

    accuracy                           0.91      1165
   macro avg       0.55      0.52      0.53      1165
weighted avg       0.88      0.91      0.89      1165



# Linear Support Vector Classification

In [37]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# model development

params = {'tol':[0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = LinearSVC()

svc_svm_model = GridSearchCV(model,
                            cv=10,
                            param_grid=params,
                            n_jobs=-1,
                            verbose=5,
                            scoring='f1')
svc_svm_model.fit(x_train_scaled, 
                  y_train.values.ravel())

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


GridSearchCV(cv=10, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'max_iter': [500, 1000, 1500], 'tol': [0.0001, 1e-05]},
             scoring='f1', verbose=5)

In [39]:
# selecting the cut-off for determining hardclasses

# prediction on validation dataset
train_classes = svc_svm_model.predict(x_train_scaled)

fbeta=fbeta_score(y_train, 
                 train_classes,
                  beta=2)
cm = confusion_matrix(y_train, 
                     train_classes)
class_report = classification_report(y_train, 
                                    train_classes)

print("**********Train***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_train, train_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

# prediction on validation dataset

val_classes = svc_svm_model.predict(x_val_scaled)

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("**********Validation***********")
print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, val_classes)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


**********Train***********
fbeta: 0.20248127340823968
roc_auc_score: 0.5752243735763098
confusion_matrix: 
 [[4291   99]
 [ 827  173]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.84      0.98      0.90      4390
         1.0       0.64      0.17      0.27      1000

    accuracy                           0.83      5390
   macro avg       0.74      0.58      0.59      5390
weighted avg       0.80      0.83      0.79      5390

**********Validation***********
fbeta: 0.07122507122507121
roc_auc_score: 0.5207165960548495
confusion_matrix: 
 [[1062   22]
 [  76    5]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.93      0.98      0.96      1084
         1.0       0.19      0.06      0.09        81

    accuracy                           0.92      1165
   macro avg       0.56      0.52      0.52      1165
weighted avg       0.88      0.92      0.90      1165



# Stacking Classifier


In [30]:
estimators = [('gb', gb_model),
             ('lr', lr_model)]
final_estimator = lr_model
              
stacking_model = StackingClassifier(estimators = estimators, 
                                  final_estimator = final_estimator)
stacking_model.fit(x_train,
                   y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 14.7min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 462 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 700 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 1420 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1888 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2428 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3040 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3656 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  5.2min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.5min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.9min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.5min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.5min finished


Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.4min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 1328 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1724 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2732 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3344 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3993 out of 4000 | elapsed:  3.9min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  3.9min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 912 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done 1232 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 1628 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2096 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2636 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3248 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3932 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  4.0min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation f

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 1071 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 1328 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1724 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2732 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3344 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3993 out of 4000 | elapsed:  3.9min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  3.9min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 1416 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1812 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2820 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3432 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  3.9min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 1364 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1760 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2228 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2768 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3380 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  3.9min finished


Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1104 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 2112 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 3408 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:   33.2s finished


StackingClassifier(estimators=[('gb',
                                GridSearchCV(cv=10,
                                             estimator=GradientBoostingClassifier(),
                                             n_jobs=-1,
                                             param_grid={'learning_rate': [0.01,
                                                                           0.001],
                                                         'min_samples_leaf': [1,
                                                                              10,
                                                                              100],
                                                         'min_samples_split': [5,
                                                                               10],
                                                         'n_estimators': [100,
                                                                          500],
                                 

In [40]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = stacking_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# fbetas statistic 

cutoffs = np.linspace(0.001,0.999,999)
fbetas=[]
for cutoff in cutoffs:    
    predicted=(predicted_train>cutoff).astype(int)  
    fbetas.append(fbeta_score(actual, predicted, beta=2))
    
# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[fbetas == max(fbetas)][0]
predicted_val = stacking_model.predict_proba(x_val_scaled)[:,1]

val_classes = (predicted_val>cutoff_optimum).astype(int)
# pd.Series(val_classes).value_counts()

fbeta=fbeta_score(y_val, 
                 val_classes,
                  beta=2)
cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)

print("fbeta:", fbeta)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)


fbeta: 0.40045766590389004
roc_auc_score: 0.7663830804974716
confusion_matrix: 
 [[604 480]
 [ 11  70]]
classification_report: 
               precision    recall  f1-score   support

         0.0       0.98      0.56      0.71      1084
         1.0       0.13      0.86      0.22        81

    accuracy                           0.58      1165
   macro avg       0.55      0.71      0.47      1165
weighted avg       0.92      0.58      0.68      1165



In [41]:
# pd.Series(y_val).value_counts()
# pd.Series(val_classes).value_counts()

0    615
1    550
dtype: int64

Final predictions using stacking model based on fbeta score

In [33]:
predictions=(stacking_model.predict_proba(x_test)[:,1]>cutoff_optimum).astype(int)

In [34]:
# pd.Series(predictions).value_counts()

1    2268
0    1732
dtype: int64

In [35]:
submissions=pd.DataFrame({'V86':predictions})
submissions.to_csv('output/prediction_submission_2.csv',index=False)