In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#model processing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

#models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

#evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, matthews_corrcoef

#model deployment
import pickle


In [2]:
from google.colab import files
uploaded = files.upload()

Saving bank_dash.csv to bank_dash.csv


In [0]:
import io
bank_dash = pd.read_csv(io.BytesIO(uploaded['bank_dash.csv']))

In [0]:
bank_dash.head()

Unnamed: 0,Age,Job,Marital Status,Education,Default,Housing Loan,Personal Loan,Contact Type,Month,Day of Week,Last Call Duration,Calls in this Campaign,Days passed after previous campaign,Calls in previous Campaign,Previous Campaign Outcome,Employment Variation Rate,Consumer Price Index,Consumer Confidence Index,Euribor 3M,Number of people employed,y
0,56,Housemaid,Married,Basic 4y,No,No,No,Telephone,May,Mon,261,1,999,0,Nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,Services,Married,High school,Unknown,No,No,Telephone,May,Mon,149,1,999,0,Nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,Services,Married,High school,No,Yes,No,Telephone,May,Mon,226,1,999,0,Nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,Admin.,Married,Basic 6y,No,No,No,Telephone,May,Mon,151,1,999,0,Nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,Services,Married,High school,No,No,Yes,Telephone,May,Mon,307,1,999,0,Nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [0]:
target = bank_dash['y'].apply(lambda x: 1 if x == 'yes' else 0)

### Let's see what if we used only customer related features to predict the outcome

In [0]:
# Initiate the model
log_reg = LogisticRegression(solver = 'saga', penalty = 'l1', class_weight = 'balanced', max_iter = 1500, C = 1)

In [0]:
bank_fin1 = bank_dash[['Job', 'Marital Status', 'Education', 'Contact Type', 'Age']]

In [0]:
fin1 = pd.get_dummies(bank_fin1, columns = bank_fin1.select_dtypes(exclude = 'number').columns)

In [0]:
fin1.head()

Unnamed: 0,Age,Job_Admin.,Job_Blue-collar,Job_Entrepreneur,Job_Housemaid,Job_Management,Job_Retired,Job_Self-employed,Job_Services,Job_Student,Job_Technician,Job_Unemployed,Job_Unknown,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Unknown,Education_Basic 4y,Education_Basic 6y,Education_Basic 9y,Education_High school,Education_Illiterate,Education_Professional course,Education_University degree,Education_Unknown,Contact Type_Cellular,Contact Type_Telephone
0,56,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
1,57,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
2,37,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,40,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
4,56,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


In [0]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(fin1, target, test_size = 0.3, random_state = 101)

In [0]:
# Scaling the X_train and transforming the X_test -- these are the ones we will use
scaler = StandardScaler()

X_trainscale = scaler.fit_transform(X_train2)
X_testscale = scaler.transform(X_test2)

In [0]:
log_reg.fit(X_trainscale, y_train2)



LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
train_pred = log_reg.predict(X_trainscale)

In [0]:
f1_score(y_train2, train_pred)

0.274804641250296

In [0]:
test_pred = log_reg.predict(X_testscale)

In [0]:
print(classification_report(y_test2, test_pred))

              precision    recall  f1-score   support

           0       0.93      0.55      0.69     10978
           1       0.16      0.69      0.26      1379

    accuracy                           0.57     12357
   macro avg       0.55      0.62      0.48     12357
weighted avg       0.85      0.57      0.64     12357



In [0]:
print(confusion_matrix(y_test2, test_pred))

[[6037 4941]
 [ 429  950]]


Here we can see that actually our model performs poorly with only using these features. I think for now we need to tune model once again.

In [0]:
# Logistic Regression

solver = ['saga']
max_iter = [1000, 1500, 2000, 3000]
penalty = ['l1', 'l2']
class_weight = [None, 'balanced', {0: 0.5, 1: 5}]
C = [0.01, 0.1, 1, 10,]

log_reg_param = {'solver': solver,
                 'max_iter': max_iter,
                 'penalty': penalty,
                 'class_weight': class_weight,
                 'C': C}

In [0]:
# Let's use original logistic regression

log_reg = LogisticRegression()

In [0]:
# randomized

log_random= RandomizedSearchCV(estimator = log_reg,
                              param_distributions = log_reg_param,
                              n_iter = 15, cv =5, n_jobs = -1, scoring = 'f1')

In [0]:
log_random.fit(X_trainscale, y_train2)



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=15, n_jobs=-1,
                   param_distributions={'C': [0.01, 0.1, 1, 10],
                                        'class_weight': [None, 'balanced',
                                                         {0: 0.5, 1: 5}],
                                        'max_iter': [1000, 1500, 2000

In [0]:
log_random.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'max_iter': 3000,
 'penalty': 'l1',
 'solver': 'saga'}

In [0]:
log_random.best_score_

0.2733478384995712

In [0]:
log_1 = log_random.predict(X_trainscale)

In [0]:
f1_score(y_train2, log_1)

0.2747025042922266

In [0]:
log_test = log_random.predict(X_testscale)

In [0]:
f1_score(y_test2, log_test)

0.2614558965185083

In [0]:
print(confusion_matrix(y_test2, log_test))

[[6040 4938]
 [ 429  950]]


In [0]:
print(classification_report(y_test2, log_test))

              precision    recall  f1-score   support

           0       0.93      0.55      0.69     10978
           1       0.16      0.69      0.26      1379

    accuracy                           0.57     12357
   macro avg       0.55      0.62      0.48     12357
weighted avg       0.85      0.57      0.64     12357



I think it's really extreme to only use the customer's characteristic. Let's also include the socio-economic features -- and fit it into our original log_reg (not the last tuned one)

In [0]:
bank_dash.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
Age                                    41188 non-null int64
Job                                    41188 non-null object
Marital Status                         41188 non-null object
Education                              41188 non-null object
Default                                41188 non-null object
Housing Loan                           41188 non-null object
Personal Loan                          41188 non-null object
Contact Type                           41188 non-null object
Month                                  41188 non-null object
Day of Week                            41188 non-null object
Last Call Duration                     41188 non-null int64
Calls in this Campaign                 41188 non-null int64
Days passed after previous campaign    41188 non-null int64
Calls in previous Campaign             41188 non-null int64
Previous Campaign Outcome              411

In [0]:
bank_fin2 = bank_dash[['Job', 'Marital Status', 'Education', 'Age', 'Euribor 3M', 'Consumer Price Index', 'Consumer Confidence Index']]

In [0]:
fin2 = pd.get_dummies(bank_fin2)

In [0]:
fin2.head()

Unnamed: 0,Age,Euribor 3M,Consumer Price Index,Consumer Confidence Index,Job_Admin.,Job_Blue-collar,Job_Entrepreneur,Job_Housemaid,Job_Management,Job_Retired,Job_Self-employed,Job_Services,Job_Student,Job_Technician,Job_Unemployed,Job_Unknown,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Unknown,Education_Basic 4y,Education_Basic 6y,Education_Basic 9y,Education_High school,Education_Illiterate,Education_Professional course,Education_University degree,Education_Unknown
0,56,4.857,93.994,-36.4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
1,57,4.857,93.994,-36.4,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
2,37,4.857,93.994,-36.4,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,40,4.857,93.994,-36.4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
4,56,4.857,93.994,-36.4,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [0]:
# Initiate the model
log_reg = LogisticRegression(solver = 'saga', penalty = 'l1', class_weight = 'balanced', max_iter = 1500, C = 1)

In [0]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(fin2, target, test_size = 0.3, random_state = 101)

In [0]:
X_train3scale = scaler.fit_transform(X_train3)
X_test3scale = scaler.transform(X_test3)

In [0]:
log_reg.fit(X_train3scale, y_train3)

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
train3_pred = log_reg.predict(X_train3scale)

In [0]:
f1_score(y_train3, train3_pred)

0.3629820855824141

In [0]:
test3_pred = log_reg.predict(X_test3scale)

In [0]:
print(confusion_matrix(y_test3, test3_pred))

[[8011 2967]
 [ 430  949]]


In [0]:
print(classification_report(y_test3, test3_pred))

              precision    recall  f1-score   support

           0       0.95      0.73      0.83     10978
           1       0.24      0.69      0.36      1379

    accuracy                           0.73     12357
   macro avg       0.60      0.71      0.59     12357
weighted avg       0.87      0.73      0.77     12357



By putting socio-economic features, turns out our f1 score increased, the precision score is increased too. Our True Positives are the same as before, but this model did less misclassifications instead of the previous model -- It decreased 2000 misclassifications of False Positives.


In [0]:
## I am curious, let's tune once again

log_random.fit(X_train3scale, y_train3)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=15, n_jobs=-1,
                   param_distributions={'C': [0.01, 0.1, 1, 10],
                                        'class_weight': [None, 'balanced',
                                                         {0: 0.5, 1: 5}],
                                        'max_iter': [1000, 1500, 2000

In [0]:
log_random.best_score_

0.36407373105955976

In [0]:
train4_pred = log_random.predict(X_train3scale)

In [0]:
f1_score(y_train3, train4_pred)

0.3629820855824141

In [0]:
print(classification_report(y_train3, train4_pred))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82     25570
           1       0.24      0.71      0.36      3261

    accuracy                           0.72     28831
   macro avg       0.60      0.71      0.59     28831
weighted avg       0.87      0.72      0.77     28831



In [0]:
print(confusion_matrix(y_train3, train4_pred))

[[18368  7202]
 [  941  2320]]


In [0]:
test4_pred = log_random.predict(X_test3scale)

In [0]:
f1_score(y_test3, test4_pred)

0.3583160279403436

In [0]:
print(classification_report(y_test3, test4_pred))

              precision    recall  f1-score   support

           0       0.95      0.73      0.82     10978
           1       0.24      0.69      0.36      1379

    accuracy                           0.72     12357
   macro avg       0.60      0.71      0.59     12357
weighted avg       0.87      0.72      0.77     12357



In [0]:
print(confusion_matrix(y_test3, test4_pred))

[[8009 2969]
 [ 430  949]]


In [0]:
log_random.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'max_iter': 1000,
 'penalty': 'l2',
 'solver': 'saga'}

It does not seem to be better than our original tuned model. So we will just keep the original tuned model.
Based on this model, across all positive cases, our model can predict almost 70% accurately. However across all the positives cases the models predicted, turns out only around 30% is the true positives T__T.


In [0]:
log_reg

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

Let's try using our XGB model before

In [0]:
xgb = XGBClassifier(learning_rate = 0.2, max_depth = 5, n_estimators = 1000)

ERROR! Session/line number was not unique in database. History logging moved to new session 59


In [0]:
xgb.fit(X_train3, y_train3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
xgb_train = xgb.predict(X_train3)

In [0]:
f1_score(y_train2, xgb_train)

0.6447981366459627

In [0]:
xgb_test = xgb.predict(X_test3)

In [0]:
print(classification_report(y_test3, xgb_test))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94     10978
           1       0.47      0.26      0.34      1379

    accuracy                           0.88     12357
   macro avg       0.69      0.61      0.64     12357
weighted avg       0.86      0.88      0.87     12357



In [0]:
print(confusion_matrix(y_test3, xgb_test))

ERROR! Session/line number was not unique in database. History logging moved to new session 60
[[10562   416]
 [ 1017   362]]


In [0]:
### How about if we use the scaled X_train like the log_reg

xgb.fit(X_train3scale, y_train3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
xgb_trainscale = xgb.predict(X_train3scale)

In [0]:
f1_score(y_train3, xgb_trainscale)

0.6447981366459627

In [0]:
xgb_testscale = xgb.predict(X_test3scale)

In [0]:
print(confusion_matrix(y_test3, xgb_testscale))

[[10562   416]
 [ 1017   362]]


There is literally no difference in using the scaled or the original dataset T__T. But one thing we can observe here is the off-predictions are much less than the logistic regression model. Here we can see that although the True positives are less than the log_reg, the precision score is more. Across all positives predictions, almost 50% are correct. But there is a lot of False Negative, bank may lose prospective positive customers.

It actually comes back to bank strategy, whether they want to aggresively acquire customers(but they should be aware of the expense as well) -- or they want to be conservative, acquiring customers but also conserving resources. 

Based on all my explorations - EDA and machine learning using 2 models, I can conclude that for bank with aggresive strategy, it will be better to use logistic regression (higher recall), while for conservative strategy to use xgb model (higher precision).

Let's see whether our models are stable.

In [6]:
from google.colab import files
uploaded = files.upload()

Saving bankdum_final_logreg.csv to bankdum_final_logreg.csv


In [0]:
import io
bankdum_final_logreg = pd.read_csv(io.BytesIO(uploaded['bankdum_final_logreg.csv']))

In [8]:
bankdum_final_logreg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 28 columns):
Age                              41188 non-null int64
Euribor 3M                       41188 non-null float64
Consumer Price Index             41188 non-null float64
Consumer Confidence Index        41188 non-null float64
Job_Admin.                       41188 non-null int64
Job_Blue-collar                  41188 non-null int64
Job_Entrepreneur                 41188 non-null int64
Job_Housemaid                    41188 non-null int64
Job_Management                   41188 non-null int64
Job_Retired                      41188 non-null int64
Job_Self-employed                41188 non-null int64
Job_Services                     41188 non-null int64
Job_Student                      41188 non-null int64
Job_Technician                   41188 non-null int64
Job_Unemployed                   41188 non-null int64
Job_Unknown                      41188 non-null int64
Marital Status_Divorc

In [0]:
X_trainfin, X_testfin, y_trainfin, y_testfin = train_test_split(bankdum_final_logreg, target, test_size = 0.3, random_state = 101)

In [9]:
from google.colab import files
uploaded = files.upload()

Saving banktrain_oversample_xgb.csv to banktrain_oversample_xgb.csv


In [0]:
import io
banktrain_oversample_xgb= pd.read_csv(io.BytesIO(uploaded['banktrain_oversample_xgb.csv']))

In [12]:
banktrain_oversample_xgb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28831 entries, 0 to 28830
Data columns (total 29 columns):
Age                              28831 non-null int64
Euribor 3M                       28831 non-null float64
Consumer Price Index             28831 non-null float64
Consumer Confidence Index        28831 non-null float64
Job_Admin.                       28831 non-null int64
Job_Blue-collar                  28831 non-null int64
Job_Entrepreneur                 28831 non-null int64
Job_Housemaid                    28831 non-null int64
Job_Management                   28831 non-null int64
Job_Retired                      28831 non-null int64
Job_Self-employed                28831 non-null int64
Job_Services                     28831 non-null int64
Job_Student                      28831 non-null int64
Job_Technician                   28831 non-null int64
Job_Unemployed                   28831 non-null int64
Job_Unknown                      28831 non-null int64
Marital Status_Divorc

In [0]:
X_train_over = banktrain_oversample_xgb.drop('y', axis = 1)

In [29]:
X_train_over

Unnamed: 0,Age,Euribor 3M,Consumer Price Index,Consumer Confidence Index,Job_Admin.,Job_Blue-collar,Job_Entrepreneur,Job_Housemaid,Job_Management,Job_Retired,Job_Self-employed,Job_Services,Job_Student,Job_Technician,Job_Unemployed,Job_Unknown,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Unknown,Education_Basic 4y,Education_Basic 6y,Education_Basic 9y,Education_High school,Education_Illiterate,Education_Professional course,Education_University degree,Education_Unknown
0,40,4.860,93.994,-36.4,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0
1,25,4.962,93.918,-42.7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,48,4.959,94.465,-41.8,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,43,1.327,92.893,-46.2,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,49,4.968,93.444,-36.1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28826,30,4.959,93.918,-42.7,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
28827,50,4.857,93.994,-36.4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
28828,35,4.865,94.465,-41.8,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
28829,44,4.961,93.918,-42.7,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [0]:
y_train_over = banktrain_oversample_xgb['y']

In [28]:
y_train_over

0        0
1        0
2        0
3        0
4        0
        ..
28826    0
28827    0
28828    0
28829    0
28830    0
Name: y, Length: 28831, dtype: int64

In [0]:
# Fitting the model based on the best hyperparameter
xgb = XGBClassifier(learning_rate = 0.3, max_depth = 5, n_estimators = 1500)

In [16]:
xgb.fit(X_train_over, y_train_over)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
train_overxgb = xgb.predict(X_train_over)

In [19]:
f1_score(y_train_over, train_overxgb)

0.7103988073052554

In [0]:
test_xgb = xgb.predict(X_testfin)

In [21]:
print(classification_report(y_testfin, test_xgb))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93     10978
           1       0.44      0.28      0.34      1379

    accuracy                           0.88     12357
   macro avg       0.68      0.62      0.64     12357
weighted avg       0.86      0.88      0.87     12357



In [22]:
print(confusion_matrix(y_testfin, test_xgb))

[[10498   480]
 [  998   381]]


## XGB Model Stability
We did a lot to our logistic regression already, now let's check whether our xgb is stable

In [0]:
# Making metrics functions first

def calc_train_error(X_train, y_train, model):
    predictions = model.predict(X_train)
    predictProba = model.predict_proba(X_train)
    matt = matthews_corrcoef(y_train, predictions)
    f1 = f1_score(y_train, predictions)
    report = classification_report(y_train, predictions)
    roc_auc = roc_auc_score(y_train, predictProba[:,1])
    accuracy = accuracy_score(y_train, predictions)
    confmatrix = confusion_matrix(y_train, predictions)
    logloss = log_loss(y_train, predictProba)
    return {
        'report': report,
        'matthew':matt,
        'f1': f1,
        'roc': roc_auc,
        'accuracy': accuracy,
        'confusion': confmatrix,
        'logloss': logloss
    }
  
def calc_validation_error(X_test, y_test, model):
    predictions = model.predict(X_test)
    predictProba = model.predict_proba(X_test)
    matt = matthews_corrcoef(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictProba[:,1])
    accuracy = accuracy_score(y_test, predictions)
    confmatrix = confusion_matrix(y_test, predictions)
    logloss = log_loss(y_test, predictProba)
    return {
        'report': report,
        'matthew':matt,
        'f1': f1,
        'roc': roc_auc,
        'accuracy': accuracy,
        'confusion': confmatrix,
        'logloss': logloss
    }

def calc_metrics(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [0]:
#for xgb

k = 5
kf_xgb = KFold(n_splits = k, shuffle = True, random_state = 101)

In [0]:
train_errors = []
valid_errors = []

for train_index, val_index in kf_xgb.split(X_train_over,y_train_over):
    
    #split data
    X_trn, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_trn, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    #instantiate model -- taking the one with the best hyperparameter according to the randomsearch
    xgb = XGBClassifier(learning_rate = 0.3, max_depth = 5, n_estimators = 1500)

    #calculate error
    train_error, valid_error = calc_metrics(X_trn, y_trn, X_val, y_val, xgb)
    
    #append to appropriate list
    train_errors.append(train_error)
    valid_errors.append(valid_error)

In [31]:
matrix  = []
for i, j in zip(train_errors, valid_errors):
    matrix.append([i['accuracy'], j['accuracy'], i['roc'], j['roc'], i['f1'], j['f1'], 
                   i['matthew'], j['matthew'], i['logloss'], j['logloss']])

calc_matrix = pd.DataFrame(matrix, columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC',
                                             'Train F1 Score', 'Test F1 Score', 'Train Matthews Corr Coef', 
                                              'Test Matthews Corr Coef', 'Train Log Loss', 'Test Log Loss'])
average = []
for i in calc_matrix.columns:
    average.append(calc_matrix[i].mean())
    
average_mat = pd.DataFrame(average).T
average_mat.columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC',
                                             'Train F1 Score', 'Test F1 Score', 'Train Matthews Corr Coef', 
                                              'Test Matthews Corr Coef', 'Train Log Loss', 'Test Log Loss']
index_label = []
for i in range(1, len(calc_matrix)+1):
    index_label.append(f'{i} Iteration')
index_label.append('Average')
calculation = pd.concat([calc_matrix, average_mat])
calculation.index = index_label
calculation

Unnamed: 0,Train Accuracy,Test Accuracy,Train ROC AUC,Test ROC AUC,Train F1 Score,Test F1 Score,Train Matthews Corr Coef,Test Matthews Corr Coef,Train Log Loss,Test Log Loss
1 Iteration,0.951743,0.885729,0.975887,0.728946,0.744549,0.375355,0.73586,0.329755,0.129266,0.380448
2 Iteration,0.951658,0.8812,0.976598,0.727489,0.744442,0.37557,0.734879,0.32253,0.1266,0.389549
3 Iteration,0.951008,0.883802,0.977624,0.727709,0.743648,0.372659,0.733323,0.320102,0.126436,0.374375
4 Iteration,0.949967,0.879292,0.975258,0.726055,0.732994,0.338403,0.726142,0.285245,0.130571,0.384814
5 Iteration,0.950401,0.883455,0.976079,0.725273,0.738574,0.352601,0.729869,0.302351,0.128701,0.375622
Average,0.950956,0.882696,0.976289,0.727094,0.740841,0.362918,0.732015,0.311997,0.128315,0.380962


In [0]:
from sklearn.pipeline import Pipeline

pipe_log = Pipeline([('std_scl', StandardScaler()), 
                    ('log_reg', LogisticRegression(solver = 'saga', penalty = 'l1', class_weight = 'balanced', max_iter = 1500, C = 1))])

In [0]:
scaler = StandardScaler()

In [0]:
# For Logistic Regression
k = 5
kf_log = KFold(n_splits = k, shuffle = True, random_state = 101)

In [37]:
train_errors = []
valid_errors = []

for train_index, val_index in kf_log.split(bankdum_final_logreg,target):
    
    #split data
    X_trn, X_val = bankdum_final_logreg.iloc[train_index], bankdum_final_logreg.iloc[val_index]
    y_trn, y_val = target.iloc[train_index], target.iloc[val_index]
    
    #instantiate model -- taking the one with the best hyperparameter according to the randomsearch
    log_reg = LogisticRegression(solver = 'saga', penalty = 'l1', class_weight = 'balanced', max_iter = 1500, C = 1)

    #Scaling our X_train_log and X_val_log
    X_tr_scale = scaler.fit_transform(X_trn)
    X_val_scale = scaler.transform(X_val)


    #calculate error
    train_error, valid_error = calc_metrics(X_tr_scale, y_trn, X_val_scale, y_val, log_reg)
    
    #append to appropriate list
    train_errors.append(train_error)
    valid_errors.append(valid_error)



In [38]:
matrix  = []
for i, j in zip(train_errors, valid_errors):
    matrix.append([i['accuracy'], j['accuracy'], i['roc'], j['roc'], i['f1'], j['f1'], 
                   i['matthew'], j['matthew'], i['logloss'], j['logloss']])

calc_matrix = pd.DataFrame(matrix, columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC',
                                             'Train F1 Score', 'Test F1 Score', 'Train Matthews Corr Coef', 
                                              'Test Matthews Corr Coef', 'Train Log Loss', 'Test Log Loss'])
average = []
for i in calc_matrix.columns:
    average.append(calc_matrix[i].mean())
    
average_mat = pd.DataFrame(average).T
average_mat.columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC',
                                             'Train F1 Score', 'Test F1 Score', 'Train Matthews Corr Coef', 
                                              'Test Matthews Corr Coef', 'Train Log Loss', 'Test Log Loss']
index_label = []
for i in range(1, len(calc_matrix)+1):
    index_label.append(f'{i} Iteration')
index_label.append('Average')
calculation = pd.concat([calc_matrix, average_mat])
calculation.index = index_label
calculation

Unnamed: 0,Train Accuracy,Test Accuracy,Train ROC AUC,Test ROC AUC,Train F1 Score,Test F1 Score,Train Matthews Corr Coef,Test Matthews Corr Coef,Train Log Loss,Test Log Loss
1 Iteration,0.71736,0.729789,0.755358,0.747579,0.359711,0.370831,0.287386,0.290372,0.558161,0.55187
2 Iteration,0.720971,0.716436,0.757039,0.741462,0.36724,0.339367,0.292202,0.268545,0.556814,0.558462
3 Iteration,0.723005,0.715222,0.750986,0.761866,0.363307,0.357612,0.287342,0.289489,0.560821,0.566603
4 Iteration,0.718552,0.722229,0.75378,0.752876,0.358378,0.372463,0.284758,0.29563,0.560357,0.559527
5 Iteration,0.7306,0.727935,0.753179,0.756245,0.367059,0.374895,0.293631,0.300763,0.561357,0.56266
Average,0.722097,0.722322,0.754068,0.752006,0.363139,0.363034,0.289064,0.28896,0.559502,0.559824


### Preparing for model deployment

In [39]:
## preparing for model deployment -- Logistic Regression

pipe_log.fit(X_trainfin, y_trainfin)

Pipeline(memory=None,
         steps=[('std_scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('log_reg',
                 LogisticRegression(C=1, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1500,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [0]:
predict_test = pipe_log.predict(X_testfin)

In [41]:
f1_score(y_testfin, predict_test)

0.3585868127715851

In [42]:
print(confusion_matrix(y_testfin, predict_test))

[[8013 2965]
 [ 430  949]]


In [0]:
import pickle
filename = 'pipe_logreg_bank.sav';
pickle.dump(pipe_log, open(filename, 'wb'))

In [44]:
## preparing for model deployment -- XGB

xgb.fit(X_train_over, y_train_over)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
predict_testxgb = xgb.predict(X_testfin)

In [46]:
f1_score(y_testfin, predict_testxgb)

0.3401785714285715

In [0]:
import pickle
filename = 'xgb_bank.sav';
pickle.dump(xgb, open(filename, 'wb'))

In [48]:
X_testfin.columns

Index(['Age', 'Euribor 3M', 'Consumer Price Index',
       'Consumer Confidence Index', 'Job_Admin.', 'Job_Blue-collar',
       'Job_Entrepreneur', 'Job_Housemaid', 'Job_Management', 'Job_Retired',
       'Job_Self-employed', 'Job_Services', 'Job_Student', 'Job_Technician',
       'Job_Unemployed', 'Job_Unknown', 'Marital Status_Divorced',
       'Marital Status_Married', 'Marital Status_Single',
       'Marital Status_Unknown', 'Education_Basic 4y', 'Education_Basic 6y',
       'Education_Basic 9y', 'Education_High school', 'Education_Illiterate',
       'Education_Professional course', 'Education_University degree',
       'Education_Unknown'],
      dtype='object')

In [49]:
X_testfin.head()

Unnamed: 0,Age,Euribor 3M,Consumer Price Index,Consumer Confidence Index,Job_Admin.,Job_Blue-collar,Job_Entrepreneur,Job_Housemaid,Job_Management,Job_Retired,Job_Self-employed,Job_Services,Job_Student,Job_Technician,Job_Unemployed,Job_Unknown,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Unknown,Education_Basic 4y,Education_Basic 6y,Education_Basic 9y,Education_High school,Education_Illiterate,Education_Professional course,Education_University degree,Education_Unknown
3669,32,4.859,93.994,-36.4,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
20131,57,4.965,93.444,-36.1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2492,33,4.856,93.994,-36.4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
14088,29,4.962,93.918,-42.7,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
34986,27,1.25,92.893,-46.2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0


In [0]:
from tpot import TPOTClassifier


In [51]:
pip install tpot

Collecting tpot
[?25l  Downloading https://files.pythonhosted.org/packages/ea/9f/813faf5ec7aa95f393a07603abd01fcb925b65ffe95441b25da029a69ff7/TPOT-0.11.1-py3-none-any.whl (75kB)
[K     |████▎                           | 10kB 20.8MB/s eta 0:00:01[K     |████████▋                       | 20kB 1.8MB/s eta 0:00:01[K     |█████████████                   | 30kB 2.6MB/s eta 0:00:01[K     |█████████████████▎              | 40kB 1.7MB/s eta 0:00:01[K     |█████████████████████▋          | 51kB 2.1MB/s eta 0:00:01[K     |██████████████████████████      | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████████████████▎ | 71kB 2.9MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.5MB/s 
Collecting stopit>=1.1.1
  Downloading https://files.pythonhosted.org/packages/35/58/e8bb0b0fb05baf07bbac1450c447d753da65f9701f551dca79823ce15d50/stopit-1.1.2.tar.gz
Collecting update-checker>=0.16
  Downloading https://files.pythonhosted.org/packages/17/c9/ab11855af164d03be0ff4fddd4c4

In [0]:
tpot = TPOTClassifier(subsample = 0.8, verbosity = 2, warm_start=True, early_stop=20, max_time_mins= 60, n_jobs= -2, scoring = 'f1')

In [54]:
tpot.fit(X_trainfin, y_trainfin)

HBox(children=(IntProgress(value=0, description='Optimization Progress', style=ProgressStyle(description_width…

Generation 1 - Current best internal CV score: 0.46147882690168984

60.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GaussianNB(RandomForestClassifier(SelectFromModel(input_matrix, criterion=entropy, max_features=0.5, n_estimators=100, threshold=0.05), bootstrap=True, criterion=gini, max_features=0.8, min_samples_leaf=16, min_samples_split=20, n_estimators=100))


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=20, generations=100,
               max_eval_time_mins=5, max_time_mins=60, memory=None,
               mutation_rate=0.9, n_jobs=-2, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring='f1', subsample=0.8, template=None,
               use_dask=False, verbosity=2, warm_start=True)

In [0]:
predict_tpot = tpot.predict(X_testfin)

In [56]:
print(classification_report(y_testfin, predict_tpot))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     10978
           1       0.44      0.52      0.48      1379

    accuracy                           0.87     12357
   macro avg       0.69      0.72      0.70     12357
weighted avg       0.88      0.87      0.88     12357



In [58]:
print(confusion_matrix(y_testfin, predict_tpot))

[[10054   924]
 [  661   718]]


In [0]:
tpot2 = TPOTClassifier(subsample = 0.8, verbosity = 2, warm_start=True, early_stop=20, max_time_mins= 120, n_jobs= -2, scoring = 'f1', cv = 3)

In [61]:
tpot2.fit(X_trainfin, y_trainfin)

HBox(children=(IntProgress(value=0, description='Optimization Progress', style=ProgressStyle(description_width…

Generation 1 - Current best internal CV score: 0.41399084107985096
Generation 2 - Current best internal CV score: 0.4470416700284065
Generation 3 - Current best internal CV score: 0.4470416700284065
Generation 4 - Current best internal CV score: 0.4470416700284065
Generation 5 - Current best internal CV score: 0.4606549258592689
Generation 6 - Current best internal CV score: 0.4606549258592689

120.49 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GaussianNB(PCA(ExtraTreesClassifier(RFE(input_matrix, criterion=gini, max_features=0.3, n_estimators=100, step=0.1), bootstrap=False, criterion=entropy, max_features=0.7500000000000001, min_samples_leaf=9, min_samples_split=20, n_estimators=100), iterated_power=8, svd_solver=randomized))


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=3,
               disable_update_check=False, early_stop=20, generations=100,
               max_eval_time_mins=5, max_time_mins=120, memory=None,
               mutation_rate=0.9, n_jobs=-2, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring='f1', subsample=0.8, template=None,
               use_dask=False, verbosity=2, warm_start=True)

In [0]:
test2 = tpot2.predict(X_testfin)

In [63]:
print(classification_report(y_testfin, test2))

              precision    recall  f1-score   support

           0       0.93      0.92      0.92     10978
           1       0.42      0.48      0.45      1379

    accuracy                           0.87     12357
   macro avg       0.67      0.70      0.69     12357
weighted avg       0.88      0.87      0.87     12357



In [64]:
print(confusion_matrix(y_testfin, test2))

[[10045   933]
 [  714   665]]


In [0]:
tpot2.export('tpot_exported_pipeline.py')

In [0]:
# Import file management
from google.colab import files

# Download the pipeline for local use
files.download('tpot_exported_pipeline.py')