# Bank Predictions

# Step 3: Develop

In [1]:
# Importing general packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sp
import sys
sys.path.insert(0, './scripts/')

from scripts.helpers import load_data, get_smotenc, get_upsample, bank_profit, get_downsample
from scripts.Preprocessing import Preprocessing
from scripts.ModelTuner import ModelTunerCV

%matplotlib inline

In [2]:
# Importing all sci-kit learn packages
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

I need to set aside a portion of the data to evaluate my final model on. This will be saved in a file called *bank_holdout.csv* in the *holdout_data* directory.

In [3]:
# Load the cleaned data set; verbose=True prints its shape, column types and first rows.
bank_data_full = load_data(
    './cleaned_data/',
    'bank-full.csv',
    verbose=True,
    index=0,
)

# Hold part of the data back for the final evaluation; the seed keeps the split repeatable.
bank_data_tuning, bank_data_holdout = train_test_split(
    bank_data_full,
    random_state=849,
)

--------------------------------bank-full shape---------------------------------
                                  (36013, 17)                                   


----------------------------bank-full's column types----------------------------
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object


---------------------------bank-full first five rows----------------------------
     job  marital    education default  housing    loan       contact month  \
0  other  married     basic.4y      no  not yes      no  not cellular   may   
1  other  married  high.school  not no  not yes      no  n

In [4]:
%%bash
# Start from an empty holdout_data directory: drop any previous copy, then recreate it.
if [ -d "holdout_data" ]; then
    rm -R "holdout_data"
fi
mkdir "holdout_data"

In [5]:
# Write both halves of the split into the holdout_data directory.
bank_data_holdout.to_csv('./holdout_data/bank_holdout.csv')
bank_data_tuning.to_csv('./holdout_data/bank_train.csv')

In [6]:
# Preview the first rows of the tuning split.
bank_data_tuning.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
19687,admin.,married,university.degree,not no,yes,no,cellular,aug,thu,-1,nonexistent,1.4,93.444,-36.1,4.968,5228.1,0
3702,technician,married,university.degree,no,not yes,no,not cellular,may,fri,-1,nonexistent,1.1,93.994,-36.4,4.859,5191.0,0
4486,technician,married,professional.course,no,not yes,no,not cellular,may,tue,-1,nonexistent,1.1,93.994,-36.4,4.856,5191.0,0
23951,technician,divorced,professional.course,not no,yes,not no,cellular,aug,fri,-1,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
7482,other,single,high.school,no,not yes,no,not cellular,may,fri,-1,nonexistent,1.1,93.994,-36.4,4.864,5191.0,0


In [7]:
# Split the tuning data into train and tune partitions, keeping the features
# (every column except 'y') separate from the target column 'y'.
clients_train, clients_tune, subscribed_train, subscribed_tune = train_test_split(
    bank_data_tuning.drop('y', axis=1), bank_data_tuning['y'], random_state=536)

I need to make the scoring object to be passed into my hyperparameter search and cross-validation score calculations. scikit-learn provides a convenient function, `make_scorer`, for users who want to build their own scorer, and it is exactly what I need to turn my profit function into a viable scorer.

In [8]:
# Wrap bank_profit as a scorer; needs_proba=True asks for predicted probabilities
# rather than predicted class labels to be handed to bank_profit.
profit_score = make_scorer(bank_profit, needs_proba=True)

## Engineering Features
* Ensure data is ready for modeling
* Create any new features to enhance the model

I've made a few changes to my Preprocessing class so that it utilizes more of scikit-learn's methods for feature transformations. These changes have made my code more robust and easier to read overall.

In [9]:
# Names prefixed with p_ hold preprocessed data.

# Preprocessor fitted on the training split only, then applied unchanged to the tune split.
p = Preprocessing(classification=True)
p_clients_train, p_subscribed_train = p.fit_transform(
    clients_train, y=subscribed_train)
p_clients_tune, p_subscribed_tune = p.transform(
    clients_tune, subscribed_tune)

# The cross-validation data gets its own preprocessor, so fitting on the full tuning
# set does not overwrite what `p` learned from the training split.
# NOTE(review): the CV folds are still preprocessed with statistics from the whole
# tuning set; confirm whether that is acceptable for the reported scores.
p_cv = Preprocessing(classification=True)
p_clients_cv, p_subscribed_cv = p_cv.fit_transform(
    bank_data_tuning.drop('y', axis=1), bank_data_tuning['y'])

p_clients_train.head()

Unnamed: 0,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,...,housing_not yes,housing_yes,poutcome_nonexistent,poutcome_not nonexistent,job_admin.,job_blue-collar,job_other,job_technician,loan_no,loan_not no
29029,-0.348419,-1.19341,-0.850611,-1.423553,-1.278385,-0.946804,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
536,-0.348419,0.651892,0.7456,0.882558,0.714621,0.330398,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
21094,-0.348419,0.842785,-0.209695,0.947216,0.776397,0.846003,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
36726,-0.348419,-1.893352,-1.045144,-0.065749,-1.357481,-1.265062,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
29680,-0.348419,-1.19341,-0.850611,-1.423553,-1.278385,-0.946804,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Creating Models
* Creating and tuning models ([Logistic Regression](#Creating-Models,-Logistic-Regression), [Gradient Boosting](#Creating-Models,-Gradient-Boosting), [Random Forest](#Creating-Models,-Random-Forest))

### Creating Models, Baseline

Before anything, I want a baseline against which to judge how good my classifiers end up being. To get one, I will create a plain LogisticRegression model without changing any of the hyperparameters. This is a quick and dirty way to create a classifier of the kind that might be used in an environment unfamiliar with Data Science.

In [10]:
# Baseline: logistic regression with default hyperparameters, reported as the
# mean profit score over 5 cross-validation folds.
lr = LogisticRegression()
cross_val_score(lr, p_clients_cv, p_subscribed_cv, scoring=profit_score, cv=5).mean()



0.5859457821853848

A quick note: for this project, I will be utilizing RandomizedSearchCV instead of tuning the models by hand. I want to eventually automate the model tuning process, and this method seemed to be a better fit than the more exhaustive, but far more time-consuming, GridSearchCV.

### Creating Models, Logistic Regression

In [11]:
# Search space: C is drawn uniformly from [0.001, 5.001]; the solver is picked from the list.
params = {
    'C': sp.uniform(0.001, 5),
    'solver': ['newton-cg', 'sag', 'lbfgs'],
}

# The 'sag' solver shuffles the data, so the estimator is seeded (as the gradient
# boosting and random forest models in this notebook are) to keep repeated runs comparable.
lr_tuner = ModelTunerCV(LogisticRegression(random_state=7321), profit_score, cv=5)

lr_tuner.tune(
    X=p_clients_cv,
    y=p_subscribed_cv,
    param_grid=params,
    method='random',
)
print(f"Best Params: {lr_tuner.best_params_}")
print(f"Best Profit Score: {lr_tuner.best_score_}")

Best Params: {'C': 1.413801090996802, 'solver': 'lbfgs'}
Best Profit Score: 0.5872713913239094




### Creating Models, Gradient Boosting

In [12]:
# Gradient boosting with default hyperparameters (seeded), reported as the
# mean profit score over 3 cross-validation folds.
gbc = GradientBoostingClassifier(random_state=985)
cross_val_score(gbc, p_clients_cv, p_subscribed_cv, scoring=profit_score, cv=3).mean()

0.6010280557861455

In [13]:
# Sampling distributions for the randomized search over gradient boosting.
# sp.uniform(loc, scale) covers [loc, loc + scale]; sp.randint(low, high) excludes high.
gbc_param_dist = dict(
    learning_rate=sp.uniform(.0001, .2),   # [.0001, .2001]
    subsample=sp.uniform(0.5, .5),         # [.5, 1]
    min_samples_leaf=sp.randint(1, 5),     # 1 through 4
    max_depth=sp.randint(2, 15),           # 2 through 14
    max_features=sp.uniform(.01, .99),     # [.01, 1]
)

# Run 15 random draws from the distributions, each scored with 3-fold CV.
gbc_tuner = ModelTunerCV(gbc, profit_score, cv=3)
gbc_tuner.tune(
    X=p_clients_cv,
    y=p_subscribed_cv,
    param_grid=gbc_param_dist,
    method='random',
    n_iter=15,
)
print(f"Best Params: {gbc_tuner.best_params_}")
print(f"Best Profit Score: {gbc_tuner.best_score_}")

Best Params: {'learning_rate': 0.1020804819179213, 'max_depth': 5, 'max_features': 0.21603778511971944, 'min_samples_leaf': 2, 'subsample': 0.7283105544694686}
Best Profit Score: 0.5995901208929921


In [14]:
# Hand-tuned gradient boosting hyperparameters, scored with the same 3-fold CV
# so the result can be compared with the randomized search.
gbc = GradientBoostingClassifier(random_state=985, learning_rate=.11, subsample=.8, max_depth=5, min_samples_leaf=8)
cross_val_score(gbc, p_clients_cv, p_subscribed_cv, scoring=profit_score, cv=3).mean()

0.5985529068481122

To test how well the randomized search can select good-enough parameters, I tuned the hyperparameters for Gradient Boosting by hand as well as with the randomized search. The randomized search took about 3 minutes, while tuning manually took me a few hours, mostly spent waiting for the cross-validation scores to be calculated while I worked on other things like updating documentation (this may be less of a time sink on your machine, but mine is rather old and slow with this amount of computation). At the end of the day, there was barely any difference in the profit scores between the hand-tuned model and the randomized search. The time saved by the randomized search is definitely worth the potential drop in score due to uncertainty in tuning. At the very least, randomized search will almost certainly be "good enough" for production. The small difference might also be due to the choice of model, so I'll repeat the search with Random Forest, which depends heavily on its hyperparameters.

### Creating Models, Random Forest

In [15]:
# Random forest with 100 trees and otherwise default hyperparameters (seeded),
# reported as the mean profit score over 3 cross-validation folds.
rf = RandomForestClassifier(random_state=4215, n_estimators=100)
cross_val_score(rf, p_clients_cv, p_subscribed_cv, scoring=profit_score, cv=3).mean()

0.5447568326912399

In [16]:
# Sampling distributions for the randomized search over the random forest.
# sp.uniform(loc, scale) covers [loc, loc + scale]; sp.randint(low, high) excludes high.
rf_param_dist = dict(
    max_depth=sp.randint(7, 50),              # 7 through 49
    min_samples_leaf=sp.uniform(.01, .49),    # [.01, .5]
    max_features=sp.uniform(.5, .5),          # [.5, 1]
)

# Run 15 random draws from the distributions, each scored with 5-fold CV.
rf_tuner = ModelTunerCV(rf, profit_score, cv=5)
rf_tuner.tune(
    X=p_clients_cv,
    y=p_subscribed_cv,
    param_grid=rf_param_dist,
    method='random',
    n_iter=15,
)
print(f"Best Params: {rf_tuner.best_params_}")
print(f"Best Profit Score: {rf_tuner.best_score_}")

Best Params: {'max_depth': 27, 'max_features': 0.7162708532243822, 'min_samples_leaf': 0.025569653025819587}
Best Profit Score: 0.5908728886090605


## Testing Models
* Doing a 5-fold cross validation on models

In [17]:
def perform_cv5(model, params, X, y):
    """Return the mean 5-fold cross-validated profit score for `model` with `params`.

    The hyperparameters are applied to a clone of `model`, so the estimator
    passed in by the caller is left untouched.

    Parameters
    ----------
    model : estimator
        Estimator to evaluate; must support `set_params`.
    params : dict
        Hyperparameters forwarded to `set_params` on the clone.
    X, y
        Features and target passed straight to `cross_val_score`.

    Returns
    -------
    The mean of the five fold scores computed with `profit_score`.
    """
    # clone() yields an unfitted copy with the same parameters; configuring the copy
    # avoids silently changing the caller's model as a side effect of scoring.
    candidate = clone(model).set_params(**params)
    return cross_val_score(candidate, X, y, scoring=profit_score, cv=5).mean()

In [18]:
# Score each model with its tuned parameters through the same 5-fold helper,
# so the three results are directly comparable.
print(f"Logistic Regression: {perform_cv5(lr, lr_tuner.best_params_, p_clients_cv, p_subscribed_cv)}")
print(f"Gradient Boosting: {perform_cv5(gbc, gbc_tuner.best_params_, p_clients_cv, p_subscribed_cv)}")
print(f"Random Forest: {perform_cv5(rf, rf_tuner.best_params_, p_clients_cv, p_subscribed_cv)}")



Logistic Regression: 0.5857732004055214
Gradient Boosting: 0.5988379987809191
Random Forest: 0.5918078974929023


## Selecting the Best Model
* Selecting the model with the highest score for production

By an incredibly small margin, Gradient Boosting has the best profit score after averaging the 5-fold cross validation scores. The parameters will be listed below to be used in the model in the final steps.

In [19]:
# Rebuild the selected model: a seeded gradient boosting classifier configured with
# the best parameters found by the randomized search, then list its full parameter set.
gbc = GradientBoostingClassifier(random_state=985)
gbc.set_params(**gbc_tuner.best_params_)
gbc.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1020804819179213,
 'loss': 'deviance',
 'max_depth': 5,
 'max_features': 0.21603778511971944,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 985,
 'subsample': 0.7283105544694686,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}