# Run and evaluate GradientBoostingClassifier models

I built this notebook by running models, looking at the output, then going back and changing parameters as needed. When I'm happy with the output, I save it and move on.

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

from data_pipeline import run_a_model

## Utilities to load and save good models

In [2]:
# %load load_save.py
# Database load and store functions 
# This file is meant to be %load'ed at the top of the
# various model-run notebooks


# Register the %sql / %%sql magics (ipython-sql extension) for this kernel
%load_ext sql

# autopandas: have %sql queries return pandas DataFrames instead of raw result sets
%config SqlMagic.autopandas = True 

In [3]:
# Connect to the local `bankcalls` PostgreSQL database; the %sql calls below run against it
%sql postgresql://localhost/bankcalls
    

import os.path
import pickle

from string import Template

In [4]:
def store_results(m):
    filename = m['name'] + '_' + m['strategy'] + '.pkl'
    dirname = '../data/inter/'
    pathname = dirname + filename
    
    count = 0
    while os.path.isfile(pathname):
        pathname = (dirname + 
                    m['name'] + 
                    '_' + 
                    m['strategy'] + 
                    str(count) +
                    '.pkl'
                   )
        count += 1
                    
    f = open(pathname, 'w+b')
    pickle.dump(m, f)
    f.close()
    
    # all the quotes and brackets seem to confuse %sql so I'm templating
    # the command manually
    sqlt = Template("""INSERT 
        INTO test_results(pathname, accuracy, recall, precision, 
                            f1, auc, cm_00, cm_01, cm_10, cm_11) 
        VALUES  ($pg_path, $accuracy, $recall, $precision, 
                            $f1, $auc, $cm_00, $cm_01, $cm_10, $cm_11);""")
    sqlins = sqlt.substitute(pg_path = "'" + pathname + "'",
                    accuracy = m['accuracy'],
                    recall = m['recall'],
                    precision = m['precision'],
                    f1 = m['f1'],
                    auc = m['auc'],
                    cm_00 = m['cm'][0,0],
                    cm_01 = m['cm'][0,1],
                    cm_10 = m['cm'][1,0],
                    cm_11 = m['cm'][1,1]
                   )
    %sql $sqlins
                    
    return pathname
    
        
def load_results(path):
    """Load a pickled object from ``path`` and return it.

    Parameters
    ----------
    path : str
        Location of the pickle file, e.g. a path returned by store_results.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code. Only load files
    # written by this project, never pickles from an untrusted source.
    # Read-only binary mode is enough here (the file is never written to),
    # and the context manager guarantees the handle is closed.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Choose columns

My three column sets here are: 

- EDA : The columns that looked most promising during EDA.
- Context : The columns providing calendar and economic context only. No customer data.
- All : Throw all the columns in the dataset into the model 


In [5]:
# --
#  NOTE: these column sets differ from the ones in the Logistic Regression
#  notebook -- the highly correlated columns have been added back in to
#  'context' and 'all'.
#
#  Every set ends with 'success' and 'bank_addl_id' (presumably the target
#  and the row id -- confirm against data_pipeline).
# --

# EDA: the columns that looked most promising during EDA
columns_eda = [
    'job', 'contact', 'month', 'poutcome',
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'success', 'bank_addl_id',
]

# Context: calendar and economic indicators only, no customer data
columns_context = [
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'nr_employed', 'emp_var_rate',
    'month', 'day_of_week',
    'success', 'bank_addl_id',
]

# All: every column in the dataset
columns_all = [
    'age', 'job', 'marital', 'education',
    'in_default', 'housing', 'loan', 'contact',
    'month', 'day_of_week', 'campaign',
    'previous', 'poutcome',
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'nr_employed', 'emp_var_rate',
    'success', 'bank_addl_id',
]

## Downsampling 
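I started with the downsampled training set so that there are fewer rows to fit while searching for good hyperparameter values. The later cells in this section rerun the search with the `ros` strategy on all three column sets.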

In [6]:
# Hyperparameter grid for the first tuning pass on the EDA columns.
n_estimators = [400, 600, 800, 1000]
subsample = [0.97, 1.0]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
# GradientBoostingClassifier it meant sqrt(n_features), so 'sqrt' is the
# same setting and is accepted by both old and new versions.
max_features = ['sqrt']

# Keys carry the 'gradientboostingclassifier__' prefix -- presumably because
# run_a_model wraps the classifier in a pipeline step of that name (confirm
# in data_pipeline).
params = dict(gradientboostingclassifier__n_estimators=n_estimators,
              gradientboostingclassifier__subsample=subsample,
              gradientboostingclassifier__max_features=max_features
             )

# Fixed random_state keeps the run repeatable
clf = GradientBoostingClassifier(random_state=212)

# downsampling gives a smaller dataset for initial tuning
measures = run_a_model('gb_eda', clf, columns_eda, 'down', params)


In [7]:
# Pickle the downsampled EDA run and insert its metrics row into test_results
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/gb_eda_down.pkl'

In [8]:
# Hyperparameter grid shared by the three 'ros' runs below.
n_estimators = [200, 400, 600]
subsample = [0.97, 0.99]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
# GradientBoostingClassifier it meant sqrt(n_features), so 'sqrt' is the
# same setting and is accepted by both old and new versions.
max_features = ['sqrt']
max_depth = [3, 4, 5]

params = dict(gradientboostingclassifier__n_estimators=n_estimators,
              gradientboostingclassifier__subsample=subsample,
              gradientboostingclassifier__max_features=max_features,
              gradientboostingclassifier__max_depth=max_depth
             )

# One run per column set, each with its own fresh classifier instance and the
# same seed. 'ros' is the resampling strategy name understood by run_a_model
# (presumably random over-sampling -- confirm in data_pipeline).
clf = GradientBoostingClassifier(random_state=212)
eda_measures = run_a_model('gb_eda', clf, columns_eda, 'ros', params)

clf = GradientBoostingClassifier(random_state=212)
all_measures = run_a_model('gb_all', clf, columns_all, 'ros', params)

clf = GradientBoostingClassifier(random_state=212)
con_measures = run_a_model('gb_context', clf, columns_context, 'ros', params)

In [9]:
# Persist the EDA-columns 'ros' run (pickle file + test_results row)
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/gb_eda_ros.pkl'

In [10]:
# Persist the all-columns 'ros' run (pickle file + test_results row)
store_results(all_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/gb_all_ros.pkl'

In [11]:
# Persist the context-columns 'ros' run (pickle file + test_results row)
store_results(con_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/gb_context_ros.pkl'

# SMOTE 

We're only going to try a single hyperparameter combination here (one stab in the dark) rather than a grid, since we are running out of time.

In [15]:
# Single hyperparameter combination (one value per list) for the SMOTE runs.
n_estimators = [500]
subsample = [0.98]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
# GradientBoostingClassifier it meant sqrt(n_features), so 'sqrt' is the
# same setting and is accepted by both old and new versions.
max_features = ['sqrt']
max_depth = [3]

params = dict(gradientboostingclassifier__n_estimators=n_estimators,
              gradientboostingclassifier__subsample=subsample,
              gradientboostingclassifier__max_features=max_features,
              gradientboostingclassifier__max_depth=max_depth
             )

# One run per column set with the 'smote' strategy, each with its own fresh
# classifier instance and the same seed. These reuse the names eda_measures /
# all_measures / con_measures, replacing the earlier 'ros' results in memory.
clf = GradientBoostingClassifier(random_state=212)
eda_measures = run_a_model('gb_eda', clf, columns_eda, 'smote', params)

clf = GradientBoostingClassifier(random_state=212)
all_measures = run_a_model('gb_all', clf, columns_all, 'smote', params)

clf = GradientBoostingClassifier(random_state=212)
con_measures = run_a_model('gb_context', clf, columns_context, 'smote', params)

In [16]:
# Persist all three SMOTE runs (pickle file + test_results row each).
# Only the last call's return value is shown as the cell output.
store_results(eda_measures)
store_results(all_measures)
store_results(con_measures)

 * postgresql://localhost/bankcalls
1 rows affected.
 * postgresql://localhost/bankcalls
1 rows affected.
 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/gb_context_smote.pkl'

In [18]:
# Leaderboard: every stored run in test_results, best f1 first
%sql select * from test_results order by f1 desc; 

 * postgresql://localhost/bankcalls
24 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/rf_all_ros.pkl,0.99393,1.0,0.948875,0.973767,0.999847,29038,200,0,3712
1,../data/inter/rf_all_smote.pkl,0.994052,0.972522,0.974622,0.973571,0.99943,29144,94,102,3610
2,../data/inter/rf_all10_ros.pkl,0.880455,0.786099,0.48128,0.597033,0.94119,26093,3145,794,2918
3,../data/inter/rf_all10_smote.pkl,0.888801,0.608836,0.505367,0.552297,0.876116,27026,2212,1452,2260
4,../data/inter/rf_eda_ros.pkl,0.865615,0.721713,0.44106,0.547517,0.887394,25843,3395,1033,2679
5,../data/inter/gb_context_ros.pkl,0.862246,0.619612,0.423807,0.503337,0.817025,26111,3127,1412,2300
6,../data/inter/rf_context_ros.pkl,0.861851,0.62042,0.422879,0.502948,0.817078,26095,3143,1409,2303
7,../data/inter/gb_all_smote.pkl,0.893293,0.478987,0.529167,0.502828,0.812581,27656,1582,1934,1778
8,../data/inter/gb_all_ros.pkl,0.842155,0.694235,0.387927,0.497731,0.86218,25172,4066,1135,2577
9,../data/inter/gb_eda_smote.pkl,0.853232,0.633621,0.403569,0.493082,0.823148,25762,3476,1360,2352
