# Run and evaluate Random Forest models

I built this notebook by running models, looking at the output, then going back and changing parameters as needed. When I'm happy with the output, I save it and move on.

In [48]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from data_pipeline import run_a_model

## Utilities to load and save good models

In [49]:
# %load load_save.py
# Database load and store functions 
# This file is meant to be %load'ed at the top of the
# various model-run notebooks


%load_ext sql

%config SqlMagic.autopandas = True 

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [50]:
%sql postgresql://localhost/bankcalls
    

import os.path
import pickle

from string import Template

In [51]:
def store_results(m):
    filename = m['name'] + '_' + m['strategy'] + '.pkl'
    dirname = '../data/inter/'
    pathname = dirname + filename
    
    count = 0
    while os.path.isfile(pathname):
        pathname = (dirname + 
                    m['name'] + 
                    '_' + 
                    m['strategy'] + 
                    str(count) +
                    '.pkl'
                   )
        count += 1
                    
    f = open(pathname, 'w+b')
    pickle.dump(m, f)
    f.close()
    
    # all the quotes and brackets seem to confuse %sql so I'm templating
    # the command manually
    sqlt = Template("""INSERT 
        INTO test_results(pathname, accuracy, recall, precision, 
                            f1, auc, cm_00, cm_01, cm_10, cm_11) 
        VALUES  ($pg_path, $accuracy, $recall, $precision, 
                            $f1, $auc, $cm_00, $cm_01, $cm_10, $cm_11);""")
    sqlins = sqlt.substitute(pg_path = "'" + pathname + "'",
                    accuracy = m['accuracy'],
                    recall = m['recall'],
                    precision = m['precision'],
                    f1 = m['f1'],
                    auc = m['auc'],
                    cm_00 = m['cm'][0,0],
                    cm_01 = m['cm'][0,1],
                    cm_10 = m['cm'][1,0],
                    cm_11 = m['cm'][1,1]
                   )
    %sql $sqlins
                    
    return pathname
    
        
def load_results(path):
    """Unpickle and return the object stored at ``path``.

    Parameters
    ----------
    path : str
        Path of a pickle file, e.g. one returned by ``store_results``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # project wrote itself.
    # Read-only mode is sufficient, and the context manager closes the file.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Choose columns

My three column sets here are: 

- EDA : The columns that looked most promising during EDA.
- Context : The columns providing calendar and economic context only. No customer data.
- All : Throw all the columns in the dataset into the model 


In [52]:
# --
#  These are not the same as the ones in the Logistic Regression notebook -- I've added
#  back in the highly correlated columns to 'context' and 'all'
# --
# EDA: the columns that looked most promising during EDA.
columns_eda = [
    "job", "contact", "month", "poutcome",
    "cons_price_idx", "cons_conf_idx", "euribor3m",
    "success", "bank_addl_id",
]

# Context: calendar and economic context only, no customer data.
columns_context = [
    "cons_price_idx", "cons_conf_idx", "euribor3m",
    "nr_employed", "emp_var_rate",
    "month", "day_of_week",
    "success", "bank_addl_id",
]

# All: every column in the dataset.
columns_all = [
    "age", "job", "marital", "education",
    "in_default", "housing", "loan", "contact",
    "month", "day_of_week", "campaign",
    "previous", "poutcome",
    "cons_price_idx", "cons_conf_idx", "euribor3m",
    "nr_employed", "emp_var_rate",
    "success", "bank_addl_id",
]

## EDA columns

In [53]:
# Candidate values for the forest settings being tuned.
n_estimators = [200, 300, 400]
oob_score = [True]
# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so use the explicit name.
max_features = ['sqrt']

# Keys use the '<step>__<param>' form -- presumably run_a_model wraps the
# estimator in a pipeline and searches this grid (confirm in data_pipeline).
params = dict(randomforestclassifier__n_estimators=n_estimators,
              randomforestclassifier__oob_score=oob_score,
              randomforestclassifier__max_features=max_features
             )

rfc = RandomForestClassifier(n_jobs=-1, random_state=212)

# using downsampling to tune initial ranges of the RF settings since there
# are more knobs this time
measures = run_a_model('rf_eda', rfc, columns_eda, 'down', params)


In [54]:
# Save the 'rf_eda' / 'down' run: pickles `measures` under ../data/inter/ and
# inserts its metrics into test_results. Re-running this cell adds another
# numbered pickle and another row rather than replacing the first.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_eda_down.pkl'

In [55]:
# Same EDA column set, now with the 'ros' strategy. `params` is not redefined
# in this cell, so the grid built in the previous run cell is reused.
rfc = RandomForestClassifier(n_jobs=-1, random_state=212)
measures = run_a_model('rf_eda', rfc, columns_eda, 'ros', params)

In [56]:
# Persist the 'rf_eda' / 'ros' run (pickle file + test_results row).
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_eda_ros.pkl'

In [57]:
# SMOTE takes a long time, so just using best params from ros: the forest is
# configured directly and None is passed in place of a parameter grid.
# max_features='sqrt' is the explicit name for what 'auto' meant for
# classifiers ('auto' was removed in scikit-learn 1.3).
rfc = RandomForestClassifier(n_jobs=-1, random_state=212, n_estimators=300,
                             oob_score=True, max_features='sqrt')
measures = run_a_model('rf_eda', rfc, columns_eda, 'smote', None)

In [58]:
# Persist the 'rf_eda' / 'smote' run (pickle file + test_results row).
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_eda_smote.pkl'

## Context columns

In [59]:
# Candidate values for the forest settings being tuned.
n_estimators = [200, 300, 400]
oob_score = [True]
# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so use the explicit name.
max_features = ['sqrt']

params = dict(randomforestclassifier__n_estimators=n_estimators,
              randomforestclassifier__oob_score=oob_score,
              randomforestclassifier__max_features=max_features
             )

rfc = RandomForestClassifier(n_jobs=-1, random_state=212)

# Context column set with the 'ros' strategy.
measures = run_a_model('rf_context', rfc, columns_context, 'ros', params)


In [60]:
# Persist the 'rf_context' / 'ros' run (pickle file + test_results row).
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_context_ros.pkl'

In [63]:
# All columns

In [64]:
# Candidate values for the forest settings being tuned.
n_estimators = [200, 300, 400]
oob_score = [True]
# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so use the explicit name.
max_features = ['sqrt']

params = dict(randomforestclassifier__n_estimators=n_estimators,
              randomforestclassifier__oob_score=oob_score,
              randomforestclassifier__max_features=max_features
             )

rfc = RandomForestClassifier(n_jobs=-1, random_state=212)

# Full column set with the 'ros' strategy.
measures = run_a_model('rf_all', rfc, columns_all, 'ros', params)


In [66]:
# Persist the 'rf_all' / 'ros' run (pickle file + test_results row).
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_all_ros.pkl'

In [69]:
# Grid for the SMOTE run on all columns, shifted towards more trees.
n_estimators = [400, 500]
# Restated here instead of inherited from an earlier cell, so this cell does
# not depend on leftover kernel state.
oob_score = [True]
# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so use the explicit name.
max_features = ['sqrt']

params = dict(randomforestclassifier__n_estimators=n_estimators,
              randomforestclassifier__oob_score=oob_score,
              randomforestclassifier__max_features=max_features
             )
rfc = RandomForestClassifier(n_jobs=-1, random_state=212)

measures = run_a_model('rf_all', rfc, columns_all, 'smote', params)

In [70]:
# Persist the 'rf_all' / 'smote' run (pickle file + test_results row).
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/rf_all_smote.pkl'

In [71]:
# Top ten stored runs ranked by F1. The table also holds runs saved
# elsewhere (see the logi-* rows in the output below).
%sql select * from test_results order by f1 desc limit 10; 

 * postgresql://localhost/bankcalls
10 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/rf_all_ros.pkl,0.99393,1.0,0.948875,0.973767,0.999847,29038,200,0,3712
1,../data/inter/rf_all_smote.pkl,0.994052,0.972522,0.974622,0.973571,0.99943,29144,94,102,3610
2,../data/inter/rf_eda_ros.pkl,0.865615,0.721713,0.44106,0.547517,0.887394,25843,3395,1033,2679
3,../data/inter/rf_context_ros.pkl,0.861851,0.62042,0.422879,0.502948,0.817078,26095,3143,1409,2303
4,../data/inter/rf_eda_down.pkl,0.806798,0.72306,0.33458,0.457474,0.846988,23900,5338,1028,2684
5,../data/inter/logi-all-ros_ros.pkl,0.812079,0.646282,0.329624,0.436579,0.795784,24359,4879,1313,2399
6,../data/inter/logi-eda-ros_ros.pkl,0.805766,0.651131,0.321324,0.430301,0.794002,24133,5105,1295,2417
7,../data/inter/logi-eda-smote_smote.pkl,0.802883,0.649784,0.317076,0.426186,0.790426,24043,5195,1300,2412
8,../data/inter/rf_eda_smote.pkl,0.761487,0.724138,0.282264,0.406196,0.776069,22403,6835,1024,2688
9,../data/inter/logi-eda-l1_ros.pkl,0.721093,0.719558,0.246858,0.367603,0.746322,21089,8149,1041,2671


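Random forests beat the logistic regression models: the top five runs by F1 are all random forests. The two `rf_all` runs look almost too good (F1 ≈ 0.97, AUC ≈ 0.999, and a recall of 1.0 for `rf_all_ros`), so they might be overfitting. This is worth confirming on held-out data before trusting the numbers.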