# Run and evaluate models

I built this notebook by running models, looking at the output, then going back and changing parameters as needed. When I'm happy with the output, I save it and move on.

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn import metrics

from data_pipeline import run_a_model

## Utilities to load and save good models

In [88]:
# %load load_save.py
# Database load and store functions 
# This file is meant to be %load'ed at the top of the
# various model-run notebooks


# Load the ipython-sql extension that provides the %sql magic.
%load_ext sql

# Have %sql hand results back as pandas DataFrames (later cells call
# DataFrame methods such as .round() on them).
%config SqlMagic.autopandas = True 

# Connect to the local PostgreSQL database that holds the test_results table.
%sql postgresql://localhost/bankcalls
    

import os.path
import pickle

from string import Template

def store_results(m):
    filename = m['name'] + '_' + m['strategy'] + '.pkl'
    dirname = '../data/inter/'
    pathname = dirname + filename
    
    count = 0
    while os.path.isfile(pathname):
        pathname = (dirname + 
                    m['name'] + 
                    '_' + 
                    m['strategy'] + 
                    str(count) +
                    '.pkl'
                   )
        count += 1
                    
    f = open(pathname, 'w+b')
    pickle.dump(m, f)
    f.close()
    
    # all the quotes and brackets seem to confuse %sql so I'm templating
    # the command manually
    sqlt = Template("""INSERT 
        INTO test_results(pathname, accuracy, recall, precision, 
                            f1, auc, cm_00, cm_01, cm_10, cm_11) 
        VALUES  ($pg_path, $accuracy, $recall, $precision, 
                            $f1, $auc, $cm_00, $cm_01, $cm_10, $cm_11);""")
    sqlins = sqlt.substitute(pg_path = "'" + pathname + "'",
                    accuracy = m['accuracy'],
                    recall = m['recall'],
                    precision = m['precision'],
                    f1 = m['f1'],
                    auc = m['auc'],
                    cm_00 = m['cm'][0,0],
                    cm_01 = m['cm'][0,1],
                    cm_10 = m['cm'][1,0],
                    cm_11 = m['cm'][1,1]
                   )
    %sql $sqlins
                    
    return pathname
    
        
def load_results(path):
    """Load a results object from a pickle file.

    Parameters
    ----------
    path : str
        Path to a pickle file, e.g. one returned by ``store_results``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load files this project wrote itself.
    # Read-only binary mode: loading should not require write permission,
    # and the context manager closes the handle once the load finishes.
    with open(path, 'rb') as f:
        return pickle.load(f)

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# Choose columns

My three column sets here are: 

- EDA : The columns that looked most promising during EDA.
- Context : The columns providing calendar and economic context only. No customer data.
- All : Throw all the columns in the dataset into the model 


In [9]:

# Every set below ends with 'success' and 'bank_addl_id'
# (presumably the target label and the row id -- confirm in data_pipeline).

# EDA: the columns that looked most promising during EDA.
columns_eda = [
    'job',
    'contact',
    'month',
    'poutcome',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'success',
    'bank_addl_id',
]

# Context: calendar and economic context only, no customer data.
columns_context = [
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'month',
    'day_of_week',
    'success',
    'bank_addl_id',
]

# All: every column used from the dataset.
columns_all = [
    'age',
    'job',
    'marital',
    'education',
    'in_default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'campaign',
    'previous',
    'poutcome',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'success',
    'bank_addl_id',
]

# Logistic regression

## EDA columns, Random Over Sampling

In [17]:
# Candidate values for C: 10 points log-spaced from 10**0 to 10**3.
Cs = np.logspace(0, 3, 10)
# The key uses the `<step>__<param>` form; presumably run_a_model searches
# over it in a pipeline with a 'logisticregression' step -- confirm in
# data_pipeline.
params = dict(logisticregression__C=Cs)

# lbfgs solver with the estimator's default penalty; fixed seed.
lr = LogisticRegression(solver='lbfgs', 
                          random_state=33, 
                          max_iter=100
                          )
# EDA column set with the 'ros' (random over sampling) strategy.
# The result is handed to store_results in the next cell.
eda_measures = run_a_model('logi-eda-ros', lr, columns_eda, 'ros', params)


In [19]:
# Pickle the results and insert a metrics row into test_results.
# Not idempotent: each re-run adds another row and a numbered pickle file.
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-ros_ros.pkl'

In [22]:
# Candidate values for C: 10 points log-spaced from 10**-3 to 10**3.
Cs = np.logspace(-3, 3, 10)
params = dict(logisticregression__C=Cs)

# L1-penalised variant of the previous model, fitted with liblinear.
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=33, max_iter=100)
# NOTE: rebinds eda_measures, replacing the 'logi-eda-ros' results in memory.
eda_measures = run_a_model('logi-eda-ros-l1', lr, columns_eda, 'ros', params)

In [23]:
# Save the L1 run; re-running this cell inserts a duplicate row.
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-ros-l1_ros.pkl'

In [24]:
# Show every stored result so far (one row per store_results call).
%sql select * from test_results;

 * postgresql://localhost/bankcalls
2 rows affected.


pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
../data/inter/logi-eda-ros_ros.pkl,0.819028831562974,0.635775862068966,0.338545402381294,0.441823457830197,0.793019090244214,24627,4611,1352,2360
../data/inter/logi-eda-ros-l1_ros.pkl,0.81948406676783,0.635237068965517,0.339182968929804,0.442235558889722,0.793022803453406,24644,4594,1354,2358


## EDA columns, SMOTENC over sampling

In [28]:
# Candidate values for C: 10 points log-spaced from 10**-1 to 10**2.
Cs = np.logspace(-1, 2, 10)
params = dict(logisticregression__C=Cs)

# Same lbfgs estimator settings as the random-over-sampling run.
lr = LogisticRegression(solver='lbfgs', 
                          random_state=33, 
                          max_iter=100
                          )
# Only the resampling strategy changes here: 'smote' instead of 'ros'.
eda_measures = run_a_model('logi-eda-smote', lr, columns_eda, 'smote', params)


In [30]:
# Persist the SMOTE run (pickle file + row in test_results).
# Running this cell again stores a second copy.
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-smote_smote.pkl'

In [31]:
# Candidate values for C: 10 points log-spaced from 10**-3 to 10**3.
Cs = np.logspace(-3, 3, 10)
params = dict(logisticregression__C=Cs)

# L1-penalised estimator fitted with liblinear.
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=33, max_iter=100)
# NOTE: the name 'logi-eda-l1' carries no resampling tag; only the strategy
# suffix added by store_results marks the saved file as a SMOTE run.
eda_measures = run_a_model('logi-eda-l1', lr, columns_eda, 'smote', params)

In [33]:
# Save the L1 + SMOTE run; not idempotent (adds a row on every execution).
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-l1_smote.pkl'

In [34]:
# List all stored results to compare the ROS and SMOTE runs.
%sql select * from test_results;

 * postgresql://localhost/bankcalls
4 rows affected.


pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
../data/inter/logi-eda-ros_ros.pkl,0.819028831562974,0.635775862068966,0.338545402381294,0.441823457830197,0.793019090244214,24627,4611,1352,2360
../data/inter/logi-eda-ros-l1_ros.pkl,0.81948406676783,0.635237068965517,0.339182968929804,0.442235558889722,0.793022803453406,24644,4594,1354,2358
../data/inter/logi-eda-smote_smote.pkl,0.800667678300455,0.652478448275862,0.314545454545455,0.424465474938661,0.79048115322437,23960,5278,1290,2422
../data/inter/logi-eda-l1_smote.pkl,0.800667678300455,0.652478448275862,0.314545454545455,0.424465474938661,0.790485879964607,23960,5278,1290,2422


At first glance, it does not appear that the slower SMOTENC resampling method added any benefit to the LogisticRegression model. But I am impressed by the increase in the "true positive" corner of the confusion matrix: it found more of those needles in the haystack (higher recall) despite not having the higher f1 value. The area under the curve metric is also very similar. It will be interesting to run these on the holdout set at the end.

## Context columns only

### Random Over Sampling 

In [37]:
# Candidate values for C: 10 points log-spaced from 10**-3 to 10**1.
Cs = np.logspace(-3, 1, 10)
params = dict(logisticregression__C=Cs)

lr = LogisticRegression(solver='lbfgs', 
                          random_state=33, 
                          max_iter=100
                          )
# Context-only columns with random over sampling. From here on results
# are kept in `measures` rather than `eda_measures`.
measures = run_a_model('logi-context', lr, columns_context, 'ros', params)


In [39]:
# Write the context-only run to disk and to test_results.
# Re-executing appends another row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-context_ros.pkl'

In [40]:
# Candidate values for C: 10 points log-spaced from 10**-2 to 10**2.
Cs = np.logspace(-2, 2, 10)
params = dict(logisticregression__C=Cs)

# L1-penalised counterpart of the context-only model.
lr = LogisticRegression(solver='liblinear',
                          penalty='l1',
                          random_state=33, 
                          max_iter=100
                          )
# Rebinds `measures`; the previous run is only available from its pickle.
measures = run_a_model('logi-context-l1', lr, columns_context, 'ros', params)

In [42]:
# Save the L1 context-only run; each execution inserts a new row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-context-l1_ros.pkl'

In [49]:
# Capture the results table in `df` and display it rounded to four
# decimals for easier side-by-side reading.
df = %sql select * from test_results;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
6 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-ros_ros.pkl,0.819,0.6358,0.3385,0.4418,0.793,24627,4611,1352,2360
1,../data/inter/logi-eda-ros-l1_ros.pkl,0.8195,0.6352,0.3392,0.4422,0.793,24644,4594,1354,2358
2,../data/inter/logi-eda-smote_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
3,../data/inter/logi-eda-l1_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
4,../data/inter/logi-context_ros.pkl,0.8118,0.625,0.3254,0.428,0.7844,24429,4809,1392,2320
5,../data/inter/logi-context-l1_ros.pkl,0.8118,0.625,0.3254,0.428,0.7843,24429,4809,1392,2320


For a model with so few columns, it's really not that much of a loss of predictive power! 

## All the columns 

### Random over sampling

In [54]:
# Candidate values for C: 10 points log-spaced from 10**0 to 10**4.
Cs = np.logspace(0, 4, 10)
params = dict(logisticregression__C=Cs)

# max_iter is 300 here versus 100 for the smaller column sets
# (presumably to let lbfgs converge with more features -- confirm).
lr = LogisticRegression(solver='lbfgs', 
                       random_state=33,
                       max_iter=300)
# Full column set with random over sampling.
measures = run_a_model('logi-all-ros', lr, columns_all, 'ros', params)

In [56]:
# Persist the all-columns ROS run. Not idempotent: re-running duplicates it.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all-ros_ros.pkl'

In [58]:
# Candidate values for C: 10 points log-spaced from 10**-2 to 10**2.
Cs = np.logspace(-2, 2, 10)
params = dict(logisticregression__C=Cs)

# L1-penalised estimator on the full column set, fitted with liblinear.
lr = LogisticRegression(solver='liblinear', 
                        penalty='l1', 
                        random_state=33, 
                        max_iter=300)
# NOTE: the name has no 'ros' tag; the strategy argument is what selects
# random over sampling.
measures = run_a_model('logi-all-l1', lr, columns_all, 'ros', params)

In [61]:
# Save the all-columns L1 run; running again adds a duplicate row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all-l1_ros.pkl'

### SMOTE-NC 

In [79]:
# Candidate values for C: 12 points log-spaced from 10**-4 to 10**1.
Cs = np.logspace(-4,1,12)
params = dict(logisticregression__C=Cs)

# max_iter raised again, to 500, for the SMOTE run on all columns.
lr = LogisticRegression(solver='lbfgs',
                       random_state=33,
                       max_iter=500)
# Full column set with the 'smote' resampling strategy.
measures = run_a_model('logi-all-smote', lr, columns_all, 'smote', params)

In [80]:
# Store the all-columns SMOTE run (pickle + row in test_results).
# Each execution of this cell stores it again.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all-smote_smote.pkl'

In [81]:
# Candidate values for C: 16 points log-spaced from 10**-3 to 10**3.
Cs = np.logspace(-3, 3, 16)
params = dict(logisticregression__C=Cs)

# L1-penalised estimator fitted with liblinear on the full column set.
lr = LogisticRegression(solver='liblinear', 
                       penalty='l1',
                       random_state=33,
                       max_iter=400)
# Stored in `measures2`, so the lbfgs SMOTE results in `measures` stay
# available in the kernel alongside this run.
measures2 = run_a_model('logi-l1-all-smote', lr, columns_all, 'smote', params)

In [82]:
# Save the L1 + SMOTE all-columns run held in measures2.
# Not idempotent: a re-run inserts another row.
store_results(measures2)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-l1-all-smote_smote.pkl'

In [85]:
# Rank every stored run by f1, best first.
df = %sql select * from test_results order by f1 DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
10 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-ros-l1_ros.pkl,0.8195,0.6352,0.3392,0.4422,0.793,24644,4594,1354,2358
1,../data/inter/logi-eda-ros_ros.pkl,0.819,0.6358,0.3385,0.4418,0.793,24627,4611,1352,2360
2,../data/inter/logi-all-ros_ros.pkl,0.8154,0.6406,0.3336,0.4387,0.7956,24488,4750,1334,2378
3,../data/inter/logi-all-l1_ros.pkl,0.8154,0.6401,0.3336,0.4386,0.7956,24492,4746,1336,2376
4,../data/inter/logi-context-l1_ros.pkl,0.8118,0.625,0.3254,0.428,0.7843,24429,4809,1392,2320
5,../data/inter/logi-context_ros.pkl,0.8118,0.625,0.3254,0.428,0.7844,24429,4809,1392,2320
6,../data/inter/logi-eda-l1_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
7,../data/inter/logi-eda-smote_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
8,../data/inter/logi-l1-all-smote_smote.pkl,0.8012,0.6188,0.3091,0.4123,0.7679,24104,5134,1415,2297
9,../data/inter/logi-all-smote_smote.pkl,0.7926,0.6355,0.3009,0.4084,0.7735,23758,5480,1353,2359


In [84]:
# Same table ranked by auc, best first.
df = %sql select * from test_results order by auc DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
10 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-all-ros_ros.pkl,0.8154,0.6406,0.3336,0.4387,0.7956,24488,4750,1334,2378
1,../data/inter/logi-all-l1_ros.pkl,0.8154,0.6401,0.3336,0.4386,0.7956,24492,4746,1336,2376
2,../data/inter/logi-eda-ros-l1_ros.pkl,0.8195,0.6352,0.3392,0.4422,0.793,24644,4594,1354,2358
3,../data/inter/logi-eda-ros_ros.pkl,0.819,0.6358,0.3385,0.4418,0.793,24627,4611,1352,2360
4,../data/inter/logi-eda-l1_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
5,../data/inter/logi-eda-smote_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
6,../data/inter/logi-context_ros.pkl,0.8118,0.625,0.3254,0.428,0.7844,24429,4809,1392,2320
7,../data/inter/logi-context-l1_ros.pkl,0.8118,0.625,0.3254,0.428,0.7843,24429,4809,1392,2320
8,../data/inter/logi-all-smote_smote.pkl,0.7926,0.6355,0.3009,0.4084,0.7735,23758,5480,1353,2359
9,../data/inter/logi-l1-all-smote_smote.pkl,0.8012,0.6188,0.3091,0.4123,0.7679,24104,5134,1415,2297


In [86]:
# Same table ranked by recall, best first.
df = %sql select * from test_results order by recall DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
10 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-l1_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
1,../data/inter/logi-eda-smote_smote.pkl,0.8007,0.6525,0.3145,0.4245,0.7905,23960,5278,1290,2422
2,../data/inter/logi-all-ros_ros.pkl,0.8154,0.6406,0.3336,0.4387,0.7956,24488,4750,1334,2378
3,../data/inter/logi-all-l1_ros.pkl,0.8154,0.6401,0.3336,0.4386,0.7956,24492,4746,1336,2376
4,../data/inter/logi-eda-ros_ros.pkl,0.819,0.6358,0.3385,0.4418,0.793,24627,4611,1352,2360
5,../data/inter/logi-all-smote_smote.pkl,0.7926,0.6355,0.3009,0.4084,0.7735,23758,5480,1353,2359
6,../data/inter/logi-eda-ros-l1_ros.pkl,0.8195,0.6352,0.3392,0.4422,0.793,24644,4594,1354,2358
7,../data/inter/logi-context-l1_ros.pkl,0.8118,0.625,0.3254,0.428,0.7843,24429,4809,1392,2320
8,../data/inter/logi-context_ros.pkl,0.8118,0.625,0.3254,0.428,0.7844,24429,4809,1392,2320
9,../data/inter/logi-l1-all-smote_smote.pkl,0.8012,0.6188,0.3091,0.4123,0.7679,24104,5134,1415,2297


Remember, when looking at the above tables, that the parameter search was scored on f1.

