# Run and evaluate Logistic Regression models

I built this notebook by running models, looking at the output, then going back and changing parameters as needed. When I'm happy with the output, I save it and move on.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn import metrics

from data_pipeline import run_a_model

## Utilities to load and save good models

In [2]:
# %load load_save.py
# Database load and store functions 
# This file is meant to be %load'ed at the top of the
# various model-run notebooks


# Load the IPython extension named `sql`; the %sql magic is used below.
%load_ext sql

# Ask the SQL magic to hand query results back as pandas DataFrames
# (later cells call DataFrame methods such as .round() on the result).
%config SqlMagic.autopandas = True 

# Open the connection to the local `bankcalls` PostgreSQL database.
%sql postgresql://localhost/bankcalls
    

import os.path
import pickle

from string import Template

def store_results(m):
    filename = m['name'] + '_' + m['strategy'] + '.pkl'
    dirname = '../data/inter/'
    pathname = dirname + filename
    
    count = 0
    while os.path.isfile(pathname):
        pathname = (dirname + 
                    m['name'] + 
                    '_' + 
                    m['strategy'] + 
                    str(count) +
                    '.pkl'
                   )
        count += 1
                    
    f = open(pathname, 'w+b')
    pickle.dump(m, f)
    f.close()
    
    # all the quotes and brackets seem to confuse %sql so I'm templating
    # the command manually
    sqlt = Template("""INSERT 
        INTO test_results(pathname, accuracy, recall, precision, 
                            f1, auc, cm_00, cm_01, cm_10, cm_11) 
        VALUES  ($pg_path, $accuracy, $recall, $precision, 
                            $f1, $auc, $cm_00, $cm_01, $cm_10, $cm_11);""")
    sqlins = sqlt.substitute(pg_path = "'" + pathname + "'",
                    accuracy = m['accuracy'],
                    recall = m['recall'],
                    precision = m['precision'],
                    f1 = m['f1'],
                    auc = m['auc'],
                    cm_00 = m['cm'][0,0],
                    cm_01 = m['cm'][0,1],
                    cm_10 = m['cm'][1,0],
                    cm_11 = m['cm'][1,1]
                   )
    %sql $sqlins
                    
    return pathname
    
        
def load_results(path):
    """Load a results object previously pickled to ``path``.

    Parameters
    ----------
    path : str
        Path of the pickle file to read, e.g. a value returned by
        ``store_results``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this project wrote itself, never files from an untrusted source.
    # Read-only binary mode is enough, and the context manager guarantees the
    # handle is closed (the file used to be left open).
    with open(path, 'rb') as f:
        return pickle.load(f)

# Choose columns

My three column sets here are: 

- EDA : The columns that looked most promising during EDA.
- Context : The columns providing calendar and economic context only. No customer data.
- All : Throw all the columns in the dataset into the model 


In [3]:

# Every set ends with 'success' and 'bank_addl_id'
# (presumably the target and the row id -- confirm in data_pipeline).

# EDA: the columns that looked most promising during EDA.
columns_eda = [
    'job', 'contact', 'month', 'poutcome',
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'success', 'bank_addl_id',
]

# Context: calendar and economic context only, no customer data.
columns_context = [
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'month', 'day_of_week',
    'success', 'bank_addl_id',
]

# All: every column in the dataset.
columns_all = [
    'age', 'job', 'marital', 'education',
    'in_default', 'housing', 'loan', 'contact',
    'month', 'day_of_week', 'campaign',
    'previous', 'poutcome',
    'cons_price_idx', 'cons_conf_idx', 'euribor3m',
    'success', 'bank_addl_id',
]

# Logistic regression

## EDA columns, Random Over Sampling

In [4]:
# Ten log-spaced candidate values for C, from 1e-1 to 1e2.
Cs = np.logspace(-1, 2, num=10)
params = {'logisticregression__C': Cs}

# lbfgs solver, no explicit penalty argument, fixed seed.
lr = LogisticRegression(solver='lbfgs', random_state=33, max_iter=100)

# EDA column set with the 'ros' (random over sampling) strategy.
eda_measures = run_a_model(
    'logi-eda-ros',
    lr,
    columns_eda,
    'ros',
    params,
)


In [5]:
# Pickle the EDA / random-over-sampling run and log its metrics row.
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-ros_ros.pkl'

In [6]:
# Ten log-spaced candidate values for C, from 1e-5 to 1e0.
Cs = np.logspace(-5, 0, num=10)
params = {'logisticregression__C': Cs}

# Same EDA columns and 'ros' strategy, this time with an L1 penalty
# on the liblinear solver.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    random_state=33,
    max_iter=100,
)
eda_measures = run_a_model(
    'logi-eda-l1',
    lr,
    columns_eda,
    'ros',
    params,
)

In [7]:
# Pickle the EDA / L1 / random-over-sampling run and log its metrics row.
store_results(eda_measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-l1_ros.pkl'

In [8]:
# List every row logged so far to compare the two random-over-sampling runs.
%sql select * from test_results;

 * postgresql://localhost/bankcalls
2 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-ros_ros.pkl,0.805766,0.651131,0.321324,0.430301,0.794002,24133,5105,1295,2417
1,../data/inter/logi-eda-l1_ros.pkl,0.721093,0.719558,0.246858,0.367603,0.746322,21089,8149,1041,2671


It's interesting to see the trade-offs. The L1 penalty model labeled about 3,300 more items "1" (10,820 vs. 7,522), but only about 250 of those extra labels were correct (2,671 vs. 2,417 true positives). 

## EDA columns, SMOTENC over sampling

In [9]:
# Ten log-spaced candidate values for C, from 1e1 to 1e3.
Cs = np.logspace(1, 3, num=10)
params = {'logisticregression__C': Cs}

# lbfgs solver with a larger iteration budget (200) for the SMOTE run.
lr = LogisticRegression(solver='lbfgs', random_state=33, max_iter=200)

# EDA column set with the 'smote' resampling strategy.
measures3 = run_a_model(
    'logi-eda-smote',
    lr,
    columns_eda,
    'smote',
    params,
)


In [10]:
# Pickle the EDA / SMOTE run and log its metrics row.
store_results(measures3) 

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-smote_smote.pkl'

In [11]:
# Ten log-spaced candidate values for C, from 1e-5 to 1e0.
Cs = np.logspace(-5, 0, num=10)
params = {'logisticregression__C': Cs}

# L1-penalised model on the EDA columns, resampled with 'smote'.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    random_state=33,
    max_iter=100,
)
measures4 = run_a_model(
    'logi-eda-l1',
    lr,
    columns_eda,
    'smote',
    params,
)

In [12]:
# Pickle the EDA / L1 / SMOTE run and log its metrics row.
store_results(measures4)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-eda-l1_smote.pkl'

In [13]:
# List every row logged so far: random over sampling vs. SMOTE on the EDA columns.
%sql select * from test_results;

 * postgresql://localhost/bankcalls
4 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-ros_ros.pkl,0.805766,0.651131,0.321324,0.430301,0.794002,24133,5105,1295,2417
1,../data/inter/logi-eda-l1_ros.pkl,0.721093,0.719558,0.246858,0.367603,0.746322,21089,8149,1041,2671
2,../data/inter/logi-eda-smote_smote.pkl,0.802883,0.649784,0.317076,0.426186,0.790426,24043,5195,1300,2412
3,../data/inter/logi-eda-l1_smote.pkl,0.721093,0.719558,0.246858,0.367603,0.746138,21089,8149,1041,2671


At first glance, it does not appear that the slower SMOTENC resampling method added any benefit to the LogisticRegression model. 

## Context columns only

### Random Over Sampling 

In [14]:
# Ten log-spaced candidate values for C, from 1e-6 to 1e-3.
Cs = np.logspace(-6, -3, num=10)
params = {'logisticregression__C': Cs}

# lbfgs solver, no explicit penalty argument, fixed seed.
lr = LogisticRegression(solver='lbfgs', random_state=33, max_iter=100)

# Context-only column set with the 'ros' strategy.
measures = run_a_model(
    'logi-context',
    lr,
    columns_context,
    'ros',
    params,
)


In [15]:
# Pickle the context-columns / random-over-sampling run and log its metrics row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-context_ros.pkl'

In [16]:
# Ten log-spaced candidate values for C, from 1e-5 to 1e-2.
Cs = np.logspace(-5, -2, num=10)
params = {'logisticregression__C': Cs}

# L1-penalised model on the context-only columns, 'ros' strategy.
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=33, max_iter=100)
measures = run_a_model(
    'logi-context-l1',
    lr,
    columns_context,
    'ros',
    params,
)

In [17]:
# Pickle the context-columns / L1 run and log its metrics row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-context-l1_ros.pkl'

In [18]:
# Fetch every logged row, then round to four decimals for a compact display.
df = %sql select * from test_results;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
6 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-eda-ros_ros.pkl,0.8058,0.6511,0.3213,0.4303,0.794,24133,5105,1295,2417
1,../data/inter/logi-eda-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7463,21089,8149,1041,2671
2,../data/inter/logi-eda-smote_smote.pkl,0.8029,0.6498,0.3171,0.4262,0.7904,24043,5195,1300,2412
3,../data/inter/logi-eda-l1_smote.pkl,0.7211,0.7196,0.2469,0.3676,0.7461,21089,8149,1041,2671
4,../data/inter/logi-context_ros.pkl,0.7214,0.7155,0.2464,0.3665,0.7294,21113,8125,1056,2656
5,../data/inter/logi-context-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671


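Are all of my L1 models converging to the same thing? Every L1 run so far has exactly the same confusion matrix (21089 / 8149 / 1041 / 2671), regardless of column set or resampling strategy. 
I don't see anything here that says this is so great I should try SMOTE, too. 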

## All the columns 

### Random over sampling

In [19]:
# Ten log-spaced candidate values for C, from 1e0 to 1e4.
Cs = np.logspace(0, 4, num=10)
params = {'logisticregression__C': Cs}

# lbfgs solver with an iteration budget of 300 for the full column set.
lr = LogisticRegression(solver='lbfgs', random_state=33, max_iter=300)

# All columns with the 'ros' strategy.
measures = run_a_model(
    'logi-all-ros',
    lr,
    columns_all,
    'ros',
    params,
)

In [20]:
# Pickle the all-columns / random-over-sampling run and log its metrics row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all-ros_ros.pkl'

In [21]:
# Ten log-spaced candidate values for C, from 1e-4 to 1e1.
Cs = np.logspace(-4, 1, num=10)
params = {'logisticregression__C': Cs}

# L1-penalised model on all columns, 'ros' strategy.
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=33, max_iter=300)
measures = run_a_model(
    'logi-all-l1',
    lr,
    columns_all,
    'ros',
    params,
)

In [22]:
# Pickle the all-columns / L1 run and log its metrics row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all-l1_ros.pkl'

In [23]:
# Rank every logged run by F1, best first, rounded to four decimals for display.
df = %sql select * from test_results order by f1 desc;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
8 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-all-ros_ros.pkl,0.8121,0.6463,0.3296,0.4366,0.7958,24359,4879,1313,2399
1,../data/inter/logi-eda-ros_ros.pkl,0.8058,0.6511,0.3213,0.4303,0.794,24133,5105,1295,2417
2,../data/inter/logi-eda-smote_smote.pkl,0.8029,0.6498,0.3171,0.4262,0.7904,24043,5195,1300,2412
3,../data/inter/logi-context-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
4,../data/inter/logi-all-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
5,../data/inter/logi-eda-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7463,21089,8149,1041,2671
6,../data/inter/logi-eda-l1_smote.pkl,0.7211,0.7196,0.2469,0.3676,0.7461,21089,8149,1041,2671
7,../data/inter/logi-context_ros.pkl,0.7214,0.7155,0.2464,0.3665,0.7294,21113,8125,1056,2656


L1 penalties don't seem to result in any model improvement. Dropping them for the longer-running smote case.

### SMOTE-NC 

In [24]:
# Ten log-spaced candidate values for C, from 1e-6 to 1e-2.
Cs = np.logspace(-6, -2, num=10)
params = {'logisticregression__C': Cs}

# lbfgs solver with an iteration budget of 500.
lr = LogisticRegression(solver='lbfgs', random_state=33, max_iter=500)

# All columns with the 'smote' resampling strategy.
measures = run_a_model(
    'logi-all',
    lr,
    columns_all,
    'smote',
    params,
)

In [26]:
# Pickle the all-columns / SMOTE run and log its metrics row.
store_results(measures)

 * postgresql://localhost/bankcalls
1 rows affected.


'../data/inter/logi-all_smote.pkl'

In [27]:
# Final ranking of all nine runs by F1, best first.
df = %sql select * from test_results order by f1 DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
9 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-all-ros_ros.pkl,0.8121,0.6463,0.3296,0.4366,0.7958,24359,4879,1313,2399
1,../data/inter/logi-eda-ros_ros.pkl,0.8058,0.6511,0.3213,0.4303,0.794,24133,5105,1295,2417
2,../data/inter/logi-eda-smote_smote.pkl,0.8029,0.6498,0.3171,0.4262,0.7904,24043,5195,1300,2412
3,../data/inter/logi-context-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
4,../data/inter/logi-eda-l1_smote.pkl,0.7211,0.7196,0.2469,0.3676,0.7461,21089,8149,1041,2671
5,../data/inter/logi-eda-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7463,21089,8149,1041,2671
6,../data/inter/logi-all-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
7,../data/inter/logi-context_ros.pkl,0.7214,0.7155,0.2464,0.3665,0.7294,21113,8125,1056,2656
8,../data/inter/logi-all_smote.pkl,0.7077,0.7101,0.2355,0.3537,0.7465,20682,8556,1076,2636


In [28]:
# Same nine runs ranked by AUC, best first.
df = %sql select * from test_results order by auc DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
9 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-all-ros_ros.pkl,0.8121,0.6463,0.3296,0.4366,0.7958,24359,4879,1313,2399
1,../data/inter/logi-eda-ros_ros.pkl,0.8058,0.6511,0.3213,0.4303,0.794,24133,5105,1295,2417
2,../data/inter/logi-eda-smote_smote.pkl,0.8029,0.6498,0.3171,0.4262,0.7904,24043,5195,1300,2412
3,../data/inter/logi-context-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
4,../data/inter/logi-all-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
5,../data/inter/logi-all_smote.pkl,0.7077,0.7101,0.2355,0.3537,0.7465,20682,8556,1076,2636
6,../data/inter/logi-eda-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7463,21089,8149,1041,2671
7,../data/inter/logi-eda-l1_smote.pkl,0.7211,0.7196,0.2469,0.3676,0.7461,21089,8149,1041,2671
8,../data/inter/logi-context_ros.pkl,0.7214,0.7155,0.2464,0.3665,0.7294,21113,8125,1056,2656


In [29]:
# Same nine runs ranked by recall, best first.
df = %sql select * from test_results order by recall DESC;
df.round(decimals=4)

 * postgresql://localhost/bankcalls
9 rows affected.


Unnamed: 0,pathname,accuracy,recall,precision,f1,auc,cm_00,cm_01,cm_10,cm_11
0,../data/inter/logi-context-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
1,../data/inter/logi-eda-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7463,21089,8149,1041,2671
2,../data/inter/logi-all-l1_ros.pkl,0.7211,0.7196,0.2469,0.3676,0.7489,21089,8149,1041,2671
3,../data/inter/logi-eda-l1_smote.pkl,0.7211,0.7196,0.2469,0.3676,0.7461,21089,8149,1041,2671
4,../data/inter/logi-context_ros.pkl,0.7214,0.7155,0.2464,0.3665,0.7294,21113,8125,1056,2656
5,../data/inter/logi-all_smote.pkl,0.7077,0.7101,0.2355,0.3537,0.7465,20682,8556,1076,2636
6,../data/inter/logi-eda-ros_ros.pkl,0.8058,0.6511,0.3213,0.4303,0.794,24133,5105,1295,2417
7,../data/inter/logi-eda-smote_smote.pkl,0.8029,0.6498,0.3171,0.4262,0.7904,24043,5195,1300,2412
8,../data/inter/logi-all-ros_ros.pkl,0.8121,0.6463,0.3296,0.4366,0.7958,24359,4879,1313,2399


Remember, when looking at the tables above, that the parameter search was scored on f1.

