In [1]:
import pandas as pd
import numpy as np

In [2]:
%load_ext sql
%sql postgresql://localhost/bankcalls
%config SqlMagic.autopandas =  True

In [3]:
# sklearn stuff goes here
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Load the columns that looked most promising in our EDA

In [4]:
%%sql  df << 
/*--  initial predictors looked interesting in the EDA notebook  --*/
SELECT 
/*-- categorical predictors --*/
       job, 
       contact, 
       month, 
       poutcome, 
        
/*-- integer predictors     --*/
       previous, 
        
/*-- float predictors       --*/
       cons_price_idx, 
       cons_conf_idx, 
       euribor3m,
    
/*-- target column          --*/
       success
FROM  
       bank_addl;

 * postgresql://localhost/bankcalls
41188 rows affected.
Returning data to local variable df


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 9 columns):
job               41188 non-null object
contact           41188 non-null object
month             41188 non-null object
poutcome          41188 non-null object
previous          41188 non-null int64
cons_price_idx    41188 non-null float64
cons_conf_idx     41188 non-null float64
euribor3m         41188 non-null float64
success           41188 non-null object
dtypes: float64(3), int64(1), object(5)
memory usage: 2.8+ MB


## We can save some memory by hashing all the categorical predictors


In [6]:
df['job']      = df[     'job'].astype('category')
df['contact']  = df[ 'contact'].astype('category')
df['month']    = df[   'month'].astype('category')
df['poutcome'] = df['poutcome'].astype('category')

### For the target column, change explicitly to 0 or 1

In [7]:
bool_index    = df.query("success == 'yes'").index
df['success'] = 0
df.loc[bool_index, 'success'] = 1

In [8]:
df.success.value_counts()

0    36548
1     4640
Name: success, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 9 columns):
job               41188 non-null category
contact           41188 non-null category
month             41188 non-null category
poutcome          41188 non-null category
previous          41188 non-null int64
cons_price_idx    41188 non-null float64
cons_conf_idx     41188 non-null float64
euribor3m         41188 non-null float64
success           41188 non-null int64
dtypes: category(4), float64(3), int64(2)
memory usage: 1.7 MB


# Split into train and test data

In [10]:
y = df['success']
X = df.drop(columns='success')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=33)

# Build a model 

## Preprocessing the inputs

In [12]:
numerical = [ 'previous', 'cons_price_idx', 'cons_conf_idx', 'euribor3m' ]
categorical = [ 'job', 'contact', 'month', 'poutcome' ]

In [13]:
standard = StandardScaler()
X_train_scaled = standard.fit_transform(X_train[numerical])
X_test_scaled  = standard.transform(X_test[numerical])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
X_train_dummies = pd.get_dummies(X_train[categorical], drop_first=True)
X_test_dummies =  pd.get_dummies(X_test[categorical], drop_first=True)

In [15]:
X_train_onehot = X_train_dummies.values
X_test_onehot =  X_test_dummies.values

In [16]:
X_train_prepped = np.concatenate([X_train_scaled, X_train_onehot], axis = 1)
X_test_prepped  = np.concatenate([X_test_scaled, X_test_onehot], axis = 1)

coef_names = numerical + list(X_train_dummies.columns)

# Run a model

In [17]:
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_prepped, y_train)
lr.score(X_train_prepped, y_train)

0.8997572078907435

In [18]:
y_predict = lr.predict(X_test_prepped)
lr.score(X_test_prepped, y_test)

0.8979121145909201

In [19]:
lr.intercept_

array([-2.52929402])

In [20]:
assert(len(coef_names) == len(lr.coef_.T))
pd.DataFrame(zip(coef_names, lr.coef_.T))

Unnamed: 0,0,1
0,previous,[0.025323235975612483]
1,cons_price_idx,[0.30669929041514327]
2,cons_conf_idx,[0.2200926275406222]
3,euribor3m,[-0.9881628316893285]
4,job_blue-collar,[-0.2575095677553163]
5,job_entrepreneur,[-0.04549779427141516]
6,job_housemaid,[-0.10351534962557206]
7,job_management,[-0.033843608116149136]
8,job_retired,[0.2476239658298761]
9,job_self-employed,[-0.1549714863215562]


In [21]:
probs = pd.DataFrame(lr.predict_proba(X_test_prepped))
probs.describe()

Unnamed: 0,0,1
count,8238.0,8238.0
mean,0.886248,0.113752
std,0.145738,0.145738
min,0.095691,0.023644
25%,0.895184,0.044145
50%,0.942883,0.057117
75%,0.955855,0.104816
max,0.976356,0.904309


# Analyze metrics 

Due to the unbalanced classes, just predicting zeroes should be a very good prediction, so that will be the 
model to beat for the minimum viable project.

## Accuracy

In [22]:
just_guess_zeroes = np.zeros(len(y_test))

In [23]:
metrics.accuracy_score(y_predict, y_test)

0.8979121145909201

In [24]:
metrics.accuracy_score(just_guess_zeroes, y_test)

0.8852876911871813

## Confusion matrix

In [25]:
cm = metrics.confusion_matrix(y_test, y_predict)
pd.DataFrame(index=['actual_neg', 'actual_pos'], columns=['predict_neg', 'predict_pos'], data=cm)

Unnamed: 0,predict_neg,predict_pos
actual_neg,7179,114
actual_pos,727,218


In [26]:
cm = metrics.confusion_matrix(y_test, just_guess_zeroes)
pd.DataFrame(index=['actual_neg', 'actual_pos'], columns=['zero_neg', 'zero_pos'], data=cm)

Unnamed: 0,zero_neg,zero_pos
actual_neg,7293,0
actual_pos,945,0


## Classification Report

In [27]:
print(metrics.classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7293
           1       0.66      0.23      0.34       945

   micro avg       0.90      0.90      0.90      8238
   macro avg       0.78      0.61      0.64      8238
weighted avg       0.88      0.90      0.88      8238



In [28]:
print(metrics.classification_report(y_test, just_guess_zeroes))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      7293
           1       0.00      0.00      0.00       945

   micro avg       0.89      0.89      0.89      8238
   macro avg       0.44      0.50      0.47      8238
weighted avg       0.78      0.89      0.83      8238



  'precision', 'predicted', average, warn_for)
