## Machine Learning and Deep Learning with more features

In this file, I make more prediction attempts with more features. Specifically, I apply the "other" and "econ" features in predictions.

In [2]:
import os
import pandas as pd
import math
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense

In [3]:
# set up the work directory
cwd = os.getcwd()
os.chdir(cwd)

df = pd.read_csv('bank-additional\\bank-additional.csv', delimiter = ";")

One of these features pdays might need to be cleaned. First, we find out all unique values of this feature.

In [4]:
df.pdays.unique()

array([999,  12,   3,   6,   5,   2,  10,  11,   7,   1,  18,   4,  15,
         0,  16,   9,  19,  17,  13,  21,  14], dtype=int64)

In [7]:
# pdays_1: day_0: no contact before; day_1: previous contact <= 10 days; day_2: previous contact > 10 days
df['pdays_1'] = np.where(df.pdays == 999, 'day_0',
                        np.where(df.pdays <= 10, 'day_1', 'day_2'))
df.drop('pdays', axis = 1, inplace = True)

In [9]:
all_features = df.columns

# the features can be categorized following subgroups:
# client-specific, other and economics variables
client = all_features[0:11]
other = all_features[11:15]
econ = all_features[16:20]

# manually input the property of each feature: categorical/numeric variable?
num_features = ['age', 'duration', 'campaign']
num_features.extend(econ)

# save feature property in a dict
prop = {}
for i in all_features:
    if i in num_features:
        prop[i] = 'num'
    else:
        prop[i] = 'cat'
        
prop

{'age': 'num',
 'job': 'cat',
 'marital': 'cat',
 'education': 'cat',
 'default': 'cat',
 'housing': 'cat',
 'loan': 'cat',
 'contact': 'cat',
 'month': 'cat',
 'day_of_week': 'cat',
 'duration': 'num',
 'campaign': 'num',
 'previous': 'cat',
 'poutcome': 'cat',
 'emp.var.rate': 'cat',
 'cons.price.idx': 'cat',
 'cons.conf.idx': 'num',
 'euribor3m': 'num',
 'nr.employed': 'num',
 'y': 'num',
 'pdays_1': 'cat'}

In [12]:
# encoding features
def encoder_x(df, prop):
    
    output = df.copy()
    for i in df.columns:
        if prop[i] == 'cat':
            temp = pd.get_dummies(df[i], prefix = i, prefix_sep = '_', drop_first = True)
            output= pd.concat([output, temp], axis = 1)
            output.drop(i, axis = 1, inplace = True)
        
    return(output)

# encoding predicted labels
def encoder_y(y):
    
    le = LabelEncoder()
    le.fit(y)
    y_enc = le.transform(y)
    
    return(y_enc)

def model_output(X_enc, y_enc, model):
    
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size = 0.2, random_state = 1)
    
    if model == 'log':
        clf = LogisticRegression(random_state = 1, max_iter = 2000).fit(X_train, y_train)
    elif model == 'rfc':
        clf = RandomForestClassifier(n_estimators = 100, oob_score = True, random_state = 1).fit(X_train, y_train)
    elif model == 'svc':
        clf = svm.SVC().fit(X_train, y_train)
    elif model == 'bag':
        clf = BaggingClassifier(base_estimator = SVC(), n_estimators = 20, random_state = 0).fit(X_train, y_train)
    elif model == 'bos':
        clf = GradientBoostingClassifier(random_state = 0).fit(X_train, y_train)
    else:
        return(None)
    
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict = True)
    acr, prec, recl, fscr = report['accuracy'], report['1']['precision'], report['1']['recall'], report['1']['f1-score']
    d = {'name': [model], 'acr': [acr], 'prec': [prec], 'recl': [recl], 'fsc':[fscr]}
    output = pd.DataFrame(d)
    
    return(output)

In [13]:
X, y = df.drop('y', axis = 1), df['y']
X_enc, y_enc = encoder_x(X, prop), encoder_y(y)

In [15]:
li = ['log', 'rfc', 'svc', 'bag', 'bos']
perf = pd.DataFrame()

for i in li:
    temp = model_output(X_enc, y_enc, i)
    perf = perf.append(temp)
    
perf

Unnamed: 0,name,acr,prec,recl,fsc
0,log,0.919903,0.59322,0.454545,0.514706
0,rfc,0.915049,0.577778,0.337662,0.42623
0,svc,0.90534,0.0,0.0,0.0
0,bag,0.90534,0.0,0.0,0.0
0,bos,0.923544,0.612903,0.493506,0.546763


The F-1 score improves significantly with the newly added features. Note that the precison and recall become zero with SVC and Bagging since all predictions are 0 (not success).

In [16]:
def gridcv(X_enc, y_enc, model, tuned_parameters):
    
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size = 0.2, random_state = 1)
    
    if model == 'log':
        m = LogisticRegression(random_state = 1, max_iter = 2000)
    elif model == 'rfc':
        m = RandomForestClassifier(n_estimators = 100, oob_score = True, random_state = 1)
    elif model == 'svc':
        m = svm.SVC()
    elif model == 'bag':
        m = BaggingClassifier(base_estimator = SVC(), n_estimators = 20, random_state = 0)
    elif model == 'bos':
        m = GradientBoostingClassifier(random_state = 0)
    else:
        return(None)
    
    clf = GridSearchCV(m, tuned_parameters, scoring='precision_macro')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict = True)
    acr, prec, recl, fscr = report['accuracy'], report['1']['precision'], report['1']['recall'], report['1']['f1-score']
    d = {'name': [model], 'best_para': [clf.best_params_], 'acr': [acr], 'prec': [prec], 
         'recl': [recl], 'fsc':[fscr]}
    output = pd.DataFrame(d)
    
    return(output)

In [17]:
li = ['log', 'rfc', 'bos']
perf = pd.DataFrame()
tuned_parameters_set = {'log':{'C': [0.1, 1, 10]}, 'rfc':{'max_features': ['auto', 'log2']}, 
                       'bos':{'n_estimators': [50, 100, 200]}}

for i in li:
    
    tuned_parameters = tuned_parameters_set[i]
    temp = gridcv(X_enc, y_enc, i, tuned_parameters)
    perf = perf.append(temp)
    
perf

Unnamed: 0,name,best_para,acr,prec,recl,fsc
0,log,{'C': 0.1},0.923544,0.62963,0.441558,0.519084
0,rfc,{'max_features': 'log2'},0.915049,0.577778,0.337662,0.42623
0,bos,{'n_estimators': 50},0.923544,0.625,0.454545,0.526316


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size = 0.2, random_state = 1)

model = Sequential()
model.add(Dense(12, input_dim = X_train.shape[1], activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.fit(X_train, y_train, epochs = 10, batch_size = 10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x24d75cfdb88>

In [19]:
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)
report = classification_report(y_test, y_pred, output_dict = True)
acr, prec, recl, fscr = report['accuracy'], report['1']['precision'], report['1']['recall'], report['1']['f1-score']
d = {'name': 'NN', 'acr': [acr], 'prec': [prec], 
     'recl': [recl], 'fsc':[fscr]}
output = pd.DataFrame(d)
output

Unnamed: 0,name,acr,prec,recl,fsc
0,NN,0.907767,0.505618,0.584416,0.542169


The NN model has further improvement in F1 score by trading off some precision, as compared with xgboost.