### Part 3: LSTM Implementation

In [68]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

from keras.models import Sequential
from keras.layers import LSTM, Dense

In [2]:
bank_data = pd.read_csv('data/bank-full.csv', sep=';')
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Editing the dataset to only include features I want in my model

In [3]:
bank_data = bank_data.loc[:,['age','marital','default','housing','loan','cons.price.idx','cons.conf.idx','y']]

Mapping the several feature columns and the prediction column y to numerical values

In [4]:
# Unknown default is considered 'no'
bank_data['default'] = bank_data['default'].map({'no':0,'yes':1,'unknown':0})

# Unknown housing is considered 'no'
bank_data['housing'] = bank_data['housing'].map({'no':0,'yes':1,'unknown':0})

# Unknown loan is considered 'no'
bank_data['loan'] = bank_data['loan'].map({'no':0,'yes':1,'unknown':0})

# Unknown marital is considered single
bank_data['marital'] = bank_data['marital'].map({'divorced':0,'single':1,'married':2,'unknown':1})

bank_data['y'] = bank_data['y'].map({'no':0,'yes':1})

In [5]:
bank_data = bank_data.sample(frac=1).reset_index(drop=True)
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   marital         41188 non-null  int64  
 2   default         41188 non-null  int64  
 3   housing         41188 non-null  int64  
 4   loan            41188 non-null  int64  
 5   cons.price.idx  41188 non-null  float64
 6   cons.conf.idx   41188 non-null  float64
 7   y               41188 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 2.5 MB


In [8]:
x = bank_data.drop('y', axis=1)
y = bank_data['y']

MinMaxScaler = preprocessing.MinMaxScaler()
X = MinMaxScaler.fit_transform(x)
X = pd.DataFrame(X, columns=['age', 'marital', 'default', 'housing', 'loan', 'cons.price.idx', 'cons.conf.idx'])

Creating the model

In [66]:
model = Sequential()
# model.add(LSTM(100))
# model.add(LSTM(49))
model.add(Dense(1))
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [89]:
kf = KFold(n_splits=10)
classifier_performance = []

for i, (train_indices, test_indices) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    model.fit(X_train.to_numpy(), y_train.to_numpy(), batch_size=len(X_train), epochs=10)

    y_pred = model.predict(X_test)
    y_pred[y_pred <= 0] = 0
    y_pred[y_pred > 0] = 1
    y_pred
    
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    # TP = cm[0][0]
    # FN = cm[1][0]
    # TN = cm[1][1]
    # FP = cm[0][1]
    P = TP + FN
    N = TN + FP
    TPR = TP/P
    TNR = TN/N
    FPR = FP/N
    FNR = FN/P
    r = TP/P
    p = TP/(TP+FP)
    F1 = 2*(p*r)/(p+r)
    Acc = (TP+TN)/(P+N)
    Err = (FP+FN)/(P+N)
    BACC = (TPR + TNR)/2
    TSS = TP/(TP+FN) - FP/(FP+TN)
    HSS = 2*(TP*TN - FP*FN) / ((TP+FN)*(FN+TN) + (TP+FP)*(FP+TN))
    classifier_performance.append([i, TP, TN, FP, FN, P, N, TPR, TNR, FPR, FNR, r, p, F1, Acc, Err, BACC, TSS, HSS])

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.8834 - loss: 1.1926
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.8837 - loss: 1.1928
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.8839 - loss: 1.1926
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.8839 - loss: 1.1925
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.8839 - loss: 1.1926
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step - accuracy: 0.8839 - loss: 1.1923
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.8844 - loss: 1.1922
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.8844 - loss: 1.1919
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [91]:
columns = ['Index', 'TP', 'TN', 'FP', 'FN', 'P', 'N', 'TPR', 'TNR', 'FPR', 'FNR', 'r', 'p', 'F1', 'Acc', 'Err', 'BACC', 'TSS', 'HSS']
class_perf_df = pd.DataFrame(classifier_performance, columns=columns)
class_perf_df

Unnamed: 0,Index,TP,TN,FP,FN,P,N,TPR,TNR,FPR,FNR,r,p,F1,Acc,Err,BACC,TSS,HSS
0,0,207,2059,1620,233,440,3679,0.470455,0.559663,0.440337,0.529545,0.470455,0.1133,0.18262,0.550134,0.449866,0.515059,0.030117,0.012614
1,1,205,2086,1600,228,433,3686,0.473441,0.565925,0.434075,0.526559,0.473441,0.113573,0.183199,0.556203,0.443797,0.519683,0.039366,0.016415
2,2,219,2055,1567,278,497,3622,0.440644,0.567366,0.432634,0.559356,0.440644,0.12262,0.191853,0.552076,0.447924,0.504005,0.00801,0.00378
3,3,215,2095,1546,263,478,3641,0.449791,0.575391,0.424609,0.550209,0.449791,0.12209,0.19205,0.560816,0.439184,0.512591,0.025182,0.011627
4,4,209,2126,1551,233,442,3677,0.472851,0.578189,0.421811,0.527149,0.472851,0.11875,0.189827,0.566885,0.433115,0.52552,0.051039,0.022078
5,5,214,2079,1552,274,488,3631,0.438525,0.57257,0.42743,0.561475,0.438525,0.121178,0.189885,0.556689,0.443311,0.505547,0.011094,0.0052
6,6,213,2047,1589,270,483,3636,0.440994,0.562981,0.437019,0.559006,0.440994,0.118202,0.186433,0.548677,0.451323,0.501988,0.003975,0.00182
7,7,202,2050,1595,272,474,3645,0.42616,0.562414,0.437586,0.57384,0.42616,0.11241,0.177895,0.546735,0.453265,0.494287,-0.011425,-0.00516
8,8,206,2110,1533,269,475,3643,0.433684,0.579193,0.420807,0.566316,0.433684,0.118459,0.186089,0.562409,0.437591,0.506439,0.012877,0.00597
9,9,202,2088,1600,228,430,3688,0.469767,0.566161,0.433839,0.530233,0.469767,0.112098,0.181004,0.556095,0.443905,0.517964,0.035928,0.014912


Metrics for Average Fold

In [93]:
class_perf_df = class_perf_df.fillna(0)

average_row = class_perf_df.mean(axis=0)
average_row = average_row.drop(average_row.index[0])
df = pd.DataFrame(average_row).T
method = pd.DataFrame([{'Method': 'LSTM'}])
df.insert(0, 'Method', method)
df.squeeze()

Method        LSTM
TP           209.2
TN          2079.5
FP          1575.3
FN           254.8
P            464.0
N           3654.8
TPR       0.451631
TNR       0.568985
FPR       0.431015
FNR       0.548369
r         0.451631
p         0.117268
F1        0.186085
Acc       0.555672
Err       0.444328
BACC      0.510308
TSS       0.020616
HSS       0.008926
Name: 0, dtype: object

The KNN method has an average accuracy of 0.5557.