### Part 1: Random Forest implementation

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [3]:
# Load the semicolon-delimited bank marketing CSV into a DataFrame.
bank_data = pd.read_csv('data/bank-full.csv', delimiter=';')

In [4]:
# Print a summary of the freshly loaded frame: index range, column names,
# non-null counts and dtypes.
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Restricting the dataset to only the features I want in my model (plus the target column `y`)

In [5]:
# Keep only the columns used by the model: the chosen predictors plus the target 'y'.
selected_columns = [
    'age',
    'marital',
    'default',
    'housing',
    'loan',
    'cons.price.idx',
    'cons.conf.idx',
    'y',
]
bank_data = bank_data.loc[:, selected_columns]

Mapping the categorical feature columns and the target column `y` to numerical values

In [6]:
# Integer encodings for the categorical columns.
# 'unknown' is folded into a concrete category:
#   - default / housing / loan: unknown is considered 'no'
#   - marital: unknown is considered 'single'
yes_no_encoding = {'no': 0, 'yes': 1, 'unknown': 0}
category_encodings = {
    'default': yes_no_encoding,
    'housing': yes_no_encoding,
    'loan': yes_no_encoding,
    'marital': {'divorced': 0, 'single': 1, 'married': 2, 'unknown': 1},
    'y': {'no': 0, 'yes': 1},
}

for column, encoding in category_encodings.items():
    encoded = bank_data[column].map(encoding)

    # Series.map turns every value missing from the dict into NaN. Check BEFORE
    # assigning so that an unexpected category -- or re-running this cell on
    # already-encoded data -- raises instead of silently wiping the column.
    unmapped = bank_data.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column '{column}' contains values with no encoding: {unmapped}"
        )

    bank_data[column] = encoded

Randomizing the rows of the dataframe to ensure that the splitting of training and test sets is completely random

In [7]:
# Shuffle every row (frac=1 samples the whole frame) and rebuild a clean 0..n-1 index.
# The fixed random_state makes the shuffle reproducible on a fresh
# Restart & Run All; without it the row order differs on every run.
bank_data = bank_data.sample(frac=1, random_state=42).reset_index(drop=True)
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   marital         41188 non-null  int64  
 2   default         41188 non-null  int64  
 3   housing         41188 non-null  int64  
 4   loan            41188 non-null  int64  
 5   cons.price.idx  41188 non-null  float64
 6   cons.conf.idx   41188 non-null  float64
 7   y               41188 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 2.5 MB


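Separating the dataset into the feature matrix `X` and the target `y`. The training/test splitting and the initialization of the Random Forest model happen in the cross-validation cell that follows.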

In [8]:
# Target vector: the encoded 'y' column.
y = bank_data['y']

# Feature matrix: every remaining column.
X = bank_data.drop(columns='y')

# NOTE: a commented-out single train/test-split experiment was removed from this
# cell; it was dead code and had no effect on X or y.

In [9]:
# 10-fold cross-validation of a random forest.
# For every fold the model is refit on the training part and one row of
# confusion-matrix-derived metrics is appended to `classifier_performance`
# in the order: [fold index, TP, TN, FP, FN, P, N, TPR, TNR, FPR, FNR,
#                r, p, F1, Acc, Err, BACC, TSS, HSS].
kf = KFold(n_splits=10)
# Fixed seed so that the fitted forests, and hence the metrics, are reproducible.
rf = RandomForestClassifier(random_state=42)
classifier_performance = []

for i, (train_indices, test_indices) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    # labels=[0, 1] guarantees a 2x2 matrix even when a fold happens to contain
    # (and predict) a single class, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Rows are true classes, columns are predicted classes, so the flattened
    # order is TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()

    # The counts are NumPy integers: a zero denominator below produces NaN
    # (with a RuntimeWarning) rather than raising ZeroDivisionError.
    P = TP + FN                      # actual positives
    N = TN + FP                      # actual negatives
    TPR = TP/P                       # true positive rate (sensitivity)
    TNR = TN/N                       # true negative rate (specificity)
    FPR = FP/N                       # false positive rate
    FNR = FN/P                       # false negative rate
    r = TPR                          # recall is the same quantity as TPR
    p = TP/(TP+FP)                   # precision
    F1 = 2*(p*r)/(p+r)               # harmonic mean of precision and recall
    Acc = (TP+TN)/(P+N)              # accuracy
    Err = (FP+FN)/(P+N)              # error rate (1 - accuracy)
    BACC = (TPR + TNR)/2             # balanced accuracy
    TSS = TPR - FPR                  # true skill statistic
    # Heidke skill score
    HSS = 2*(TP*TN - FP*FN) / ((TP+FN)*(FN+TN) + (TP+FP)*(FP+TN))
    classifier_performance.append([i, TP, TN, FP, FN, P, N, TPR, TNR, FPR, FNR, r, p, F1, Acc, Err, BACC, TSS, HSS])

In [10]:
# Column labels, listed in the same order as the values stored for each fold.
count_columns = ['TP', 'TN', 'FP', 'FN', 'P', 'N']
rate_columns = ['TPR', 'TNR', 'FPR', 'FNR']
score_columns = ['r', 'p', 'F1', 'Acc', 'Err', 'BACC', 'TSS', 'HSS']
columns = ['Index'] + count_columns + rate_columns + score_columns

# One row per fold.
class_perf_df = pd.DataFrame(data=classifier_performance, columns=columns)
class_perf_df

Unnamed: 0,Index,TP,TN,FP,FN,P,N,TPR,TNR,FPR,FNR,r,p,F1,Acc,Err,BACC,TSS,HSS
0,0,115,3506,139,359,474,3645,0.242616,0.961866,0.038134,0.757384,0.242616,0.452756,0.315934,0.879097,0.120903,0.602241,0.204482,0.256207
1,1,119,3537,130,333,452,3667,0.263274,0.964549,0.035451,0.736726,0.263274,0.477912,0.339515,0.887594,0.112406,0.613912,0.227823,0.283672
2,2,116,3515,131,357,473,3646,0.245243,0.96407,0.03593,0.754757,0.245243,0.469636,0.322222,0.881525,0.118475,0.604657,0.209313,0.264254
3,3,103,3556,125,335,438,3681,0.23516,0.966042,0.033958,0.76484,0.23516,0.451754,0.309309,0.888322,0.111678,0.600601,0.201202,0.255073
4,4,102,3548,121,348,450,3669,0.226667,0.967021,0.032979,0.773333,0.226667,0.457399,0.30312,0.886137,0.113863,0.596844,0.193688,0.248728
5,5,115,3518,152,334,449,3670,0.256125,0.958583,0.041417,0.743875,0.256125,0.430712,0.321229,0.88201,0.11799,0.607354,0.214708,0.261163
6,6,121,3511,143,344,465,3654,0.260215,0.960865,0.039135,0.739785,0.260215,0.458333,0.331962,0.881767,0.118233,0.61054,0.22108,0.272476
7,7,135,3495,129,360,495,3624,0.272727,0.964404,0.035596,0.727273,0.272727,0.511364,0.355731,0.881282,0.118718,0.618566,0.237131,0.296957
8,8,114,3515,127,362,476,3642,0.239496,0.965129,0.034871,0.760504,0.239496,0.473029,0.317992,0.881253,0.118747,0.602312,0.204625,0.260531
9,9,103,3513,137,365,468,3650,0.220085,0.962466,0.037534,0.779915,0.220085,0.429167,0.29096,0.878096,0.121904,0.591276,0.182551,0.231769


Average of each metric across the 10 folds

In [35]:
# Missing metric values are counted as 0 in the averages
# (presumably NaN from a zero denominator in a fold -- TODO confirm this is intended).
class_perf_df = class_perf_df.fillna(0)

# Column-wise mean over the folds. The fold number is not a metric, so its
# mean is dropped -- by label, so a column reorder cannot drop the wrong entry.
average_row = class_perf_df.mean(axis=0).drop('Index')

# One-row frame holding the averages, labelled with the method name.
# A plain scalar is inserted: DataFrame.insert documents scalar, Series or
# array-like values, and a scalar needs no index alignment.
df = average_row.to_frame().T
df.insert(0, 'Method', 'Random Forest')
df.squeeze()

Method    Random Forest
TP                114.3
TN               3521.4
FP                133.4
FN                349.7
P                 464.0
N                3654.8
TPR            0.246161
TNR            0.963499
FPR            0.036501
FNR            0.753839
r              0.246161
p              0.461206
F1             0.320797
Acc            0.882708
Err            0.117292
BACC            0.60483
TSS             0.20966
HSS            0.263083
Name: 0, dtype: object

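The Random Forest model achieves an average accuracy of 0.8827 across the 10 folds. Note that the classes are imbalanced (on average about 89% of the test samples in a fold are negative), so the low average TPR (0.2462) and balanced accuracy (0.6048) give a more informative picture of the model than accuracy alone.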