### Part 2: KNN implementation

In [22]:
import pandas as pd
import numpy as np

import matplotlib as plt

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [23]:
# Read the semicolon-delimited bank marketing CSV and print a column summary.
csv_path = 'data/bank-full.csv'
bank_data = pd.read_csv(csv_path, delimiter=';')
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Restricting the dataset to the features used in the model, plus the target column `y`

In [24]:
# Keep only the modelling features; the target column 'y' comes last.
selected_columns = [
    'age',
    'marital',
    'default',
    'housing',
    'loan',
    'cons.price.idx',
    'cons.conf.idx',
    'y',
]
bank_data = bank_data.loc[:, selected_columns]

Mapping the categorical feature columns and the target column `y` to numerical values

In [25]:
# 'default', 'housing' and 'loan' share one encoding: 'unknown' is treated as 'no'.
yes_no_codes = {'no': 0, 'yes': 1, 'unknown': 0}
for binary_col in ('default', 'housing', 'loan'):
    bank_data[binary_col] = bank_data[binary_col].map(yes_no_codes)

# 'unknown' marital status is treated as 'single'.
marital_codes = {'divorced': 0, 'single': 1, 'married': 2, 'unknown': 1}
bank_data['marital'] = bank_data['marital'].map(marital_codes)

# Target column: 'yes' -> 1, 'no' -> 0.
target_codes = {'no': 0, 'yes': 1}
bank_data['y'] = bank_data['y'].map(target_codes)

In [26]:
# Shuffle the rows once so the unshuffled KFold split below does not follow the
# file's original row order. A fixed random_state makes the shuffle, and
# therefore every fold and metric, reproducible across kernel restarts.
bank_data = bank_data.sample(frac=1, random_state=42).reset_index(drop=True)
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   marital         41188 non-null  int64  
 2   default         41188 non-null  int64  
 3   housing         41188 non-null  int64  
 4   loan            41188 non-null  int64  
 5   cons.price.idx  41188 non-null  float64
 6   cons.conf.idx   41188 non-null  float64
 7   y               41188 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 2.5 MB


In [27]:
# Separate the feature matrix from the target column.
x = bank_data.drop('y', axis=1)
y = bank_data['y']

# Rescale every feature to [0, 1] so that no single column dominates the
# distance computation used by KNN.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each test fold's min/max influence the training features. Fitting the scaler
# inside each fold would avoid this leakage.
MinMaxScaler = preprocessing.MinMaxScaler()
X = MinMaxScaler.fit_transform(x)
# Take the labels and index from `x` instead of repeating the column list, so
# the scaled frame stays correctly labelled if the selected features change.
X = pd.DataFrame(X, columns=x.columns, index=x.index)

In [28]:
# 10-fold cross-validation of a KNN classifier with default settings.
# For each fold, the confusion-matrix counts and the derived metrics are
# appended to `classifier_performance` as one row.
kf = KFold(n_splits=10)
knn = KNeighborsClassifier()
classifier_performance = []

for i, (train_indices, test_indices) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    # Pin the label order so the matrix is always 2x2, even when a fold's
    # true/predicted labels contain only one class; otherwise the 4-way
    # unpacking below raises ValueError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # With labels [0, 1], ravel() yields the counts in the order TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()

    # Class totals in the test fold.
    P = TP + FN
    N = TN + FP

    # Rates. A zero denominator yields NaN (with a NumPy warning); the
    # averaging cell replaces NaN with 0.
    TPR = TP/P
    TNR = TN/N
    FPR = FP/N
    FNR = FN/P

    # Recall is the same quantity as TPR; precision is over predicted positives.
    r = TPR
    p = TP/(TP+FP)
    F1 = 2*(p*r)/(p+r)

    Acc = (TP+TN)/(P+N)
    Err = (FP+FN)/(P+N)
    BACC = (TPR + TNR)/2
    # True skill statistic and Heidke skill score.
    TSS = TP/(TP+FN) - FP/(FP+TN)
    HSS = 2*(TP*TN - FP*FN) / ((TP+FN)*(FN+TN) + (TP+FP)*(FP+TN))
    classifier_performance.append([i, TP, TN, FP, FN, P, N, TPR, TNR, FPR, FNR, r, p, F1, Acc, Err, BACC, TSS, HSS])

In [29]:
# Column labels, in the same order as the values appended for each fold.
columns = (
    ['Index']
    + ['TP', 'TN', 'FP', 'FN', 'P', 'N']
    + ['TPR', 'TNR', 'FPR', 'FNR']
    + ['r', 'p', 'F1', 'Acc', 'Err', 'BACC', 'TSS', 'HSS']
)
# One row per cross-validation fold.
class_perf_df = pd.DataFrame(data=classifier_performance, columns=columns)
class_perf_df

Unnamed: 0,Index,TP,TN,FP,FN,P,N,TPR,TNR,FPR,FNR,r,p,F1,Acc,Err,BACC,TSS,HSS
0,0,78,3555,113,373,451,3668,0.172949,0.969193,0.030807,0.827051,0.172949,0.408377,0.242991,0.88201,0.11799,0.571071,0.142142,0.190235
1,1,101,3560,112,346,447,3672,0.225951,0.969499,0.030501,0.774049,0.225951,0.474178,0.306061,0.888808,0.111192,0.597725,0.19545,0.253792
2,2,120,3504,128,367,487,3632,0.246407,0.964758,0.035242,0.753593,0.246407,0.483871,0.326531,0.879825,0.120175,0.605582,0.211164,0.268138
3,3,112,3514,137,356,468,3651,0.239316,0.962476,0.037524,0.760684,0.239316,0.449799,0.312413,0.880311,0.119689,0.600896,0.201792,0.253502
4,4,108,3540,117,354,462,3657,0.233766,0.968007,0.031993,0.766234,0.233766,0.48,0.31441,0.885652,0.114348,0.600886,0.201773,0.260047
5,5,102,3532,117,368,470,3649,0.217021,0.967936,0.032064,0.782979,0.217021,0.465753,0.296081,0.882253,0.117747,0.592479,0.184958,0.241028
6,6,103,3531,123,362,465,3654,0.221505,0.966338,0.033662,0.778495,0.221505,0.455752,0.298119,0.882253,0.117747,0.593922,0.187844,0.242156
7,7,96,3542,105,376,472,3647,0.20339,0.971209,0.028791,0.79661,0.20339,0.477612,0.28529,0.883224,0.116776,0.5873,0.174599,0.232775
8,8,109,3557,115,337,446,3672,0.244395,0.968682,0.031318,0.755605,0.244395,0.486607,0.325373,0.890238,0.109762,0.606538,0.213077,0.272703
9,9,108,3513,133,364,472,3646,0.228814,0.963522,0.036478,0.771186,0.228814,0.448133,0.302945,0.87931,0.12069,0.596168,0.192335,0.244398


Average metrics across all folds

In [30]:
# Replace NaN metrics with 0 so that they count towards the average.
class_perf_df = class_perf_df.fillna(0)

# Column-wise mean over the folds, without the leading fold-number column.
fold_means = class_perf_df.mean(axis=0)
average_row = fold_means.drop(labels=[fold_means.index[0]])

# Turn the averages into a one-row frame and put the method name first.
df = pd.DataFrame(average_row).transpose()
method = pd.DataFrame([{'Method': 'KNN'}])
df.insert(loc=0, column='Method', value=method)
df.squeeze()

Method         KNN
TP           103.7
TN          3534.8
FP           120.0
FN           360.3
P            464.0
N           3654.8
TPR       0.223351
TNR       0.967162
FPR       0.032838
FNR       0.776649
r         0.223351
p         0.463008
F1        0.301021
Acc       0.883388
Err       0.116612
BACC      0.595257
TSS       0.190513
HSS       0.245877
Name: 0, dtype: object

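The KNN method has an average accuracy of 0.8834. Note that the classes are heavily imbalanced (on average about 3655 negative vs. 464 positive samples per test fold), so accuracy alone is misleading: the average recall (TPR) is only 0.2234 and the balanced accuracy is 0.5953.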