In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Load the bank-marketing CSV; fields in this file are separated by ';'.
df = pd.read_csv(
    '../input/bank-marketing/bank-additional-full.csv',
    sep=';',
)

In [None]:
# Keep only the columns used downstream: the candidate features plus the target 'y'.
selected_columns = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]

# .copy() turns the subset into an independent frame, so assigning to its columns
# later does not raise pandas' SettingWithCopyWarning.
df = df[selected_columns].copy()
df.info()

In [None]:
### Encoding Categorical Features
# Every object-dtype column is replaced by the integer codes produced by
# LabelEncoder. A single encoder instance is refit for each column, so after
# the loop `le` only holds the classes of the last column processed.

le = preprocessing.LabelEncoder()
objfeatures = df.select_dtypes(include="object").columns

for feat in objfeatures:
    # Cast to str first so the encoder always sees plain string labels.
    text_values = df[feat].astype(str)
    df[feat] = le.fit_transform(text_values)

In [None]:
### Normalize Data
# Features are every column except the target 'y'. The `columns=` keyword is
# used because passing the axis positionally (df.drop('y', 1)) was deprecated
# in pandas 1.3 and raises a TypeError in pandas 2.0.
X = df.drop(columns='y')
y = df['y']

# Standardize the features with StandardScaler (per-column centering and scaling).
# NOTE(review): the scaler is fit on all rows here; if a hold-out split is made
# afterwards, the held-out rows have influenced the scaling statistics —
# consider fitting on the training rows only.
X = preprocessing.StandardScaler().fit_transform(X.astype(int))

In [None]:
### train/test split and train
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Test-set error rate of distance-weighted KNN for every K in 1..39.
# No `p` is passed, so the classifier's default distance (Euclidean, per the
# plot title) is used.
k_values = range(1, 40)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(weights='distance', n_neighbors=i).fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(k_values,error_rate,color='black', linestyle='dashed', 
         marker='o',markerfacecolor='yellow', markersize=6)
plt.title('Error Rate vs. K Value with Distance Metric: Euclidean')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to K=1, so translate the list position back through
# k_values instead of printing the raw 0-based index (which was off by one).
min_error = min(error_rate)
best_k = k_values[error_rate.index(min_error)]
print("Minimum error:",min_error,"at K =",best_k)

In [None]:
# Test-set accuracy of distance-weighted KNN for every K in 1..39.
# No `p` is passed, so the classifier's default distance (Euclidean, per the
# plot title) is used. `metrics` comes from the notebook's import cell.
# Will take some time
k_values = range(1, 40)
acc = []
for i in k_values:
    neigh = KNeighborsClassifier(weights='distance', n_neighbors=i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))

plt.figure(figsize=(10,6))
plt.plot(k_values,acc,color='black', linestyle='dashed', 
         marker='o',markerfacecolor='yellow', markersize=6)
plt.title('Accuracy vs. K Value with Distance Metric: Euclidean')
plt.xlabel('K')
plt.ylabel('Accuracy')

# acc[0] belongs to K=1, so translate the list position back through k_values
# instead of printing the raw 0-based index (which was off by one).
max_acc = max(acc)
best_k = k_values[acc.index(max_acc)]
print("Maximum accuracy:-",max_acc,"at K =",best_k)

In [None]:
# Test-set error rate of distance-weighted KNN for every K in 1..39,
# using p=1 (Manhattan distance, per the plot title).
k_values = range(1, 40)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(weights='distance', n_neighbors=i, p=1).fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(k_values,error_rate,color='black', linestyle='dashed', 
         marker='o',markerfacecolor='yellow', markersize=6)
plt.title('Error Rate vs. K Value with Distance Metric: Manhattan')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to K=1, so translate the list position back through
# k_values instead of printing the raw 0-based index (which was off by one).
min_error = min(error_rate)
best_k = k_values[error_rate.index(min_error)]
print("Minimum error:",min_error,"at K =",best_k)

In [None]:
# Test-set accuracy of distance-weighted KNN for every K in 1..39,
# using p=1 (Manhattan distance, per the plot title).
k_values = range(1, 40)
acc = []
for i in k_values:
    neigh = KNeighborsClassifier(weights='distance', n_neighbors=i, p=1).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))

plt.figure(figsize=(10,6))
plt.plot(k_values,acc,color='black', linestyle='dashed', 
         marker='o',markerfacecolor='yellow', markersize=6)
plt.title('Accuracy vs. K Value with Distance Metric: Manhattan')
plt.xlabel('K')
plt.ylabel('Accuracy')

# acc[0] belongs to K=1, so translate the list position back through k_values
# instead of printing the raw 0-based index (which was off by one).
max_acc = max(acc)
best_k = k_values[acc.index(max_acc)]
print("Maximum accuracy:-",max_acc,"at K =",best_k)