In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('./dataset/cleandata.csv', index_col=[0])

In [3]:
x = df.drop('income', axis=1)
y = df['income']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [5]:
# scale the training and testing data

scaler = StandardScaler() 
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Baseline Model

In [6]:
# k=1, we have searched for the optimum value of k, for details please go to part2, model comparison

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)

In [8]:
print(f"{classification_report(y_test, knn_pred)}")

              precision    recall  f1-score   support

           0       0.92      0.84      0.88     11155
           1       0.85      0.93      0.89     11138

    accuracy                           0.88     22293
   macro avg       0.89      0.88      0.88     22293
weighted avg       0.89      0.88      0.88     22293



In [18]:
base_acc = accuracy_score(y_test, knn_pred)
print(base_acc)

0.8861526039563988


## Apply Grid Search on KNN

In [14]:
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

for i in weights:
    for j in metric:
        knn = KNeighborsClassifier(n_neighbors=1, weights=i, metric=j)
        knn.fit(x_train, y_train)
        knn_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, knn_pred)
        print('\n', i,'+', j)
        print(acc)


 uniform + euclidean
0.8845377472749294

 uniform + manhattan
0.8861526039563988

 uniform + minkowski
0.8845377472749294

 distance + euclidean
0.8845377472749294

 distance + manhattan
0.8861526039563988

 distance + minkowski
0.8845377472749294


In [None]:
# best params = {n_neignbors=1, weights='uniform' , metric='manhattan'}
# same as       {n_neignbors=1, weights='distance' , metric='manhattan'}   

## KNN with best params

In [21]:
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', metric='manhattan')
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)

In [22]:
print(f"{classification_report(y_test, knn_pred)}")

              precision    recall  f1-score   support

           0       0.92      0.85      0.88     11155
           1       0.86      0.93      0.89     11138

    accuracy                           0.89     22293
   macro avg       0.89      0.89      0.89     22293
weighted avg       0.89      0.89      0.89     22293



In [23]:
bestparams_acc = accuracy_score(y_test, knn_pred)
print(bestparams_acc)

0.8861526039563988


### improvement:

In [28]:
improvement = (bestparams_acc - base_acc)/base_acc * 100
print(improvement, '%')

0.0 %


In [None]:
# no improvement shown

# In KNN model, the most important parameter is the value of k,
# since we have already searched for the optimum value of k and applied it in the baseline model
# the other params that used in grid search don't help in further enhancing the accuracy