In [1]:
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Inheriting BaseEstimator & TransformerMixin to make the class compatible with sklearn components
class KNNClassifier(BaseEstimator, TransformerMixin): 
    '''
    KNN classifier built from scratch
    
    Params:
     k (int): number of clusters
    '''
    def __init__(self, k: int=3):
        self.k = k
        self.x = None
        self.y = None
    
    
    @staticmethod
    def __euclidean_distance(x1, x2):
        '''
        Compute euclidean distance between arrays x1 and x2
        '''
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=1))

    
    def fit(self, x: np.array, y: np.array):
        '''
        Stores x and y
        
        Params:
         x (numpy.array): Input features
         y (numpy.array): labels, strictly [0, n-1] (both inclusive)
        '''
        self.x = x
        self.y = y
        return self
    
    
    def predict(self, x: np.array):
        '''
        Prediction function
        
        Params:
         x (numpy.array): Input features for prediction
         
        Returns: 
         numpy.array: predicted labels
        
        '''
        # 1. computing euclidean distance between each sample in test set and training set
        # 2. Arg sorting the distances such that sample indices with lowest distance come first
        # 3. Selecting top 'k' sample indices
        # 4. Selecting labels of top 'k' sample indices
        # 5. sorting the labels such that in case of a tie, the first prediction is returned based on alphabetical order of label as in sklearn
        # 6. Using Counter to get the counts of labels
        # 7. Finding the most common label and extracting it
        pred = [Counter(sorted(self.y[np.argsort(self.__euclidean_distance(sample, self.x))[:self.k]])).most_common(1)[0][0] for sample in x]
        return pred
    
    
    def __repr__(self):
        return f"KNNClassifier(k={self.k})"

In [3]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
data = load_iris()

In [5]:
x = data['data']
y = data['target']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
model_pipeline = Pipeline(steps=[['minmax_scaler', MinMaxScaler()],
                                  ['knn', KNNClassifier()]])

In [8]:
param_grid = {'knn__k': [*range(1, 16, 2)]}

In [9]:
grid_search = GridSearchCV(estimator=model_pipeline, 
                           param_grid=param_grid,
                           scoring="accuracy",
                           cv=3)

In [10]:
grid_search.fit(x_train, y_train)

In [11]:
grid_search.best_params_

{'knn__k': 5}

In [12]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__k,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000992,0.001403,0.002306,0.000464,1,{'knn__k': 1},0.925,0.925,0.975,0.941667,0.02357,8
1,0.000535,0.000422,0.000321,0.000454,3,{'knn__k': 3},0.95,0.95,0.975,0.958333,0.011785,3
2,0.001958,0.002104,0.001331,0.000445,5,{'knn__k': 5},0.975,0.95,1.0,0.975,0.020412,1
3,0.0,0.0,0.000525,0.000742,7,{'knn__k': 7},0.925,0.95,1.0,0.958333,0.03118,3
4,0.000341,0.000483,0.003703,0.004568,9,{'knn__k': 9},0.925,0.95,0.975,0.95,0.020412,5
5,0.000666,0.000471,0.000994,3e-06,11,{'knn__k': 11},0.925,0.95,0.975,0.95,0.020412,5
6,0.0,0.0,0.00161,0.000437,13,{'knn__k': 13},0.925,0.95,0.975,0.95,0.020412,5
7,0.000691,0.00049,0.000997,6e-06,15,{'knn__k': 15},0.95,0.975,1.0,0.975,0.020412,1


In [13]:
pred = grid_search.predict(x_test)
print(pred)

[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 1, 0, 2, 0]


In [14]:
print([*y_test])

[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]


In [15]:
# Test Accuracy
accuracy_score(y_test, pred)

0.9666666666666667