### Task-2
## K-Nearest Neighbors (KNN) Classifier - From Scratch

Let's kick off by importing the modules we need and writing a few helper functions.

In [112]:
import numpy as np
from statistics import multimode

def calculate_distance(a, b, p):
    """Return the distance between two feature vectors.

    Parameters:
        a, b : 1-D numeric vectors of the same length (NumPy arrays or plain
               sequences; both are converted to float arrays).
        p    : order of the Minkowski distance. ``1`` gives Manhattan, ``2``
               gives Euclidean, ``np.inf`` gives the L-infinity (Chebyshev)
               distance. Pass ``None`` to get the cosine distance instead.

    Returns:
        The distance as a NumPy float. For the cosine distance this is
        ``1 - cos(angle between a and b)``; it is NaN when either vector has
        zero norm, because the cosine is undefined there.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Cosine distance is selected by the absence of a p value.
    if p is None:
        norm_product = np.linalg.norm(a) * np.linalg.norm(b)
        return 1 - (a @ b) / norm_product

    abs_diff = np.abs(a - b)

    # The generic formula degenerates for p = inf (x**inf, then **0), so the
    # limit -- the largest coordinate-wise difference -- is computed directly.
    if np.isinf(p):
        return np.max(abs_diff)

    return np.sum(abs_diff ** p) ** (1 / p)

def find_mode(numbers):
    """Return the most common value in `numbers`.

    When several values share the highest count, one of them is picked
    at random.
    """
    # multimode() lists every value tied for the highest frequency;
    # np.random.choice() then settles the tie (or returns the sole mode).
    return np.random.choice(multimode(numbers))

Here comes our implementation of KNN

In [158]:
class KNN_Classifier():
    """
        A simple implementation of the KNN algorithm.

        parameters:
            k               : how many neighbours to consider for predicting the label (must be at least 1)
            distance_metric : on what basis to find the nearest neighbors. Currently supports "euclidean", "manhattan", "minkowski", "cosine".
            p               : p value required for minkowski distance. pass 'np.inf' for L-infinity norm
            is_weighted     : Set to True if you want to implement weighted KNN (each neighbour votes with weight 1/distance).

        Methods:
            train_test_split() : shuffles the data and splits it into train and test data according to the parameter train_data (which tells what proportion of the total data to be taken for training). Returns 4 arrays: X_train, y_train, X_test, y_test.
            fit()              : Stores the input data to predict label for new points
            predict()          : predicts the label for a set of given points
            predict_one()      : predicts the label for a single given point
            accuracy_checker() : Compares the predicted label with the actual labels to calculate the accuracy
    """

    def __init__(self, k=3, distance_metric="euclidean", p=None, is_weighted=False):
        """Validate the hyper-parameters and store them on the instance.

        Raises:
            ValueError: if `distance_metric` is not supported, if `k` is
                smaller than 1, or if "minkowski" is requested without a
                p value >= 1.
        """
        #checking if invalid parameter passed for distance_metric
        if distance_metric not in ("euclidean", "manhattan", "minkowski", "cosine"):
            raise ValueError('distance_metric can only take values "euclidean", "manhattan", "minkowski", "cosine"')

        #checking if valid p is supplied in case of minkowski
        if distance_metric == "minkowski" and (p is None or p < 1):
            raise ValueError("The value of p should be a nunmerical value greater than or equal to 1")

        #with k < 1 there would be no neighbours to vote
        if k < 1:
            raise ValueError("k should be at least 1")

        #everything's fine, let's proceed
        self.k = k
        self.distance_metric = distance_metric
        self.is_weighted = is_weighted

        #We will get the data when the fit function is called
        self.X = None
        self.y = None

        #assigning self.p: the order handed to calculate_distance
        #(None selects the cosine distance there)
        if distance_metric == "euclidean":
            self.p = 2
        elif distance_metric == "manhattan":
            self.p = 1
        elif distance_metric == "minkowski":
            self.p = p
        else:
            self.p = None
########################################################################

    def train_test_split(self, X, y, train_data=0.7):
        """Shuffle the samples and split them into a train and a test part.

        Parameters:
            X          : sequence of feature vectors
            y          : sequence of labels, same length as X
            train_data : proportion of the samples used for training, in (0,1].
                         The training part holds floor(len(X) * train_data) samples.

        Returns:
            X_train, y_train, X_test, y_test as NumPy arrays.

        Raises:
            ValueError: if X and y differ in length or train_data is outside (0,1].
        """
        #checking if dimensions of X and y match
        if len(X) != len(y):
            raise ValueError(f"The size of X ({len(X)}) does not match with that of y ({len(y)})")

        #checking if valid train_data parameter passed or not
        if train_data <= 0 or train_data > 1:
            raise ValueError(f"Invalid value for train_data. It should belong to (0,1]")

        X = np.asarray(X)
        y = np.asarray(y)

        #one shared permutation keeps every sample paired with its label
        indices = np.random.permutation(len(X))

        #the first train_data proportion of the shuffled indices is used
        #for training, the remainder for testing
        num_training_points = int(len(X) * train_data)
        train_indices = indices[:num_training_points]
        test_indices = indices[num_training_points:]

        return X[train_indices], y[train_indices], X[test_indices], y[test_indices]
########################################################################

    def fit(self, X, y):
        """Store the training samples; KNN does no work until prediction time.

        Raises:
            ValueError: if X and y differ in length.
        """
        #checking if dimensions of X and y match
        if len(X) != len(y):
            raise ValueError(f"The size of X ({len(X)}) does not match with that of y ({len(y)})")

        #stored as arrays so that labels can be indexed with numpy indices
        self.X = np.asarray(X)
        self.y = np.asarray(y)
########################################################################

    def predict(self, X_test):
        """Predict a label for every point in X_test and return them as a list."""
        return [self.predict_one(x_test) for x_test in X_test]
########################################################################

    def predict_one(self, x):
        """Predict the label of the single point x.

        If x coincides with a stored training point (distance 0), that
        point's label is returned directly. Otherwise the k nearest
        neighbours vote: each vote counts 1/distance when `is_weighted`
        is set, and counts equally (ties broken at random by find_mode)
        when it is not.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("fit() must be called before predicting")

        #first calculating the distances to every stored point
        distances = np.array([calculate_distance(x, x_data, self.p) for x_data in self.X])
        sorted_indices = np.argsort(distances)

        #if data point exists in X, return the same label
        #(this also keeps 1/distance below free of divisions by zero)
        nearest = sorted_indices[0]
        if distances[nearest] == 0:
            return self.y[nearest]

        k_nearest_indices = sorted_indices[:self.k]

        #weighted KNN: accumulate inverse-distance votes per label.
        #A dict is used so that any number and kind of labels is supported.
        if self.is_weighted:
            votes = {}
            for index in k_nearest_indices:
                label = self.y[index]
                votes[label] = votes.get(label, 0.0) + 1 / distances[index]
            return max(votes, key=votes.get)

        #normal KNN: plain majority vote
        labels = [self.y[index] for index in k_nearest_indices]
        return find_mode(labels)
########################################################################

    def accuracy_checker(self, y_predicted, y_true):
        """Return the fraction of predictions that equal the true labels.

        Raises:
            ValueError: if the two sequences differ in length or are empty.
        """
        if len(y_predicted) != len(y_true):
            raise ValueError("Size of given arrays do not match.")

        total = len(y_predicted)
        if total == 0:
            raise ValueError("Cannot compute the accuracy of empty arrays.")

        correct = sum(1 for predicted, true in zip(y_predicted, y_true) if predicted == true)

        return correct / total


Let's use this implementation and test it.

In [164]:
### given data
# Each row holds three numeric features followed by the fruit name.
data = [
    [150, 7.0, 1, 'Apple'],
    [120, 6.5, 0, 'Banana'],
    [180, 7.5, 2, 'Orange'],
    [155, 7.2, 1, 'Apple'],
    [110, 6.0, 0, 'Banana'],
    [190, 7.8, 2, 'Orange'],
    [145, 7.1, 1, 'Apple'],
    [115, 6.3, 0, 'Banana'],
]

# Build the classifier: weighted voting over the 4 nearest neighbours.
knn_classifier = KNN_Classifier(k=4, is_weighted=True)

# Separate every row into its feature part and its trailing label.
X = np.array([features for *features, _ in data])
labels = np.array([label for *_, label in data])

# Encode the labels as integers:
# Apple = 0, Banana = 1, anything else (Orange) = 2
label_codes = {"Apple": 0, "Banana": 1}
y = [label_codes.get(str(label), 2) for label in labels]

# Hand the training data to the model.
knn_classifier.fit(X, y)

# Points to classify, with the label we expect for each.
test_data = np.array([
    [118, 6.2, 0],  # Expected: Banana
    [160, 7.3, 1],  # Expected: Apple
    [185, 7.7, 2],  # Expected: Orange
])
y_test = np.array([1, 0, 2])

X_test = np.array(test_data)

# Predict and report.
y_predicted = knn_classifier.predict(X_test)
print(y_predicted)

accuracy = knn_classifier.accuracy_checker(y_predicted, y_test)
print(f"The accuracy in this case is {accuracy * 100}%.")

[np.int64(1), np.int64(0), np.int64(2)]
The accuracy in this case is 100.0%.


The output is Banana, Apple, Orange, which is what we expected. This implies that our implementation is working fine.

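On playing around with the value of $k$, we find that plain KNN only works reliably for $k \le 3$: the training set contains just two Orange samples, so for larger $k$ the neighbours from other classes can tie with or outvote them. Weighted KNN, however, keeps working for higher values of $k$, because distant neighbours contribute very little to the vote.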