# KNN

- [Source](https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/knn.ipynb)

Algorithm:
1. Choose a value of k (the number of NN to consider)
    - Small value of k --> more flexible decision boundary --> overfitting
    - Large value of k --> smoother decision boundary --> less prone to overfitting (but a k that is too large can underfit)
2. For each data point in the test set, compute its distance (Euclidean, Manhattan, or Minkowski) to all data points in the training set.
3. Select the k-NN based on their distances.
4. Assign the class label that appears most frequently among the k NN to the test point.

KNN can also be used for regression tasks, where the goal is to predict a continuous value instead of a class label. The predicted value for the test point is the average of the values of its k NN in the training set.

In [9]:
from collections import Counter
import numpy as np

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Every test point is compared against all stored training points and is
    assigned the label that occurs most often among its ``k`` closest ones.

    Args:
        k: Number of nearest neighbours that vote on each prediction.
        distance: Metric name: ``'euclidean'``, ``'manhattan'`` or
            ``'minkowski'``. Any other value falls back to Euclidean.
        p: Order of the Minkowski metric; only read when ``distance`` is
            ``'minkowski'``. ``p=1`` matches Manhattan, ``p=2`` Euclidean.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``p`` is not positive.
    """

    def __init__(self, k=3, distance='euclidean', p=2):
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        if p <= 0:
            raise ValueError(f"p must be positive, got {p}")
        self.k = k
        # Holds the metric *name*; the attribute name is kept as-is so code
        # that already reads ``knn.distances`` keeps working.
        self.distances = distance
        self.p = p

    def fit(self, X_train, y_train):
        """Store the training samples and labels; no model is actually fitted.

        Args:
            X_train: Training samples, one row per sample.
            y_train: Label of each training sample, aligned with ``X_train``.
        """
        # Coerce to arrays so list inputs support broadcasting and the
        # fancy indexing used in ``predict``.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def _compute_distances(self, x):
        """Return the distance from ``x`` to every stored training sample."""
        diff = self.X_train - x
        if self.distances == 'minkowski':
            return np.power(np.sum(np.power(np.abs(diff), self.p), axis=1), 1 / self.p)
        if self.distances == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        # Euclidean is also the fallback for unrecognised metric names.
        return np.linalg.norm(diff, axis=1)

    def predict(self, X_test):
        """Predict a label for each row of ``X_test`` by majority vote.

        Args:
            X_test: Samples to classify, one row per sample.

        Returns:
            A NumPy array holding one predicted label per test sample.
        """
        y_pred = []

        for x in np.asarray(X_test):
            distances = self._compute_distances(x)

            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = self.y_train[nearest_indices]

            # Assign the class label that appears most frequently among the
            # k NN. Counter orders equal counts by first occurrence, so a tie
            # goes to the label whose neighbour comes first in the sorted order.
            label = Counter(nearest_labels).most_common(1)[0][0]
            y_pred.append(label)

        return np.array(y_pred)

In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fetch the iris dataset and separate the feature matrix from the targets.
iris = load_iris()
features, targets = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Build a 5-nearest-neighbour classifier that uses the Euclidean metric and
# hand it the training split.
knn = KNN(k=5, distance='euclidean')
knn.fit(X_train, y_train)

# Classify the held-out samples, then report the fraction predicted correctly.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
