#  KNN CLASSIFIER

In [1]:
import numpy as np
from statistics import mode

class K_Nearest_Neighbours:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    KNN is a lazy learner: ``fit`` only stores the training data, and all the
    work happens in ``predict``, which compares every query point against
    every stored sample.

    Parameters
    ----------
    k : int, default 3
        Number of nearest neighbours that vote on the label of a query point.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        # Fail early: with k < 1 there would be no neighbours to vote and
        # prediction would only fail later inside ``mode``.
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def euclidian_distance(self, p, q):
        """Return the Euclidean distance between points ``p`` and ``q``.

        Both arguments may be any array-like of numbers (lists, tuples or
        NumPy arrays) of the same shape.
        """
        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        return np.sqrt(np.sum((p - q) ** 2))

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples. A 1-D input is treated as ``n_samples`` samples
            with a single feature each.
        y : array-like of shape (n_samples,)
            Label of each training sample.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim == 1:
            # Keep one row per sample so distances can be reduced per row.
            X = X.reshape(-1, 1)
        if len(X) != len(y):
            raise ValueError(
                "X and y must contain the same number of samples "
                f"(got {len(X)} and {len(y)})"
            )
        # It's KNN, so the data only needs to be stored, not trained on.
        self.x_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query samples.

        Returns
        -------
        numpy.ndarray
            Predicted label for each query sample, in input order.
        """
        pred_labels = [self._predict(x) for x in np.asarray(X, dtype=float)]
        return np.array(pred_labels)

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        # Distance from x to every training sample in one vectorized step
        # (broadcasting) instead of a Python-level loop over the rows.
        distances = np.sqrt(np.sum((self.x_train - x) ** 2, axis=1))

        # argsort gives the indices that would sort the distances; a stable
        # sort keeps equally distant samples in training order, so the result
        # is deterministic.
        k_index = np.argsort(distances, kind="stable")[: self.k]

        # Majority vote. On Python 3.8+ ``mode`` resolves ties by returning
        # the first label encountered, i.e. the tied label whose closest
        # representative is nearest to x.
        return mode(self.y_train[k_index])
        
        

In [2]:
# Load the Iris dataset that ships with scikit-learn; the returned object
# exposes the feature matrix under 'data' and the class labels under 'target'.
from sklearn.datasets import load_iris
iris = load_iris()

In [3]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
# Feature matrix: one row per flower, one column per measurement.
X = iris['data']

In [5]:
# Class label of each row in X.
y = iris['target']

In [6]:
X.shape

(150, 4)

In [7]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the reported accuracies) reproducible.
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=32)

In [8]:
Xtrain.shape

(105, 4)

In [9]:
ytrain.shape

(105,)

In [10]:
# Build the hand-written classifier with 5 neighbours and hand it the
# training split ("fitting" only stores the data).
my_knn = K_Nearest_Neighbours(k=5)
my_knn.fit(Xtrain, ytrain)

In [11]:
# Predict a label for every held-out sample with the hand-written classifier.
my_knn_prediction = my_knn.predict(Xtest)

In [12]:
my_knn_prediction

array([1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 1, 2, 1, 1, 2, 2, 1, 2, 1, 0, 0, 2,
       2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 2, 0, 2,
       0])

In [13]:
# Reference implementation from scikit-learn, using the same number of
# neighbours (5) as the hand-written classifier so the results are comparable.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [14]:
# Fit the scikit-learn classifier on the same training split.
knn.fit(Xtrain, ytrain)

KNeighborsClassifier()

In [15]:
# Predict the same held-out samples with the scikit-learn classifier.
prediction = knn.predict(Xtest)

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
# Fraction of held-out samples the hand-written classifier labelled correctly.
print("MY KNN MODEL ACCURACY SCORE :-",accuracy_score(ytest, my_knn_prediction))

MY KNN MODEL ACCURACY SCORE :- 1.0


In [18]:
# Same metric for the scikit-learn classifier, for comparison.
print("SKLEARN KNN ACCURACY SCORE :-",accuracy_score(ytest, prediction))

SKLEARN KNN ACCURACY SCORE :- 1.0


<u><b>ASSUMPTION</b></u>

<br>
The KNN algorithm assumes that similar things exist in close proximity. In other words, similar things are near to each other.
<br>

<u><b>Advantages:-</b></u><br>
1. No Training Period - KNN has no explicit training phase: the stored data itself is the model and serves as the reference for future predictions. This makes it very quick to set up a model on whatever data is available.

<br>

2. Easy Implementation - KNN is very easy to implement, as the only thing that has to be calculated is the distance between points based on their feature values, and this distance can easily be computed with a standard distance formula such as the Euclidean or Manhattan distance.

<br>

3. Because there is no training period, new data can be added at any time without having to retrain the model.

<u><b>Disadvantages:-</b></u><br>
1. Does not work well with large datasets, as calculating the distance from each query point to every stored data instance is very costly.

<br>

2. Does not work well with high-dimensional data: every additional dimension adds to the cost of each distance calculation, and distances become less informative as the number of dimensions grows (the "curse of dimensionality").

<br>

3. Sensitive to noisy and missing data

<br>

4. Feature Scaling - The data in all dimensions should be scaled (normalized or standardized) properly; otherwise features with larger numeric ranges dominate the distance.