## K - Nearest Neighbour Classification 

In [1]:
#Required Libraries
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

#Note: sklearn.cross_validation is deprecated; train_test_split is imported from sklearn.model_selection instead.

In [2]:
#DATA
#Folder that holds iris.csv. The default is the original author's location;
#set the IRIS_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get("IRIS_DATA_DIR", "C:/Users/Gourab/Documents")
#Setting the working directory
os.chdir(DATA_DIR)
#Reading the data
iris = pd.read_csv("iris.csv")
#Show 10 randomly chosen rows
iris.sample(10)

Unnamed: 0.1,Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
121,122,5.6,2.8,4.9,2.0,virginica
147,148,6.5,3.0,5.2,2.0,virginica
47,48,4.6,3.2,1.4,0.2,setosa
18,19,5.7,3.8,1.7,0.3,setosa
141,142,6.9,3.1,5.1,2.3,virginica
54,55,6.5,2.8,4.6,1.5,versicolor
117,118,7.7,3.8,6.7,2.2,virginica
42,43,4.4,3.2,1.3,0.2,setosa
46,47,5.1,3.8,1.6,0.2,setosa
27,28,5.2,3.5,1.5,0.2,setosa


In [3]:
#Predictors: the four flower measurements
feature_columns = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
X = iris[feature_columns]
#Target: the species label
y = iris['Species']

In [4]:
#Hold out 30% of the rows for testing. stratify=y splits in a stratified fashion
#using the species labels, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=100,
)

### Understanding 1-NN - Step by Step 

In [5]:
#First row of the test set: the single observation classified step by step below
test = X_test.iloc[0]

In [6]:
test

Sepal.Length    4.9
Sepal.Width     3.1
Petal.Length    1.5
Petal.Width     0.2
Name: 34, dtype: float64

In [None]:
#Find the nearest neighbour to test

In [None]:
#Euclidean distance between the test vector and the observation at row position 1 of the training data
np.sqrt(np.sum((test - X_train.iloc[1,])**2))

In [None]:
#Collect one (distance, row position) pair per training observation
dist = []

for i in range(len(X_train)):
    #Feature-by-feature difference between test and the ith training row
    difference = test - X_train.iloc[i]
    #Euclidean distance: square root of the summed squared differences
    distance = np.sqrt(np.sum(difference**2))
    dist.append((distance, i))
    

In [None]:
#This is what the 'dist' list looks like
dist[:10]

In [None]:
#Sort the list by distance
dist.sort()
dist[:10]

In [None]:
dist[0][1]

In [None]:
#Note that the nearest observation from test is the 87th obs
#Let's get the index

neighbour = dist[0][1]
neighbour

In [None]:
#Get the nearest obs
X_train.iloc[neighbour,]

In [None]:
#Our test observation
X_test.iloc[0,]

In [None]:
#Get the predicted value of target: the label of the nearest training observation.
#The row position comes from `neighbour` rather than a hard-coded number, so the
#cell stays correct if the data or the train/test split changes.
np.array(y_train)[neighbour]

In [None]:
#The actual value is
np.array(y_test)[0]

### Aside (Tuples)

In [None]:
#Similarities between tuples and strings
s = 'abc'
t = (1,2,3)

In [None]:
#Like strings, tuples are immutable: item assignment raises a TypeError.
#The error is caught and displayed so the notebook can still run top to bottom.
try:
    t[1] = 20
except TypeError as error:
    print("TypeError:", error)

In [None]:
s[:1]+'e'+s[2:]

In [None]:
t[:1]+ (20,) + t[2:]

In [None]:
#Single-element tuple (the trailing comma is what makes it a tuple)
(20,)
type((20,))

In [None]:
'abc' < 'pqr'

In [None]:
(1,2,3) < (0,100,200)

In [None]:
(1,2,3) < (1,2,0)

In [None]:
#OR THIS WAY (1NN Alternative Approach)
#Store only the distances; the position in the list doubles as the row position.
#dist1 is kept as a plain list so that list.index() can be used on it afterwards.
dist1 = []

for i in range(len(X_train)):
    #Squared feature-by-feature differences between test and the ith training row
    squared_diff = (test - X_train.iloc[i])**2
    distance = np.sqrt(np.sum(squared_diff))
    #Append the distance only (no index is stored here)
    dist1.append(distance)

In [None]:
min_dist = min(dist1)
min_dist

In [None]:
type(dist1)

In [None]:
dist1.index(min_dist)

In [None]:
#OR THIS WAY (the simplest way)
import math
#Start from +infinity so that any finite distance beats the initial value
min_dist = math.inf

for i in range(len(X_train)):
    #Distance of test from the ith observation in X_train
    distance = np.sqrt(np.sum((test - X_train.iloc[i])**2))

    #Skip rows that do not beat the best distance seen so far
    if not distance < min_dist:
        continue

    #New best: remember both the distance and where it was found
    min_dist = distance
    index = i

index

### Creating a function for 1NN for a single test observation

In [7]:
def predict_1nn(X_train, y_train, test):
    """Classify one observation with the 1-nearest-neighbour rule, printing each step.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per observation (rows are read with ``.iloc``).
    y_train : array-like
        Training labels, matched to the rows of ``X_train`` by position.
    test : pandas.Series or array-like
        The observation to classify.

    Returns
    -------
    The label of the training row with the smallest Euclidean distance to
    ``test``. When several rows are equally close, the one with the smallest
    row position wins.

    Raises
    ------
    IndexError
        If ``X_train`` has no rows.
    """
    dist = []

    for i in range(len(X_train)):
        #Euclidean distance between test and the ith training observation
        distance = np.sqrt(np.sum((test - X_train.iloc[i,])**2))
        #Keep the row position next to the distance so it survives the sort
        dist.append((distance, i))

    #Tuples are ordered by distance first and by row position second
    dist.sort()

    #Row position of the nearest neighbour and its label
    neighbour = dist[0][1]
    pred = np.array(y_train)[neighbour]

    #Print the test observation
    print("The test observation is:\n")
    print(test)
    print("\n")

    #Print the nearest neighbour
    print("The nearest observation is:\n")
    print(X_train.iloc[neighbour, ])

    #Print the predicted value of the target
    print("\n")
    print("The predicted value is:")
    print(pred)

    #Return the prediction as well, so callers can use it (as predict_knn does)
    return pred
        

In [8]:
predict_1nn(X_train, y_train, test)

The test observation is:

Sepal.Length    4.9
Sepal.Width     3.1
Petal.Length    1.5
Petal.Width     0.2
Name: 34, dtype: float64


The nearest observation is:

Sepal.Length    4.9
Sepal.Width     3.1
Petal.Length    1.5
Petal.Width     0.1
Name: 9, dtype: float64


The predicted value is:
setosa


### Extend the above function for 3-NN

In [9]:
def predict_knn(X_train, y_train, test, k=3):
    """Classify one observation by a vote of its k nearest neighbours, printing each step.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per observation (rows are read with ``.iloc``).
    y_train : array-like
        Training labels, matched to the rows of ``X_train`` by position.
    test : pandas.Series or array-like
        The observation to classify.
    k : int, default 3
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    The label with the most votes among the k nearest training rows. If several
    labels share the highest vote count, the largest label (in sort order) wins.

    Raises
    ------
    IndexError
        If ``k`` is larger than the number of training rows, or smaller than 1.
    """
    #Convert the labels once; they are looked up by row position below
    labels = np.array(y_train)

    dist = []
    for i in range(len(X_train)):
        #Euclidean distance between test and the ith training observation
        distance = np.sqrt(np.sum((test - X_train.iloc[i,])**2))
        #Keep the row position next to the distance so it survives the sort
        dist.append((distance, i))

    #Tuples are ordered by distance first and by row position second
    dist.sort()

    #Row positions of the k nearest neighbours, closest first
    neighbours = [dist[j][1] for j in range(k)]

    #Print the test observation
    print("The test observation is:\n")
    print(test)
    print("\n")

    #Print each neighbour with its actual label and collect the labels
    target = []
    print("The nearest observations are:\n")
    for row in neighbours:
        print(X_train.iloc[row, ])
        print("Actual Target: ", labels[row])
        print("\n")
        target.append(labels[row])

    #Count the votes per label in a dictionary
    count = {}
    for label in target:
        count[label] = count.get(label, 0) + 1

    #(votes, label) pairs sorted so the most frequent label comes first
    pair = [(votes, label) for label, votes in count.items()]
    pair.sort(reverse=True)
    pred = pair[0][1]

    print("The predicted value is:", pred)
    print(pair)
    return pred
    

In [11]:
predict_knn(X_train, y_train, X_test.iloc[3,], k=3)

The test observation is:

Sepal.Length    5.6
Sepal.Width     2.7
Petal.Length    4.2
Petal.Width     1.3
Name: 94, dtype: float64


The nearest observations are:

Sepal.Length    5.7
Sepal.Width     2.9
Petal.Length    4.2
Petal.Width     1.3
Name: 96, dtype: float64
Actual Target:  versicolor


Sepal.Length    5.5
Sepal.Width     2.6
Petal.Length    4.4
Petal.Width     1.2
Name: 90, dtype: float64
Actual Target:  versicolor


Sepal.Length    5.8
Sepal.Width     2.6
Petal.Length    4.0
Petal.Width     1.2
Name: 92, dtype: float64
Actual Target:  versicolor


The predicted value is: versicolor
[(3, 'versicolor')]


'versicolor'

### Modify the algorithm for kNN prediction only

In [12]:
def NN(X_train, y_train, test, k):
    """Predict the label of one observation by a vote of its k nearest neighbours.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per observation.
    y_train : array-like
        Training labels, matched to the rows of ``X_train`` by position.
    test : pandas.Series or array-like
        The observation to classify. A Series is matched to the columns of
        ``X_train`` by its index labels.
    k : int
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    The label with the most votes among the k nearest training rows. Equal
    distances are resolved in favour of the earlier training row; if several
    labels share the highest vote count, the largest label (in sort order) wins.

    Raises
    ------
    IndexError
        If ``k`` is smaller than 1 or larger than the number of training rows.
    """
    #Slicing would silently accept a bad k, so reject it explicitly
    if not 1 <= k <= len(X_train):
        raise IndexError("k must be between 1 and the number of training rows")

    #Euclidean distance from test to every training row in one vectorised pass
    squared_diff = (X_train - test) ** 2
    distances = np.sqrt(squared_diff.sum(axis=1)).to_numpy()

    #A stable sort keeps equally distant rows in their original order
    neighbours = np.argsort(distances, kind="stable")[:k]

    #Convert the labels once and pick the neighbours' labels by row position
    labels = np.array(y_train)

    #Count the votes per label in a dictionary
    count = {}
    for label in labels[neighbours]:
        count[label] = count.get(label, 0) + 1

    #Most votes wins; a tie in votes goes to the largest label
    pred = max(count.items(), key=lambda item: (item[1], item[0]))[0]

    return pred
    

In [13]:
NN(X_train, y_train, test, k=5)

'setosa'

### Modify the function to predict multiple test observations

In [14]:
def KNN_Classifier(X_train, y_train, X_test, k=3):
    """Predict a label for every row of X_test with the k-nearest-neighbour rule.

    Each row of ``X_test`` is taken by position and handed to ``NN`` together
    with the training data and ``k`` (default 3).

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in row order.
    """
    return [NN(X_train, y_train, X_test.iloc[row], k) for row in range(len(X_test))]

In [15]:
pred1 = KNN_Classifier(X_train, y_train, X_test, k=10)

In [16]:
pred1

['setosa',
 'versicolor',
 'virginica',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'virginica',
 'virginica',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'setosa',
 'virginica',
 'versicolor',
 'setosa',
 'versicolor',
 'virginica',
 'virginica',
 'virginica',
 'setosa',
 'virginica',
 'virginica',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'virginica']

### Evaluating our Algorithm

In [None]:
from sklearn.metrics import confusion_matrix

c = confusion_matrix(y_test, pred1)
c

In [None]:
#NOTE(review): `pred` is not defined at notebook level in any visible cell, so this
#cell failed with a NameError on a fresh kernel. scikit-learn's own k-NN with the
#same k as pred1 (k=10) is used as the reference here - TODO confirm this is the
#comparison that was intended.
from sklearn.neighbors import KNeighborsClassifier

pred = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train).predict(X_test)
confusion_matrix(pred, pred1)

### Appendix

#### Checking the speed of the approaches

In [None]:
import time

In [None]:
#APPROACH 1: collect (distance, index) pairs, sort them and take the first

#perf_counter is the clock intended for measuring short durations;
#time.time() can be too coarse for a loop this quick
time1 = time.perf_counter()

dist = []

for i in range(len(X_train)):
    #Calculate the distance of test from the ith observation in X_train
    distance = np.sqrt(np.sum((test - X_train.iloc[i,])**2))
    #Append the distance and the index as a tuple in the list 'dist'
    dist.append((distance, i))
dist.sort()
neighbour = dist[0][1]
print(neighbour)

time2 = time.perf_counter()
#Elapsed seconds converted to milliseconds
print("\nTime taken = ", (time2-time1)*1000)

In [None]:
#APPROACH 2: keep only the distances, then look up the position of the smallest

#perf_counter is the clock intended for measuring short durations;
#time.time() can be too coarse for a loop this quick
time1 = time.perf_counter()

dist1 = []

for i in range(len(X_train)):
    #Calculate the distance of test from the ith observation in X_train
    distance = np.sqrt(np.sum((test - X_train.iloc[i,])**2))
    #Append the distance only; its position in the list is the row position
    dist1.append(distance)

min_dist = min(dist1)
print(dist1.index(min_dist))

time2 = time.perf_counter()
#Elapsed seconds converted to milliseconds
print("\nTime taken = ", (time2-time1)*1000)

In [None]:
import math

In [None]:
#APPROACH 3: track the smallest distance seen so far in a single pass

#perf_counter is the clock intended for measuring short durations;
#time.time() can be too coarse for a loop this quick
time1 = time.perf_counter()
#Start from +infinity so that any finite distance beats the initial value
min_dist = math.inf

for i in range(len(X_train)):
    #Calculate the distance of test from the ith observation in X_train
    distance = np.sqrt(np.sum((test - X_train.iloc[i,])**2))

    if distance < min_dist:
        #New best: remember both the distance and where it was found
        min_dist = distance
        index = i

print(index)
time2 = time.perf_counter()
#Elapsed seconds converted to milliseconds
print("\nTime taken = ", (time2-time1)*1000)