# Classification: K Nearest Neighbours

#### Classification with Iris Dataset

The KNN Classifier labels a data point by observing its distance relative to the points present in the existing dataset

In [4]:
import tensorflow as tf
import numpy as np
from sklearn import datasets

In [147]:
k = 5 # No. of clusters

#### load and preprocess dataset


In [169]:
iris = datasets.load_iris()
x = iris.data
y = iris.target

In [170]:
x.shape

(150, 4)

In [171]:
flower_labels = ['iris setosa', 'iris virginica', 'iris versicolor']


In [172]:
# one hot encoding if labels
y_unique = np.unique(y)
y  = np.eye(y_unique.size)[y]


# Normalize features

x_min = x.min(axis=0)
x = (x - x_min) / (x.max(axis=0) - x_min)

# Split dataset

split = .85
n_features = x.shape[0]

np.random.shuffle(x)
train_idx = np.random.choice(n_features, size=int(n_features * split), replace=False)
test_idx = np.array(list(set(range(n_features))- set(train_idx)))

x_train = x[train_idx]
x_test = x[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]

In [173]:
y_test.shape

(23, 3)

In [174]:
def get_distances(x, y, x_test, k):
    """
        Gets the Manhattan distances between the data points
        |x1 - x2|
    """
    d0 = tf.expand_dims(x_test, axis=1)
    d1 = tf.abs(tf.subtract(x_train, d0))
    return tf.reduce_sum(input_tensor=d1, axis=2)


In [175]:
def predict():
    """
        Predicts labels
    """
    
    distances = get_distances(x_train, y_train, x_test, k)
    
    # Get indices of the KNN
    _, k_indices = tf.nn.top_k(tf.negative(distances), k=k)

    # Get labels associated with top k indices
    k_labels = tf.gather(y_train, k_indices)
    print('k_lables: ', k_labels.shape)

    pred_sum = tf.reduce_sum(input_tensor=k_labels, axis=1)
    
    # Predict labels from the index of the maximum
    pred = tf.argmax(pred_sum, axis=1)
    
    return pred
    

In [176]:
def find_clusters():
    """
        Compares the predicted to actual labels
    """
    
    y_pred = predict()
    true_pred = zip(y_pred, y_test)
    print('Pred        Actual')
    
    acc = np.sum(y_pred == y_test)
    
    for i, (pred, true) in enumerate(true_pred):
        print(f'{i} {flower_labels[pred.numpy()]}\t\t{flower_labels[np.argmax(true)]}')

In [189]:
find_clusters()

##### Evaluate accuracy

In [178]:
y_pred = predict()

k_lables:  (23, 5, 3)


In [179]:
acc = np.sum(y_pred.numpy() == np.argmax(y_test, axis=1)) / y_test.shape[0]

In [180]:
acc

0.2608695652173913