In [130]:
%matplotlib inline

from sklearn import datasets
import operator
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# X holds the per-sample feature rows and Y the matching class labels;
# both are indexed with the same row indices further down.
X = iris.data 
Y = iris.target

def split_train_test_1(X,test_size,rand_seed):
    """Randomly partition the row indices of X into a train and a test set.

    Parameters
    ----------
    X : array-like exposing ``shape``; only ``X.shape[0]`` (the row count)
        is used.
    test_size : fraction of rows (between 0 and 1 inclusive) assigned to the
        test set; the number of test rows is ``floor(test_size * rows)``.
    rand_seed : seed handed to ``np.random.seed`` so the split is repeatable.
        Note that this reseeds NumPy's global random state.

    Returns
    -------
    (idxTrain, idxTest) : two disjoint index arrays that together cover
        every row of X exactly once.

    Raises
    ------
    ValueError : if ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1, got %r" % (test_size,))

    numRows = X.shape[0]
    np.random.seed(rand_seed)
    indices = np.random.permutation(numRows)
    testLen = int(np.floor(test_size * numRows))

    # Slice at an explicit position instead of using -testLen: when testLen
    # is 0, indices[:-0] is empty and indices[-0:] is the whole array, which
    # would silently swap the roles of the two sets.
    splitAt = numRows - testLen
    idxTrain = indices[:splitAt]
    idxTest = indices[splitAt:]

    # Check the size of the intersection rather than its truth value: the
    # truth value of an empty or multi-element array is not a valid test.
    assert np.intersect1d(idxTest, idxTrain).size == 0
    assert len(idxTrain) + len(idxTest) == numRows

    return (idxTrain,idxTest)

def euclideanDistance(in1, in2, length):
    """Return the Euclidean distance between in1 and in2.

    Only the first ``length`` components (indices 0 .. length-1) of the two
    sequences are compared.
    """
    squaredSum = sum((in1[i] - in2[i]) ** 2 for i in range(length))
    return math.sqrt(squaredSum)

def normDistance(in1, in2, length):
    """Sum ``np.linalg.norm`` of each component-wise difference.

    Only the first ``length`` components (indices 0 .. length-1) are used.

    NOTE(review): the norm is taken of one component difference at a time,
    not of the whole difference vector. For scalar components this looks
    equivalent to the Manhattan (L1) distance -- confirm that is intended.
    """
    return sum(np.linalg.norm(in1[i] - in2[i]) for i in range(length))

def manhattanDistance(in1, in2, length):
    """Return the Manhattan (L1) distance between in1 and in2.

    Adds up the absolute differences of the first ``length`` components
    (indices 0 .. length-1) of the two sequences.
    """
    return sum(abs(in1[i] - in2[i]) for i in range(length))

def getNeighbours(t,X,k,T,distanceFn=None):
    """Return the indices of the k points in X[:T] closest to t.

    Parameters
    ----------
    t : the query point (a sequence of feature values).
    X : sequence of candidate points; only the first T entries are searched.
    k : number of nearest neighbours to return.
    T : number of leading rows of X to consider.
    distanceFn : optional callable ``f(a, b, length)`` returning the distance
        between two points over their first ``length`` components. Defaults
        to ``euclideanDistance``.

    Returns
    -------
    list of indices into X, ordered from nearest to farthest. Points at the
    same distance are ordered by ascending index.
    """
    if distanceFn is None:
        distanceFn = euclideanDistance

    # Use the query point's own dimensionality instead of a fixed feature
    # count so the search works for data with any number of features.
    numFeatures = len(t)

    # Pair each distance with its row index; sorting the pairs orders by
    # distance first and by index second.
    distances = [(distanceFn(t, instance, numFeatures), i)
                 for i, instance in enumerate(X[:T])]
    distances.sort()

    # Keep only the row indices of the k closest points.
    return [idx for (_, idx) in distances[:k]]


def assignLabel(nLabels):
    """Pick the majority label among the neighbours' labels.

    nLabels is an array of non-negative integer labels. When several labels
    share the highest count, the smallest of them is returned.
    """
    # votes[label] is the number of times that label appears
    votes = np.bincount(nLabels)
    return votes.argmax()

# Accuracy
def getAccuracy(predictedC, actualC):
    """Return the fraction of predictions that equal the actual class.

    The two sequences are compared pairwise and the number of matches is
    divided by ``len(actualC)``.
    """
    hits = sum(1 for pred, actual in zip(predictedC, actualC) if pred == actual)
    return float(hits) / len(actualC)

def getConfusionMatrix(predictedC, actualC, numClasses=3):
    """Build the confusion matrix for a set of predictions.

    Parameters
    ----------
    predictedC : sequence of predicted integer class labels.
    actualC : sequence of true integer class labels, paired with predictedC.
    numClasses : number of distinct classes; labels are expected to lie in
        ``0 .. numClasses-1``. Defaults to 3.

    Returns
    -------
    A ``numClasses x numClasses`` integer array where entry ``[a, p]`` counts
    the samples whose actual class is ``a`` and predicted class is ``p``.
    """
    # The builtin int is used as dtype; the np.int alias no longer exists in
    # current NumPy releases.
    cf = np.zeros((numClasses, numClasses), dtype=int)

    # Rows are indexed by the actual class, columns by the predicted class.
    for actual, pred in zip(actualC, predictedC):
        cf[actual, pred] += 1
    return cf

# Split the row indices into a training part and a testing part (20% test, seed 0)
idxTrain, idxTest = split_train_test_1(X,0.2,0)

Xtrain = X[idxTrain] # features of the training set
Ytrain = Y[idxTrain] # labels of the training set
Xtest = X[idxTest] # features of the testing set
Ytest = Y[idxTest] # labels of the testing set

# Number of nearest neighbours to consider
k = 5

predictions = [] # one predicted label per test row, in test-set order

for testPoint in Xtest: # classify every row of the test set
    # indices (into the training set) of the k nearest training points
    neighbours = getNeighbours(testPoint, Xtrain, k, Xtrain.shape[0])

    # majority vote over the neighbours' labels gives the predicted class
    pred = assignLabel(Ytrain[neighbours])
    predictions.append(pred)

# fraction of test rows whose predicted label matches the actual label
accuracy = getAccuracy(predictions, Ytest)

# print() is called with a single argument so the output is the same whether
# the cell runs on Python 2 or Python 3
print("Value of k is: " + str(k)) # print the value of k

# confusion matrix of the predictions against the actual test labels
cf = getConfusionMatrix(predictions, Ytest)

# print the confusion matrix, one row per line
print("\n" + "Confusion Matrix")
for row in (cf):
    print(str(row))

# print the accuracy
print("\n" + "Accuracy is: " + str(accuracy))

Value of k is: 5

Confusion Matrix
[10  0  0]
[0 9 2]
[0 0 9]

Accuracy is: 0.933333333333
