In [70]:
%matplotlib inline

from sklearn import datasets
import operator
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the Iris dataset once; X holds iris.data and Y holds iris.target.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

def split_train_test_1(X,test_size,rand_seed):
    """Randomly partition the row indices of X into train and test index arrays.

    Parameters
    ----------
    X : array with a ``shape`` attribute; only ``X.shape[0]`` (row count) is used.
    test_size : fraction of rows to place in the test set. The resulting count
        is ``floor(test_size * rows)``, clamped to the range [0, rows].
    rand_seed : seed passed to ``np.random.seed`` so the split is repeatable.
        Note that this reseeds NumPy's global random state.

    Returns
    -------
    (idxTrain, idxTest) : two index arrays that are disjoint and together
        cover every row of X exactly once.
    """
    np.random.seed(rand_seed)
    numRows = X.shape[0]
    indices = np.random.permutation(numRows) # shuffled row indices

    # Number of test rows, clamped so out-of-range fractions cannot produce
    # a negative or oversized slice boundary.
    testLen = int(np.floor(test_size * numRows))
    testLen = min(max(testLen, 0), numRows)

    # Slice with an explicit boundary instead of a negative offset:
    # indices[:-0] is empty and indices[-0:] is everything, which would put
    # every row into the test set when testLen is 0.
    splitAt = numRows - testLen
    idxTrain = indices[:splitAt] # leading part of the permutation -> training rows
    idxTest = indices[splitAt:] # trailing part of the permutation -> testing rows

    # Check emptiness via .size; the truth value of an empty array is ambiguous.
    assert np.intersect1d(idxTest, idxTrain).size == 0
    assert len(idxTrain) + len(idxTest) == numRows
    return (idxTrain,idxTest) # indices into X, not the data itself

def euclideanDistance(in1, in2, length):
    """Return the Euclidean distance between in1 and in2.

    Only the first ``length`` components of each input are compared.
    """
    squaredSum = sum(pow(in1[idx] - in2[idx], 2) for idx in range(length))
    return math.sqrt(squaredSum)

def normDistance(in1, in2, length):
    """Sum ``np.linalg.norm`` of the component-wise differences of in1 and in2.

    Only the first ``length`` components are used; the norm is applied to
    each individual difference and the results are added together.
    """
    perComponent = (np.linalg.norm(in1[idx] - in2[idx]) for idx in range(length))
    return sum(perComponent)

def manhattanDistance(in1, in2, length):
    """Return the Manhattan (sum of absolute differences) distance.

    Only the first ``length`` components of in1 and in2 are compared.
    """
    return sum(abs(in1[idx] - in2[idx]) for idx in range(length))

def getNeighbours(t,X,n,T,dist=None): # where T is number of data
    """Return the indices of the n rows of X[:T] that are closest to t.

    Parameters
    ----------
    t : the query row (one test instance).
    X : the candidate rows (training instances).
    n : how many nearest neighbours to return.
    T : how many leading rows of X to consider.
    dist : optional distance function called as ``dist(a, b, length)``.
        Defaults to ``euclideanDistance``; pass ``manhattanDistance`` or
        ``normDistance`` to switch metric instead of editing this function.

    Returns
    -------
    List of row indices into X, nearest first. Ties in distance are broken
    by the lower index because (distance, index) tuples are sorted.
    """
    if dist is None:
        dist = euclideanDistance

    # Compare over every feature of the query row instead of a fixed count,
    # so the function is not tied to four-feature data.
    numFeatures = len(t)

    distances = [(dist(t, instance, numFeatures), i)
                 for i, instance in enumerate(X[:T])]
    distances.sort() # lowest distance first

    # keep only the indices of the n closest rows
    return [idx for _, idx in distances[:n]]

def assignLabel(nLabels):
    """Return the most frequent label among nLabels.

    Occurrences are counted with ``np.bincount``; on a tie the smallest
    label wins because argmax returns the first maximum.
    """
    return np.bincount(nLabels).argmax()

def getConfusionMatrix(predictedC, actualC, num_classes=3):
    """Build a confusion matrix from predicted and actual class labels.

    Parameters
    ----------
    predictedC : sequence of predicted integer class labels.
    actualC : sequence of actual integer class labels, paired with predictedC.
    num_classes : number of classes, i.e. the matrix is
        num_classes x num_classes. Defaults to 3.

    Returns
    -------
    Integer array ``cf`` where ``cf[actual][predicted]`` counts how often a
    row of class ``actual`` was predicted as class ``predicted``.
    """
    # Use the builtin int: the np.int alias no longer exists in current NumPy.
    cf = np.zeros((num_classes, num_classes), dtype=int)

    # rows are indexed by the actual class, columns by the predicted class
    for actual, predicted in zip(actualC, predictedC):
        cf[actual][predicted] += 1
    return cf

def getAccuracy(predictedC, actualC):
    """Return the percentage (0-100) of predictions that match the actual class.

    Predictions and actual classes are paired positionally; the count of
    matches is divided by ``len(actualC)``.
    """
    hits = sum(1 for guess, truth in zip(predictedC, actualC) if guess == truth)
    return float(hits * 100) / len(actualC)

# Evaluate k-NN on a single 80/20 train/test split of the data loaded above.
idxTrain, idxTest = split_train_test_1(X,0.2,0) # split the data into training and testing and store its indices

k = 3 # specify the number of nearest neighbours

Xtrain = X[idxTrain] # load the data/features of the training set
Ytrain = Y[idxTrain] # load the labels of the training sets
Xtest = X[idxTest] # load the data/features of the testing set
Ytest = Y[idxTest] # load the labels of the testing set

predictions = [] # declare an empty list where predictions will be later stored

for testRow in Xtest: # for each data row in the test set
    neighbours = getNeighbours(testRow, Xtrain, k, Xtrain.shape[0]) # indices of the k nearest training rows
    pred = assignLabel(Ytrain[neighbours]) # most common label among those neighbours
    predictions.append(pred) # append the prediction to the predictions list

accuracy = getAccuracy(predictions, Ytest) # store the accuracy (float)

# Single-argument print(...) calls produce the same output under Python 2
# and Python 3, and match the print() style used later in this notebook.
print("Value of k is: " + str(k)) # print the value of k
cf = getConfusionMatrix(predictions, Ytest) # store the generated confusion matrix

print("\n" + "Confusion Matrix") # print the confusion matrix
for row in cf:
    print(row)

print("\n" + "Accuracy is: " + str(accuracy)) # print the accuracy

Value of k is: 3

Confusion Matrix
[10  0  0]
[0 9 2]
[0 0 9]

Accuracy is: 93.3333333333


In [182]:
                        # ---------------- function start here -------------------------------#

def getNeighbours2 (t,X,n,T,dist): # where T is number of data
    """Return the indices of the n rows of X[:T] that are closest to t.

    Parameters
    ----------
    t : the query row (one test instance).
    X : the candidate rows (training instances).
    n : how many nearest neighbours to return.
    T : how many leading rows of X to consider.
    dist : distance function called as ``dist(a, b, length)``.

    Returns
    -------
    List of row indices into X, nearest first. Ties in distance are broken
    by the lower index because (distance, index) tuples are sorted.
    """
    # Compare over every feature of the query row instead of a fixed count,
    # so the function is not tied to four-feature data.
    numFeatures = len(t)

    distances = [(dist(t, instance, numFeatures), i)
                 for i, instance in enumerate(X[:T])]
    distances.sort() # lowest distance first

    # keep only the indices of the n closest rows
    return [idx for _, idx in distances[:n]]

def cross_val_1(dataset, k, rand_seed, add_noise):
    """Shuffle a dataset and split it into k cross-validation folds.

    Parameters
    ----------
    dataset : object exposing ``data`` (2-D array of rows) and ``target``
        (1-D array of labels) attributes.
    k : number of folds.
    rand_seed : seed passed to ``np.random.seed``; this reseeds NumPy's
        global random state.
    add_noise : when truthy, Gaussian noise (mean 0, std 0.5) is added to
        the shuffled feature rows before splitting.

    Returns
    -------
    List of k tuples ``(Xtrain, Ytrain, Xtest, Ytest)``. Each fold uses one
    contiguous slice of the shuffled rows as test data and all remaining
    rows as training data. When the row count is not divisible by k, the
    first folds receive one extra test row so every row is tested exactly
    once; for divisible sizes the folds are equal-length slices.
    """
    np.random.seed(rand_seed)
    indices = np.random.permutation(dataset.data.shape[0])
    X = dataset.data[indices,:]
    Y = dataset.target[indices]

    if add_noise:
        X = X + np.random.normal(0,0.5, X.shape)

    num_instances = X.shape[0] # total number of rows

    # array_split yields integer index arrays, so indexing never sees float
    # positions, and it spreads any remainder rows over the first folds.
    folds = []
    for idxTest in np.array_split(np.arange(num_instances), k):
        Xtest = X[idxTest,:] # rows held out for this fold
        Ytest = Y[idxTest] # labels of the held-out rows
        Xtrain = np.delete(X, idxTest, axis=0) # every other row is training data
        Ytrain = np.delete(Y, idxTest) # labels of the training rows
        folds.append((Xtrain, Ytrain, Xtest, Ytest))
    return folds

def performCV(CVfolds, numDistFunctions, minNeighbours, maxNeighbours):
    """Run k-NN over every fold for each distance function and each k.

    Parameters
    ----------
    CVfolds : iterable of ``(Xtrain, Ytrain, Xtest, Ytest)`` tuples.
    numDistFunctions : iterable of distance functions handed to getNeighbours2.
    minNeighbours, maxNeighbours : inclusive range of k values to try.

    Returns
    -------
    Flat list with one entry per (distance function, k) pair, ordered by
    distance function first and k second. Each entry is the list of
    per-fold accuracies returned by getAccuracy.
    """
    results = []

    for metric in numDistFunctions:
        for numNeighbours in range(minNeighbours, maxNeighbours + 1):
            perFold = [] # accuracy of every fold for this metric and k

            for trainX, trainY, testX, testY in CVfolds:
                numTrain = trainX.shape[0]

                # predict each test row from the majority label of its neighbours
                guesses = []
                for row in testX:
                    nearest = getNeighbours2(row, trainX, numNeighbours, numTrain, metric)
                    guesses.append(assignLabel(trainY[nearest]))

                perFold.append(getAccuracy(guesses, testY))

            results.append(perFold)
    return results

def getBestParams(accuracies, distance_functions, maxNeighbours):
    """Report per-k cross-validation accuracy and pick the best k per distance.

    Parameters
    ----------
    accuracies : flat list of per-fold accuracy lists, ordered by distance
        function first and k second, with exactly ``maxNeighbours`` entries
        per distance function (k = 1 .. maxNeighbours).
        NOTE(review): this assumes the search started at k = 1 - confirm
        against the caller that produced ``accuracies``.
    distance_functions : the distance functions, in the same order used to
        build ``accuracies``.
    maxNeighbours : number of k values evaluated per distance function.

    Returns
    -------
    Dict mapping each distance function to the k with the highest mean
    accuracy (the first such k on ties). A value of -1 means no k had a
    mean accuracy above 0.
    """
    best_params = {}

    for i, dist in enumerate(distance_functions):

        bestK = -1
        highestAccuracy = 0

        print(dist.__name__ + "\n")
        for j in range(0, maxNeighbours):

            # Each distance function owns a contiguous run of maxNeighbours
            # entries, so the offset is i * maxNeighbours (not i * j).
            foldAccuracy = accuracies[i * maxNeighbours + j] # per-fold accuracies for this k
            avgAccuracy = np.mean(foldAccuracy, axis=0) # avg. accuracy of all folds
            stdAccuracy = np.std(foldAccuracy, axis=0) # std. deviation of all folds

            if highestAccuracy < avgAccuracy:
                highestAccuracy = avgAccuracy
                bestK = j+1

            print("Value of k is: " + str(j+1))
            print("Each fold's accuracy is: " + str(foldAccuracy))
            print("Avg. Accuracy is: " + str(avgAccuracy))
            print("Std. deviation is: " + str(stdAccuracy))
            print("\n")

        best_params[dist] = bestK
    return best_params

                           # ---------------- function end here -------------------------------#

# Cross-validate to choose k for each distance function, then evaluate the
# chosen settings on a single 80/20 train/test split.
iris = datasets.load_iris() # load the dataset
foldNum = 5 # specify the number of folds to do
maxNeighbours = 10 # specify the max num. of neighbours (range between min and max) to do folds on
distance_functions = [euclideanDistance, manhattanDistance] # distance functions to compare
ADD_NOISE = False # specify whether to add noise (True or False)
CVfolds = cross_val_1(iris, foldNum, 10, ADD_NOISE) # store the returned split data for each fold
accuracies = performCV(CVfolds, distance_functions, 1, maxNeighbours) # store the returned accuracies for each fold and k

best_params = getBestParams(accuracies, distance_functions, maxNeighbours)

iris = datasets.load_iris()
X = iris.data # store data in X
Y = iris.target # store labels in Y

idxTrain, idxTest = split_train_test_1(X, 0.2, 0) #split data and return the indices (80% and 20% respectively)

Xtrain = X[idxTrain,:]
Ytrain = Y[idxTrain]
Xtest = X[idxTest,:]
Ytest = Y[idxTest]
print("\n \n Best parameters")

for (dist, k) in best_params.items():
    predictions = [] # empty list to store the predictions

    for testRow in Xtest:
        neighbours = getNeighbours2(testRow, Xtrain, k, Xtrain.shape[0], dist) # get the k nearest neighbours
        pred = assignLabel(Ytrain[neighbours]) # assign the label based on most common class
        predictions.append(pred) # append the prediction

    print("\n*******************************************************************************************************\n")
    print("Distance is: " + dist.__name__)
    # Report the k selected for this distance function (the loop variable k);
    # no variable named n is defined in this notebook.
    print("Value of k is: " + str(k))
    print("Is noise added:" + str(ADD_NOISE))
    cf = getConfusionMatrix(predictions, Ytest)
    print("\n" + "Confusion Matrix") # print the confusion matrix
    for row in cf:
        print(row)
    print("\n" + "Accuracy is: " + str(getAccuracy(predictions, Ytest)))


euclideanDistance

Value of k is: 1
Each fold's accuracy is: [96.66666666666667, 96.66666666666667, 90.0, 96.66666666666667, 100.0]
Avg. Accuracy is: 96.0
Std. deviation is: 3.26598632371


Value of k is: 2
Each fold's accuracy is: [96.66666666666667, 96.66666666666667, 90.0, 90.0, 96.66666666666667]
Avg. Accuracy is: 94.0
Std. deviation is: 3.26598632371


Value of k is: 3
Each fold's accuracy is: [96.66666666666667, 96.66666666666667, 93.33333333333333, 96.66666666666667, 100.0]
Avg. Accuracy is: 96.6666666667
Std. deviation is: 2.10818510678


Value of k is: 4
Each fold's accuracy is: [96.66666666666667, 96.66666666666667, 93.33333333333333, 93.33333333333333, 100.0]
Avg. Accuracy is: 96.0
Std. deviation is: 2.49443825785


Value of k is: 5
Each fold's accuracy is: [96.66666666666667, 96.66666666666667, 93.33333333333333, 96.66666666666667, 96.66666666666667]
Avg. Accuracy is: 96.0
Std. deviation is: 1.33333333333


Value of k is: 6
Each fold's accuracy is: [96.66666666666667, 96.66