<h1>K Nearest Neighbour</h1>

In [73]:
import pandas as pd
from scipy.io import arff
import numpy
import math
import operator


## Reading the train and test data

In [90]:
#Reading the train and test data
data=arff.loadarff("trainProdSelection.arff")
train_data_df = pd.DataFrame(data[0])

data=arff.loadarff("testProdSelection.arff")
test_data_df = pd.DataFrame(data[0])

In [91]:
list(pd.DataFrame(data[0]))
pd.DataFrame(data[0]).dtypes

Type          object
LifeStyle     object
Vacation     float64
eCredit      float64
salary       float64
property     float64
label         object
dtype: object

<h3>Train Data</h3>

<h3>Pre-Processing train Data</h3>

In [92]:
train_data_df.Type=train_data_df.Type.str.decode("UTF-8")
train_data_df.LifeStyle=train_data_df.LifeStyle.str.decode("UTF-8")
train_data_df.label=train_data_df.label.str.decode("UTF-8")

In [93]:
train_data_df.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,6.0,40.0,13.62,3.2804,C1
1,student,spend>saving,11.0,21.0,15.32,2.0232,C1
2,student,spend>saving,7.0,64.0,16.55,3.1202,C1
3,student,spend>saving,3.0,47.0,15.71,3.4022,C1
4,student,spend>saving,15.0,10.0,16.96,2.2825,C1


# Normalizing the continous data columns  

In [94]:
cols_to_norm = ['Vacation', 'eCredit', 'salary', 'property']
train_data_df[cols_to_norm] = train_data_df[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
train_data_df.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,0.079365,0.107558,0.21996,0.183167,C1
1,student,spend>saving,0.15873,0.052326,0.293102,0.112797,C1
2,student,spend>saving,0.095238,0.177326,0.346023,0.1742,C1
3,student,spend>saving,0.031746,0.127907,0.309882,0.189984,C1
4,student,spend>saving,0.222222,0.020349,0.363663,0.127311,C1


In [95]:
train_data_df.head()
#The First 5 Rows

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,0.079365,0.107558,0.21996,0.183167,C1
1,student,spend>saving,0.15873,0.052326,0.293102,0.112797,C1
2,student,spend>saving,0.095238,0.177326,0.346023,0.1742,C1
3,student,spend>saving,0.031746,0.127907,0.309882,0.189984,C1
4,student,spend>saving,0.222222,0.020349,0.363663,0.127311,C1


<h3>Test Data</h3>

<h3>Pre-Processing Data</h3>

In [96]:
test_data_df.Type=test_data_df.Type.str.decode("UTF-8")
test_data_df.LifeStyle=test_data_df.LifeStyle.str.decode("UTF-8")
test_data_df.label=test_data_df.label.str.decode("UTF-8")

In [97]:
cols_to_norm = ['Vacation', 'eCredit', 'salary', 'property']
test_data_df[cols_to_norm] = test_data_df[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
test_data_df.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,0.2,0.058824,0.104637,0.398926,C1
1,student,spend>>saving,0.54,0.021008,0.175059,0.243041,C1
2,student,spend<<saving,0.52,0.231092,0.138339,0.085992,C1
3,engineer,spend>saving,0.26,0.151261,0.430086,0.116229,C1
4,librarian,spend<saving,0.0,0.016807,0.352657,0.025714,C1


In [98]:
test_data_df.head()
#The First 5 Rows

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,0.2,0.058824,0.104637,0.398926,C1
1,student,spend>>saving,0.54,0.021008,0.175059,0.243041,C1
2,student,spend<<saving,0.52,0.231092,0.138339,0.085992,C1
3,engineer,spend>saving,0.26,0.151261,0.430086,0.116229,C1
4,librarian,spend<saving,0.0,0.016807,0.352657,0.025714,C1


<h3>Euclidean Distance</h3>

In [99]:
def euclidean_distance(first, second, length):
    distance = 0
    
    for i in range(2):
        if (first[i]!=second[i]):
            distance += pow((1), 2)
    
    for x in range(2,length):
        distance += pow((first[x] - second[x]), 2)
    
    return math.sqrt(distance)

<h3>Fetch Neighbours</h3>

In [84]:
def get_neighbors(train_Set, test_Instance, k):
    distances = []
    length = len(test_Instance)-1
    
    #Looking for neighbours in train Set for every test entry
    for x in range(len(train_Set)):
        dist = euclidean_distance(test_Instance, train_Set[x], length)
        distances.append((train_Set[x], dist))
    
    #Sorting the result based on distance
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    
    #Taking only the required k entries
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

<h3>Frequency for Each Class</h3>

In [85]:
def get_output(neighbors):
    freq = {}
    #Update a dictionary based on frequency for each and every class
    for x in range(len(neighbors)):
        output = neighbors[x][-1]
        if output in freq:
            freq[output] += 1
        else:
            freq[output] = 1
    output = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    return output[0][0]

# <h3>Calculate the Accuracy</h3>

In [86]:
def calculate_accuracy(testSet, predictions):
    cnt = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            cnt += 1
    return (cnt/float(len(testSet))) * 100.0

# <h3>KNN Algorithm</h3>

In [87]:
def knn_algorithm(k):
    prediction=[]
    for x in range(len(test_data_df)):
        neighbors = get_neighbors(train_data_df.values, test_data_df.values[x], k)
        output = get_output(neighbors)
        prediction.append(output)
    accuracy = calculate_accuracy(test_data_df.values, prediction)
    print('Accuracy for k: '+str(k) +"    is "+ repr(accuracy) + '%')
    
    


In [88]:
for i in range(1,71,2):
    knn_algorithm(i)

Accuracy for k: 1    is 23.809523809523807%
Accuracy for k: 3    is 28.57142857142857%
Accuracy for k: 5    is 28.57142857142857%
Accuracy for k: 7    is 28.57142857142857%
Accuracy for k: 9    is 23.809523809523807%
Accuracy for k: 11    is 23.809523809523807%
Accuracy for k: 13    is 19.047619047619047%
Accuracy for k: 15    is 14.285714285714285%
Accuracy for k: 17    is 19.047619047619047%
Accuracy for k: 19    is 19.047619047619047%
Accuracy for k: 21    is 19.047619047619047%
Accuracy for k: 23    is 19.047619047619047%
Accuracy for k: 25    is 19.047619047619047%
Accuracy for k: 27    is 19.047619047619047%
Accuracy for k: 29    is 19.047619047619047%
Accuracy for k: 31    is 19.047619047619047%
Accuracy for k: 33    is 19.047619047619047%
Accuracy for k: 35    is 14.285714285714285%
Accuracy for k: 37    is 14.285714285714285%
Accuracy for k: 39    is 19.047619047619047%
Accuracy for k: 41    is 19.047619047619047%
Accuracy for k: 43    is 19.047619047619047%
Accuracy for k: 45

In [89]:
knn_algorithm(55)

Accuracy for k: 55    is 33.33333333333333%
