In [34]:
import pandas as pd
import numpy as np
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import random
import math
import operator


# Load the dataset from a CSV file expected in the working directory.
data_2C = pd.read_csv("column_2C_weka.csv")
# The bare expressions below only inspect the frame; their results are
# neither stored nor printed.
data_2C.head()
data_2C.describe().transpose()
data_2C.dtypes
# The first six columns are treated as the numeric feature columns.
# NOTE(review): assumes the CSV has exactly six numeric columns followed by
# the label column -- confirm against the file.
colnames_numeric = data_2C.columns[0:6]

# Scaling the data is always a good idea when using KNN: otherwise features
# with a larger numeric range dominate the distance computation.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split performed further down, so test rows influence the
# scaling parameters.
scaler = MinMaxScaler()
data_2C[colnames_numeric] = scaler.fit_transform(data_2C[colnames_numeric])
data_2C.head()
data_2C.shape
# Plain list of row lists, the format consumed by the hand-written KNN code.
df = data_2C.values.tolist()

#Breaking the data into training and test set
def train_test_split(data, split, trainingSet=None, testSet=None):
    """Randomly partition ``data`` into a training set and a test set.

    Every row is independently assigned to the training set when
    ``random.random() < split`` and to the test set otherwise, so the
    resulting sizes only approximate the requested ratio and differ
    between runs unless the ``random`` module is seeded by the caller.

    Args:
        data: Iterable of rows to partition.
        split: Threshold compared against ``random.random()``; the
            probability that a row lands in the training set.
        trainingSet: Optional list that receives the training rows in
            place. A new list is created when omitted.
        testSet: Optional list that receives the test rows in place.
            A new list is created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` tuple. When lists were supplied by
        the caller, these are the very same objects.
    """
    # Create fresh lists per call: a mutable default such as ``[]`` is
    # evaluated once and would silently accumulate rows across calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    for row in data:
        if random.random() < split:
            trainingSet.append(row)
        else:
            testSet.append(row)
    return trainingSet, testSet
# Containers that train_test_split fills in place.
trainingSet = []
testSet = []
# Each row goes to the training set when random.random() < split, so the
# actual set sizes vary from run to run (no seed is set in the visible code).
split = 0.66
train_test_split(df, split, trainingSet, testSet)
# Bare expressions: the sizes are computed but neither stored nor printed.
len(trainingSet)
len(testSet)

#Define Euclidean distances
def Euclideandist(x, xi, length):
    """Return the Euclidean distance between the first ``length`` entries.

    Both ``x`` and ``xi`` are indexed from 0 to ``length - 1`` and every
    entry is converted with ``float`` before the difference is taken, so
    trailing entries (such as a class label) are ignored as long as
    ``length`` excludes them.
    """
    squared_sum = 0.0
    for idx in range(length):
        delta = float(x[idx]) - float(xi[idx])
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)

#Getting the K neighbours having the closest Euclidean distance to the test instance
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distances are computed with ``Euclideandist`` over all entries of the
    instance except the last one, which is treated as the class label.
    Rows are ranked by ascending distance; rows at equal distance keep
    their order from ``trainingSet``. Raises ``IndexError`` when ``k``
    exceeds the number of training rows.
    """
    # The final entry of an instance is its label, not a feature.
    feature_count = len(testInstance) - 1
    scored = [
        (candidate, Euclideandist(testInstance, candidate, feature_count))
        for candidate in trainingSet
    ]
    ranked = sorted(scored, key=operator.itemgetter(1))
    return [ranked[rank][0] for rank in range(k)]

#Tally the class labels of the neighbours and return the label with the most votes as the predicted class of the test instance
def getResponse(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The last element of each neighbour is taken as its class label. When
    several labels share the highest count, the one seen first wins.
    Raises ``IndexError`` when ``neighbors`` is empty.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    # The sort is stable, so on equal counts first-seen labels stay ahead.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    winner = ranked[0][0]  # (label, count) pair with the highest count
    return winner

#Getting the accuracy
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted.

    The last element of each row in ``testSet`` is compared with the
    prediction at the same index. The result is a float between 0.0 and
    100.0. Raises ``ZeroDivisionError`` when ``testSet`` is empty.
    """
    hits = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    return hits / float(len(testSet)) * 100.0

# Generate a prediction for every test row with the hand-written KNN and
# report the overall accuracy.
predictions=[]
# Number of nearest neighbours that vote on each prediction.
k = 3
for x in range(len(testSet)):
    neighbors = getNeighbors(trainingSet, testSet[x], k)
    result = getResponse(neighbors)
    predictions.append(result)
    # The last element of a test row is its actual class label.
    print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
# getAccuracy returns a percentage, hence the '%' suffix in the output.
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

#Implementing KNN using scikit-learn on the same train/test split, for comparison with the hand-written version
# The reshape hard-codes 7 columns per row.
# NOTE(review): assumes six numeric columns plus the 'class' column -- confirm.
trainingSet2 = pd.DataFrame(np.array(trainingSet).reshape(len(trainingSet),7), columns = data_2C.columns)
testSet2 = pd.DataFrame(np.array(testSet).reshape(len(testSet),7), columns = data_2C.columns)
trainingSet2.head()
trainingSet2.dtypes

#Even the numeric columns have been converted to object dtype by the round trip above, so convert them back to numbers
# errors = 'coerce' turns values that cannot be parsed into NaN instead of raising.
trainingSet2[colnames_numeric] = trainingSet2[colnames_numeric].apply(pd.to_numeric, errors = 'coerce', axis = 0)
trainingSet2.dtypes
testSet2[colnames_numeric] = testSet2[colnames_numeric].apply(pd.to_numeric, errors = 'coerce', axis = 0)
testSet2.dtypes

# Same neighbour count (3) as the hand-written KNN.
knn = KNeighborsClassifier(n_neighbors = 3)
# Features are every column except 'class'; the target is the 'class' column.
x_train,y_train = trainingSet2.loc[:,trainingSet2.columns != 'class'], trainingSet2.loc[:,'class']
x_test,y_test = testSet2.loc[:,testSet2.columns != 'class'], testSet2.loc[:,'class']
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
print('Prediction: {}'.format(prediction))
# NOTE(review): knn.score reports mean accuracy as a fraction, whereas
# getAccuracy in this file returns a percentage.
print('With KNN (K=3) accuracy is: ',knn.score(x_test,y_test)) # accuracy

> predicted='Abnormal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal', actual='Abnormal'
> predicted='Abnormal', actual='Abnormal'
> predicted='Normal'