In [2]:
import pandas as pd
import arff
import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from knn import knn

In [3]:
# %load knn.py
from classifier import classifier
from scipy.spatial import distance

class knn(classifier):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training data is simply memorised by ``fit``; every call to
    ``predict`` measures the distance from each query row to every training
    row and lets the ``k`` closest training rows vote on the label.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that vote on each prediction."""
        self.k = k

    def fit(self, X, Y):
        """Remember the training features ``X`` and labels ``Y``.

        ``X`` must be a DataFrame (its ``index`` and ``values`` are read
        later). ``Y`` is looked up as ``Y[label]`` with labels taken from
        ``X.index``, so it has to be indexable by those labels.
        """
        self.x = X
        self.y = Y

    def predict(self, X):
        """Return a list holding one predicted label per row of ``X``."""
        return [
            self.majority_class(
                self.get_top(self.k, self.get_sorted_points(query_row))
            )
            for _, query_row in X.iterrows()
        ]

    def get_sorted_points(self, test_x_row):
        """Rank every training row by its distance to ``test_x_row``.

        Returns a list of ``(training_index_label, distance)`` tuples sorted
        by ascending distance. The sort is stable, so equally distant rows
        keep their training order.

        Feature values are converted with ``float`` (not ``int``) so that
        continuous features are not truncated; integer-valued inputs give
        the same distances as before.
        """
        if len(self.x) == 0:
            return []

        # Convert the query once instead of once per training row.
        query = [float(v) for v in test_x_row.values]
        # Plain nested lists avoid building a Series per row via iterrows().
        train = [[float(v) for v in row] for row in self.x.values]

        # One vectorised call instead of one euclidean() call per pair.
        dists = distance.cdist([query], train, metric='euclidean')[0]

        points = list(zip(self.x.index, dists.tolist()))
        points.sort(key=lambda tup: tup[1])
        return points

    def get_top(self, n, arr):
        """Return the first ``n`` entries of ``arr`` (plain slice semantics)."""
        return arr[:n]

    def majority_class(self, arr):
        """Return the most frequent label among the neighbours in ``arr``.

        ``arr`` is a sequence of tuples whose first item is a training index
        label. Ties go to the label that appears first in ``arr``. Raises
        ``IndexError`` when ``arr`` is empty.
        """
        votes = [self.y[point[0]] for point in arr]

        winner = votes[0]
        winner_count = 0
        for candidate in votes:
            candidate_count = votes.count(candidate)
            # Strict '>' keeps the earliest label on ties.
            if candidate_count > winner_count:
                winner = candidate
                winner_count = candidate_count
        return winner


In [4]:
def getXandY_from_arff(fileName):
    """Load an ARFF file and split it into features and target.

    Parameters:
        fileName: path of the ARFF file to read.

    Returns:
        (X, y) where X is a DataFrame of every attribute except the last
        and y is a Series holding the last attribute. Columns are named
        after the ARFF attribute names.
    """
    # Context manager guarantees the handle is closed, even if parsing fails.
    with open(fileName, 'r') as arff_file:
        arffData = arff.load(arff_file)

    # Each attribute entry is a (name, type) pair; only the name is needed.
    attrs_list = [attrMeta[0] for attrMeta in arffData["attributes"]]

    df = pd.DataFrame(data=arffData['data'], columns=attrs_list)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    return X, y

In [5]:
def runKnn(X, y, start=2, end=32):
    """Print hold-out accuracy of the knn classifier for k = start..end.

    The last 20% of the rows (no shuffling) form the test set. One line of
    the form ``k: <k>, accuracy: <score>`` is printed for every k in the
    inclusive range.

    The neighbour ranking of a test row does not depend on k, so it is
    computed once and re-sliced for each k instead of re-running the whole
    distance computation per k.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    if end < start:
        return

    model = knn(end)
    model.fit(X_train, y_train)

    # Only the closest `end` neighbours can ever vote, so drop the rest to
    # bound memory. A negative start would slice from the tail of the full
    # ranking, so keep everything in that case.
    keep = end if start >= 0 else None
    ranked = [
        model.get_top(keep, model.get_sorted_points(test_row))
        for _, test_row in X_test.iterrows()
    ]

    for k in range(start, end + 1):
        pred = [model.majority_class(model.get_top(k, neighbors)) for neighbors in ranked]
        # accuracy_score expects (y_true, y_pred).
        print("k: " + str(k) + ", accuracy: " + str(accuracy_score(y_test, pred)))

In [6]:
# Load the phishing dataset: X holds the feature columns, y the class column.
X, y = getXandY_from_arff("PhishingData.arff")
# Report hold-out accuracy for every k from 2 through 32.
runKnn(X, y, start=2, end=32)

k: 2, accuracy: 0.874538745387
k: 3, accuracy: 0.885608856089
k: 4, accuracy: 0.892988929889


KeyboardInterrupt: 