In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import datetime
from scipy.io import arff

In [2]:
# Fold count for the cross-validation experiment.
k_fold = 10
# scipy's loadarff returns a (records, metadata) pair; the path is relative to the
# notebook's working directory.
data = arff.loadarff('kc2.arff')
# Only the records (index 0) are turned into a DataFrame; the metadata is unused here.
df = pd.DataFrame(data[0])
# Preview the first rows. In the captured output the last column (`problems`)
# holds byte strings such as b'yes' / b'no'.
df.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,b'no'
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,b'yes'
2,415.0,59.0,50.0,51.0,1159.0,8411.31,0.01,103.53,81.24,870848.58,...,359.0,35.0,9.0,10.0,47.0,106.0,692.0,467.0,106.0,b'yes'
3,230.0,33.0,10.0,16.0,575.0,3732.82,0.03,39.82,93.74,148644.06,...,174.0,15.0,34.0,5.0,23.0,67.0,343.0,232.0,65.0,b'yes'
4,175.0,26.0,12.0,13.0,500.0,3123.96,0.03,29.48,105.96,92103.07,...,142.0,7.0,19.0,4.0,18.0,58.0,310.0,190.0,51.0,b'yes'


In [3]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like supporting element-wise subtraction (e.g. NumPy arrays)
        The two points; they must be compatible for ``x1 - x2``.

    Returns
    -------
    The square root of the sum of squared coordinate differences.
    """
    delta = x1 - x2
    squared_total = (delta ** 2).sum()
    return np.sqrt(squared_total)

In [4]:
def nn(x_train, y_train, xq):
    """Rank every training sample by its distance to the query point.

    Parameters
    ----------
    x_train : iterable of feature vectors
        Training samples, visited in order.
    y_train : indexable
        Labels; ``y_train[i]`` belongs to the i-th item of ``x_train``.
    xq : feature vector
        The query point.

    Returns
    -------
    ``np.asarray`` of ``[distance, label]`` pairs sorted by ascending distance
    (the sort is stable, so equal distances keep their training order).
    The array dtype is whatever NumPy infers for the mixed pairs.
    """
    ranked = [
        [euclidean_distance(sample, xq), y_train[idx]]
        for idx, sample in enumerate(x_train)
    ]
    # Order by the distance component only; labels never take part in the comparison.
    ranked.sort(key=lambda pair: pair[0])
    return np.asarray(ranked)

In [5]:
def knn(x_train, y_train, xq, k):
    """Classify ``xq`` by majority vote among its ``k`` nearest training samples.

    Parameters
    ----------
    x_train, y_train : training samples and their labels (forwarded to ``nn``)
    xq : the query point
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    list
        A one-element list holding the winning label. On a tie the label that
        comes first in ``np.unique``'s sorted order wins, because ``argmax``
        returns the first maximum.
    """
    # Last column of the ranked neighbours is the label.
    nearest_labels = nn(x_train, y_train, xq)[:k, -1]
    labels, counts = np.unique(nearest_labels, return_counts=True)
    return [labels[counts.argmax()]]

In [6]:
def weighted_knn(x_train, y_train, xq, k):
    """Classify ``xq`` with inverse-distance-weighted voting over ``k`` neighbours.

    Each of the ``k`` nearest training samples votes for its label with weight
    ``1 / distance``; a neighbour at distance zero votes with the fixed weight
    999999999 instead. The label with the largest accumulated weight is returned.

    Parameters
    ----------
    x_train, y_train : training samples and their labels (forwarded to ``nn``)
    xq : the query point
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    The winning label, as stored in the array returned by ``nn``.
    """
    votes = {}
    for raw_distance, label in nn(x_train, y_train, xq)[:k, :]:
        # float() is applied because the distance may not be a plain float after
        # nn packs [distance, label] pairs into one array -- TODO confirm dtype.
        distance = float(raw_distance)
        weight = 1 / distance if distance != 0 else 999999999
        votes[label] = votes.get(label, 0) + weight

    return max(votes, key=votes.get)

In [7]:
def cross_validation(data, n_fold, k, with_weight=False):
    """Estimate k-NN accuracy with n-fold cross-validation.

    The rows of ``data`` are split, in their current order, into ``n_fold``
    consecutive folds. Each fold is used once as the test set while the
    remaining folds form the training set.

    Parameters
    ----------
    data : 2-D NumPy array
        One sample per row; the last column is the label and all other
        columns are features. Rows are not shuffled here.
    n_fold : int
        Number of folds. When the row count is not a multiple of ``n_fold``
        the fold sizes differ by at most one row.
    k : int
        Number of neighbours passed to the classifier.
    with_weight : bool, default False
        Use ``weighted_knn`` when true, plain ``knn`` otherwise.

    Returns
    -------
    The mean of the per-fold accuracies, expressed as a percentage.

    Raises
    ------
    ValueError
        If ``n_fold`` is smaller than 2 or larger than the number of rows.
    """
    size = data.shape[0]
    if n_fold < 2 or n_fold > size:
        raise ValueError(
            "n_fold must be between 2 and the number of rows ({}), got {}".format(
                size, n_fold
            )
        )

    classify = weighted_knn if with_weight else knn

    # array_split copes with row counts that are not a multiple of n_fold;
    # for exact multiples it yields equally sized consecutive folds.
    folds = np.array_split(data, n_fold)

    acc = []
    for fold_idx, test_fold in enumerate(folds):
        train = np.concatenate(folds[:fold_idx] + folds[fold_idx + 1:])
        x_train, y_train = train[:, :-1], train[:, -1]
        x_test, y_test = test_fold[:, :-1], test_fold[:, -1]

        preds = np.asarray([classify(x_train, y_train, xq, k) for xq in x_test])
        # knn returns one-element lists, weighted_knn returns bare labels;
        # flatten so both line up with y_test.
        preds = preds.reshape(len(x_test))
        acc.append(100 * float((preds == y_test).sum()) / preds.shape[0])

    return np.mean(acc)

In [None]:
# Fixed seed so the shuffle, and therefore every accuracy printed below, is
# reproducible across kernel restarts.
SEED = 42

# Keep only as many rows as divide evenly into k_fold folds, so that every fold
# has the same number of rows. The copy keeps the shuffle away from df's data.
n_rows = df.shape[0] - df.shape[0] % k_fold
data = df.values[:n_rows].copy()
np.random.default_rng(SEED).shuffle(data)  # shuffles rows (axis 0) in place
print(data.shape)

k_values = [1,2,3,5,7,9,11,13,15]
accs = []
weighted_accs = []

# Run the same sweep over k for both classifiers; each results list is filled
# in the order of k_values.
for header, use_weights, results in (
    ("---------------------KNN---------------------", False, accs),
    ("\n---------------------WEIGHTED KNN---------------------", True, weighted_accs),
):
    print(header)
    for k in k_values:
        acc = cross_validation(data, k_fold, k, with_weight=use_weights)
        results.append(acc)
        print("The accuracy for k = {:d} is: {:.2f}%".format(k, acc))

(520, 22)
---------------------KNN---------------------
The accuracy for k = 1 is: 75.19%
The accuracy for k = 2 is: 80.58%
The accuracy for k = 3 is: 81.35%
The accuracy for k = 5 is: 81.54%
The accuracy for k = 7 is: 82.12%
The accuracy for k = 9 is: 83.08%
The accuracy for k = 11 is: 82.69%
The accuracy for k = 13 is: 82.31%
The accuracy for k = 15 is: 82.69%

---------------------WEIGHTED KNN---------------------
The accuracy for k = 1 is: 75.19%
The accuracy for k = 2 is: 76.15%
The accuracy for k = 3 is: 78.65%
The accuracy for k = 5 is: 78.85%


In [None]:
# Bar chart of the plain k-NN accuracies: one bar per entry of k_values.
fig, ax = plt.subplots()
positions = np.arange(len(k_values))
ax.bar(positions, accs, align='center', alpha=0.5)
# Bars sit at 0..n-1; label each one with the k it was computed for.
ax.set_xticks(positions)
ax.set_xticklabels(k_values)
ax.set_xlabel('k neighbors')
ax.set_ylabel('Accuracy (%)')
ax.set_title('k-NN performance')
plt.show()