# Implement KNN Classifier

In [1]:
#import packages
import numpy as np
import pandas as pd
from numpy.linalg import inv, norm
import matplotlib.pyplot as plt

#import data
spam_train = pd.read_csv('spam_train.csv')
spam_test = pd.read_csv('spam_test.csv')

#convert both frames to plain numpy matrices
spam_train_matrix = np.array(spam_train)
spam_test_matrix = np.array(spam_test)

#matrix dimensions, unpacked as (rows, columns)
train_shape = spam_train_matrix.shape
numrow_train, numcol_train = train_shape

test_shape = spam_test_matrix.shape
numrow_test, numcol_test = test_shape

#label vectors: the last column of each matrix
y_train = spam_train_matrix[:, -1]
y_test = spam_test_matrix[:, -1]

#training features: every column except the trailing label
x_train = spam_train_matrix[:, :-1]
#test features: additionally skips column 0
#(presumably an extra id/index column present only in spam_test.csv -- TODO confirm)
x_test2 = spam_test_matrix[:, 1:-1]

In [2]:
#Euclidean distance via the expansion ||a - b||^2 = ||a||^2 - 2 (a . b) + ||b||^2

def euclidian_distance_efficient(test,train):
    """Return the matrix of Euclidean distances between test and train rows.

    Parameters
    ----------
    test : 2-D array-like, shape (n_test, n_features)
    train : 2-D array-like, shape (n_train, n_features)
        Both inputs are converted to float arrays first, so object-dtype
        matrices are accepted.

    Returns
    -------
    numpy.ndarray of float, shape (n_test, n_train)
        Entry [i, j] is the distance between test row i and train row j.
    """
    test = np.asarray(test, dtype=float)
    train = np.asarray(train, dtype=float)

    # Squared norms: a column vector for test rows, a row vector for train rows,
    # so that broadcasting produces the full (n_test, n_train) grid.
    test_sq = np.sum(np.square(test), axis=1)[:, np.newaxis]
    train_sq = np.sum(np.square(train), axis=1)

    squared = test_sq - 2 * test.dot(train.T) + train_sq
    # Rounding error in the expansion can leave tiny negative values for
    # (near-)identical rows; clamp them so sqrt never produces NaN.
    np.maximum(squared, 0, out=squared)
    return np.sqrt(squared)

In [3]:
#pairwise distance matrix on the raw features (Rows = test; Columns = train)
euc_dist = euclidian_distance_efficient(x_test2, x_train)

#Descriptive stats: keep the shape and its two components for later cells
euc_dist_shape = euc_dist.shape
numrow_euc_dist, numcol_euc_dist = euc_dist_shape

In [4]:
def KNN_accuracy(test,train,k, dist=None, labels_train=None, labels_test=None):
    """Accuracy of a k-nearest-neighbour vote over a precomputed distance matrix.

    For every row of the distance matrix the k smallest distances are located,
    the training labels at those columns are averaged, and the average is
    rounded with ``np.rint`` to give the predicted label. ``np.rint`` rounds
    halves to the nearest even value, so an exact 0.5 average becomes 0.

    Parameters
    ----------
    test, train :
        Accepted for backward compatibility; not used by the computation,
        which works from the distance matrix.
    k : int
        Number of neighbours to average; must be at least 1.
    dist : 2-D array-like, optional
        Distance matrix (rows = test samples, columns = training samples).
        Defaults to the notebook-level ``euc_dist``.
    labels_train : 1-D array-like, optional
        Training labels, indexed by distance-matrix column.
        Defaults to the notebook-level ``y_train``.
    labels_test : 1-D array-like, optional
        True test labels, one per distance-matrix row.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    float
        Fraction of compared test samples whose prediction equals the label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no samples to compare.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Fall back to the notebook globals only when no explicit data is given.
    dist = euc_dist if dist is None else dist
    labels_train = y_train if labels_train is None else labels_train
    labels_test = y_test if labels_test is None else labels_test

    dist = np.asarray(dist, dtype=float)
    neighbour_labels = np.asarray(labels_train, dtype=float)

    # A stable sort breaks distance ties by the lower training index, which is
    # the same ordering sorted() gave over range(len(row)).
    nearest = np.argsort(dist, axis=1, kind="stable")[:, :k]
    predicted = np.rint(neighbour_labels[nearest].mean(axis=1))

    # zip() stops at the shorter sequence, so count only the compared pairs.
    compared = min(len(predicted), len(labels_test))
    if compared == 0:
        raise ValueError("no test samples to score")
    hits = sum(1 for guess, truth in zip(predicted, labels_test) if guess == truth)
    return hits / compared

In [5]:
# Neighbourhood sizes to evaluate
k_values = [1, 5, 11, 21, 41, 61, 81, 101, 201, 401]

# Print the test accuracy on the raw (unscaled) features for each k
for k in k_values:
    KNN_acc = KNN_accuracy(x_test2, x_train, k)
    print(f"K = {k} | Accuracy = {round(KNN_acc, 4)}")

K = 1 | Accuracy = 0.7523
K = 5 | Accuracy = 0.7549
K = 11 | Accuracy = 0.7649
K = 21 | Accuracy = 0.7466
K = 41 | Accuracy = 0.7523
K = 61 | Accuracy = 0.7375
K = 81 | Accuracy = 0.7266
K = 101 | Accuracy = 0.7288
K = 201 | Accuracy = 0.7314
K = 401 | Accuracy = 0.7197


In [6]:
#normalizing the data: z-score each training feature column
from scipy import stats

x_train_norm = stats.zscore(x_train, axis=0)

In [7]:
# Cast the test features to float64, then z-score each column.
# NOTE(review): the test set is standardised with its own mean/std here, not with
# the training set's statistics -- confirm this is intended.
x_test_norm = stats.zscore(np.asarray(x_test2, dtype=np.float64))

In [8]:
#Euclidean distance via the expansion ||a - b||^2 = ||a||^2 - 2 (a . b) + ||b||^2
#NOTE(review): this re-declares a helper that an earlier cell already defines.

def euclidian_distance_efficient(test,train):
    """Return the matrix of Euclidean distances between test and train rows.

    Parameters
    ----------
    test : 2-D array-like, shape (n_test, n_features)
    train : 2-D array-like, shape (n_train, n_features)
        Both inputs are converted to float arrays first, so object-dtype
        matrices are accepted.

    Returns
    -------
    numpy.ndarray of float, shape (n_test, n_train)
        Entry [i, j] is the distance between test row i and train row j.
    """
    test = np.asarray(test, dtype=float)
    train = np.asarray(train, dtype=float)

    # Squared norms: a column vector for test rows, a row vector for train rows,
    # so that broadcasting produces the full (n_test, n_train) grid.
    test_sq = np.sum(np.square(test), axis=1)[:, np.newaxis]
    train_sq = np.sum(np.square(train), axis=1)

    squared = test_sq - 2 * test.dot(train.T) + train_sq
    # Rounding error in the expansion can leave tiny negative values for
    # (near-)identical rows; clamp them so sqrt never produces NaN.
    np.maximum(squared, 0, out=squared)
    return np.sqrt(squared)

In [18]:
#pairwise distance matrix on the z-scored features (Rows = test; Columns = train)
euc_dist_norm = euclidian_distance_efficient(x_test_norm, x_train_norm)

#quick look at the container types and the shape of the scaled test matrix
print(type(euc_dist_norm))
print(x_test_norm.shape)
print(type(x_test_norm))


#Descriptive stats: keep the shape and its two components for later cells
euc_dist_norm_shape = euc_dist_norm.shape
numrow_euc_dist_norm, numcol_euc_dist_norm = euc_dist_norm_shape

<class 'numpy.ndarray'>
(2301, 57)
<class 'numpy.ndarray'>


In [10]:
def KNN_accuracy_norm(test,train,k, dist=None, labels_train=None, labels_test=None):
    """Accuracy of a k-nearest-neighbour vote over the normalised distance matrix.

    For every row of the distance matrix the k smallest distances are located,
    the training labels at those columns are averaged, and the average is
    rounded with ``np.rint`` to give the predicted label. ``np.rint`` rounds
    halves to the nearest even value, so an exact 0.5 average becomes 0.

    Parameters
    ----------
    test, train :
        Accepted for backward compatibility; not used by the computation,
        which works from the distance matrix.
    k : int
        Number of neighbours to average; must be at least 1.
    dist : 2-D array-like, optional
        Distance matrix (rows = test samples, columns = training samples).
        Defaults to the notebook-level ``euc_dist_norm``.
    labels_train : 1-D array-like, optional
        Training labels, indexed by distance-matrix column.
        Defaults to the notebook-level ``y_train``.
    labels_test : 1-D array-like, optional
        True test labels, one per distance-matrix row.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    float
        Fraction of compared test samples whose prediction equals the label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no samples to compare.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Fall back to the notebook globals only when no explicit data is given.
    dist = euc_dist_norm if dist is None else dist
    labels_train = y_train if labels_train is None else labels_train
    labels_test = y_test if labels_test is None else labels_test

    dist = np.asarray(dist, dtype=float)
    neighbour_labels = np.asarray(labels_train, dtype=float)

    # A stable sort breaks distance ties by the lower training index, which is
    # the same ordering sorted() gave over range(len(row)).
    nearest = np.argsort(dist, axis=1, kind="stable")[:, :k]
    predicted = np.rint(neighbour_labels[nearest].mean(axis=1))

    # zip() stops at the shorter sequence, so count only the compared pairs.
    compared = min(len(predicted), len(labels_test))
    if compared == 0:
        raise ValueError("no test samples to score")
    hits = sum(1 for guess, truth in zip(predicted, labels_test) if guess == truth)
    return hits / compared

In [11]:
# Neighbourhood sizes to evaluate
k_values = [1, 5, 11, 21, 41, 61, 81, 101, 201, 401]

# Print the test accuracy on the z-scored features for each k
for k in k_values:
    KNN_acc = KNN_accuracy_norm(x_test_norm, x_train_norm, k)
    print(f"K = {k} | Accuracy = {round(KNN_acc, 4)}")

K = 1 | Accuracy = 0.8231
K = 5 | Accuracy = 0.8322
K = 11 | Accuracy = 0.8748
K = 21 | Accuracy = 0.8709
K = 41 | Accuracy = 0.8705
K = 61 | Accuracy = 0.8701
K = 81 | Accuracy = 0.8696
K = 101 | Accuracy = 0.864
K = 201 | Accuracy = 0.8462
K = 401 | Accuracy = 0.8144


In [12]:
def KNN_results(test,train,k,row, dist=None, labels_train=None):
    """Classify a single test sample by a k-nearest-neighbour vote.

    The k smallest entries of the sample's distance row are located, the
    training labels at those positions are averaged, and the average is
    rounded with ``np.rint`` (halves round to the nearest even value, so an
    exact 0.5 average becomes 0).

    Parameters
    ----------
    test, train :
        Accepted for backward compatibility; not used by the computation,
        which works from the distance matrix.
    k : int
        Number of neighbours to average; must be at least 1.
    row : int
        Index of the test sample, i.e. the distance-matrix row to classify.
    dist : 2-D array-like, optional
        Distance matrix (rows = test samples, columns = training samples).
        Defaults to the notebook-level ``euc_dist_norm``.
    labels_train : 1-D array-like, optional
        Training labels, indexed by distance-matrix column.
        Defaults to the notebook-level ``y_train``.

    Returns
    -------
    str or None
        ``'Spam'`` when the rounded vote is 1, ``'no'`` when it is 0,
        and ``None`` for any other value.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Fall back to the notebook globals only when no explicit data is given.
    dist = euc_dist_norm if dist is None else dist
    labels_train = y_train if labels_train is None else labels_train

    distances = np.asarray(dist[row], dtype=float)
    # A stable sort breaks distance ties by the lower training index, which is
    # the same ordering sorted() gave over range(len(row)).
    nearest = np.argsort(distances, kind="stable")[:k]
    vote = np.asarray(labels_train, dtype=float)[nearest].mean()

    # Round the scalar directly; int() on a 1-element array is deprecated in NumPy.
    label = int(np.rint(vote))

    if label == 1:
        return('Spam')
    elif label == 0:
        return('no')
    return None
        

In [13]:
# First 50 test samples, and the neighbourhood sizes to try on each
rows = list(range(50))
k_values = [1, 5, 11, 21, 41, 61, 81, 101, 201, 401]

# One output line per test sample, listing the predicted label for every k in order
for r in rows:
    KNN_res = []
    for k in k_values:
        prediction = KNN_results(x_test_norm, x_train_norm, k, r)
        KNN_res.append(prediction)

    print(f"t{r + 1}: {KNN_res}")

t1: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'no', 'no', 'no', 'no', 'no']
t2: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'no', 'no', 'no']
t3: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t4: ['Spam', 'Spam', 'Spam', 'Spam', 'no', 'no', 'Spam', 'Spam', 'Spam', 'Spam']
t5: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t6: ['Spam', 'Spam', 'Spam', 'no', 'no', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t7: ['Spam', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']
t8: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t9: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t10: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t11: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t12: ['Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam', 'Spam']
t13: ['Spam', 'Spam