# K-Nearest Neighbors Classifier for a Binary Classification Problem

In [63]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Mathematical Functions

- Convert an array of integers and/or floats to standard units
- Find the Euclidean distance between two arrays of integers and/or floats
- Find the Euclidean distance between two rows containing integers and/or floats
- Find the Euclidean distance between a row of unclassified data and each row in a table of classified data

In [64]:
def standard_units(arr):
    '''Convert an array of numbers to standard units.

    Take an array containing integers or floats
    Return an array of the same length in which every value is expressed as
    the number of standard deviations it lies above (positive) or below
    (negative) the mean of the array'''
    center = np.mean(arr)
    spread = np.std(arr)  # NumPy's default: population standard deviation
    deviations_from_center = arr - center
    return deviations_from_center / spread

def distance(arr1, arr2):
    '''Take two equal-length arrays (or plain sequences) containing integers or floats
    Return the Euclidean distance between the two arrays:
    the square root of the sum of the squared element-wise differences'''
    # np.asarray lets callers pass lists or tuples as well as NumPy arrays
    differences = np.asarray(arr1) - np.asarray(arr2)
    # np.sum performs the reduction inside NumPy instead of looping over the
    # elements in Python; axis=0 reduces along the first axis, which is what
    # the builtin sum() did for array inputs
    return np.sqrt(np.sum(differences ** 2, axis=0))

def row_distance(row1, row2):
    '''Take two rows containing integers or floats
    Return the Euclidean distance between the two rows

    Each row is first converted to a NumPy array so that distance() can
    subtract the two rows element by element'''
    first_values = np.array(row1)
    second_values = np.array(row2)
    return distance(first_values, second_values)

def distances(training_data_table, testing_data_row):
    '''Take a table of classified (training) data and a row from a table of unclassified (testing) data
    Return the training table with a column for Euclidean distance from the test row

    The training table must have a 'Class' column; it is excluded from the
    distance computation, so the test row must hold attribute values only
    (no 'Class' value), in the same column order as the training table.
    The returned table keeps every original column, including 'Class', and
    gains a 'Distance' column'''
    attributes_only = training_data_table.drop('Class')

    # Collect the distances in a list and convert once at the end: calling
    # np.append inside the loop copies the whole array on every iteration.
    # dtype=float keeps the column floating point even when the table is empty.
    row_distances = np.array(
        [
            row_distance(attributes_only.row(i), testing_data_row)
            for i in range(attributes_only.num_rows)
        ],
        dtype=float,
    )

    return training_data_table.with_column('Distance', row_distances)

## Classification Functions

- Find the k-nearest rows from a table of classified data to a row of unclassified data
- Find the majority classification of the k-nearest rows from a table of classified data to a row of unclassified data
- Find the classification of a row of unclassified data based on the majority classification of the k-nearest rows from a table of classified data 

In [65]:
def closest(classified_table, unclassified_row, k):
    '''Take a table of classified data, a row of unclassified data, and an integer (k)
    Return the k-nearest rows from the table of classified data to the row of unclassified data

    The returned table includes the 'Distance' column added by distances()
    and is ordered from the smallest distance to the largest'''
    with_distances = distances(classified_table, unclassified_row)
    nearest_first = with_distances.sort('Distance')
    first_k_indices = np.arange(k)
    return nearest_first.take(first_k_indices)

def majority_class(k_nearest_rows):
    '''Take the k-nearest rows from a table of classified data to a row of unclassified data
    Return the majority classification

    The rows are grouped by 'Class' and the groups are ordered from the most
    rows to the fewest; the class of the first group is returned, so when two
    classes tie the result is whichever one the sort places first'''
    class_counts = k_nearest_rows.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of a grouped table holds the values that were grouped on
    classes_by_frequency = most_common_first.column(0)
    return classes_by_frequency.item(0)

def classify(classified_table, unclassified_row, k):
    '''Take a table of classified data, a row of unclassified data, and an integer (k)
    Return the expected classification of the unclassified row:
    the majority classification among its k-nearest rows in the classified table'''
    k_nearest_rows = closest(classified_table, unclassified_row, k)
    return majority_class(k_nearest_rows)

## Evaluation Functions
- Find the overall accuracy of the k-nearest neighbors classifier defined as classify()
- Find the class-specific accuracy of the k-nearest neighbors classifier defined as classify()

In [66]:
def evaluate_accuracy(training_table, testing_table, k):
    '''Take a table of training data, a table of testing data, and an integer (k)
    Return the proportion of rows from the testing table correctly classified by classify()

    Both tables must have a 'Class' column. The testing table must contain at
    least one row, because the result is divided by its number of rows'''
    # These do not change between iterations, so compute them once instead of
    # rebuilding the attribute-only table and re-reading the column per row
    test_attributes = testing_table.drop('Class')
    actual_classes = testing_table.column('Class')

    num_correct = 0
    for i in range(testing_table.num_rows):
        predicted_class = classify(training_table, test_attributes.row(i), k)
        # A correct prediction compares equal (True) and counts as 1
        num_correct = num_correct + (predicted_class == actual_classes.item(i))

    return num_correct / testing_table.num_rows

def evaluate_accuracy_by_class(training_table, testing_table, k):
    '''Take a table of training data, a table of testing data, and an integer (k)
    Return the proportion of rows from the testing table correctly classified by classify() for each class

    Both tables must have a 'Class' column whose values are 0 or 1; testing
    rows with any other class value are classified but not counted.
    The result is a one-row table with the columns 'Class 0 Accuracy' and
    'Class 1 Accuracy'. If the testing table has no rows of a class, the
    accuracy for that class is undefined and is reported as NaN'''
    # These do not change between iterations, so compute them once instead of
    # rebuilding the attribute-only table and re-reading the column per row
    test_attributes = testing_table.drop('Class')
    actual_classes = testing_table.column('Class')

    # Per-class tallies, keyed by the actual class of the testing row
    num_correct = {0: 0, 1: 0}
    num_rows = {0: 0, 1: 0}

    for i in range(testing_table.num_rows):
        predicted_class = classify(training_table, test_attributes.row(i), k)
        actual_class = actual_classes.item(i)
        if actual_class in num_rows:
            num_rows[actual_class] += 1
            # A correct prediction compares equal (True) and counts as 1
            num_correct[actual_class] += (predicted_class == actual_class)

    def _proportion_correct(class_label):
        # Avoid dividing by zero when the class never occurs in the testing table
        if num_rows[class_label] == 0:
            return float('nan')
        return num_correct[class_label] / num_rows[class_label]

    return Table().with_columns(
        'Class 0 Accuracy', make_array(_proportion_correct(0)),
        'Class 1 Accuracy', make_array(_proportion_correct(1))
    )

## Example: Classification of Currency as Counterfeit or Legitimate

The "banknotes" table has five columns: WaveletVar, WaveletSkew, WaveletCurt, Entropy, and Class
- Rows represent banknotes
- Columns WaveletVar-Entropy represent banknote attributes as integers or floats
- Column Class represents banknote legitimacy as 0-counterfeit or 1-legitimate

In [67]:
# Read the banknote data from 'banknote.csv' (a relative path) into a table
banknotes = Table.read_table('banknote.csv')

The standard_units() function is used to create a standardized "banknotes" table by converting the attribute columns into standard units.

In [68]:
# Rebuild the table one column at a time: each attribute column is converted
# to standard units, and the 'Class' labels are copied over unchanged
banknotes_standard_units = (
    Table()
    .with_column('WaveletVar', standard_units(banknotes.column('WaveletVar')))
    .with_column('WaveletSkew', standard_units(banknotes.column('WaveletSkew')))
    .with_column('WaveletCurt', standard_units(banknotes.column('WaveletCurt')))
    .with_column('Entropy', standard_units(banknotes.column('Entropy')))
    .with_column('Class', banknotes.column('Class'))
)

The standardized "banknotes" table is shuffled and split into a table of training data and a table of testing data. 
- The first 80 percent of the shuffled rows are allocated for training
- The remaining 20 percent of the shuffled rows are allocated for testing

In [69]:
# Put the standardized rows in random order (sampling without replacement)
shuffled_table = banknotes_standard_units.sample(with_replacement=False)

# Compute the split point once, from the table that is actually being split,
# so the training and testing slices can never disagree about the boundary
training_fraction = 0.80
num_training_rows = int(np.round(shuffled_table.num_rows * training_fraction))

# Rows before the split point train the classifier; the rest are held out for testing
training_table = shuffled_table.take(np.arange(num_training_rows))
testing_table = shuffled_table.take(np.arange(num_training_rows, shuffled_table.num_rows))

The classify() function is applied to each row of the testing table, assigning it the majority classification of its nine nearest rows in the training table (k = 9). The overall and class-specific accuracies of classify() in this example are evaluated as the proportions of testing rows that are classified correctly.

In [70]:
# Proportion of all testing rows that classify() labels correctly with k = 9
evaluate_accuracy(training_table, testing_table, 9)

0.9890510948905109

In [71]:
# Proportion of testing rows labelled correctly with k = 9, reported separately for class 0 and class 1
accuracy_table = evaluate_accuracy_by_class(training_table, testing_table, 9)
accuracy_table

Class 0 Accuracy,Class 1 Accuracy
0.978723,1
