In [1]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Functions

In [2]:
def standard_units(x):
    """Convert the values in x to standard units.

    Each value is expressed as its deviation from the mean of x,
    measured in multiples of the standard deviation of x
    (as computed by np.std with its default settings).
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

def distance(point1, point2):
    """Euclidean distance between two points.

    point1 and point2 are arrays holding the coordinates of each point;
    the result is the square root of the summed squared coordinate differences.
    """
    offsets = point1 - point2
    squared_total = np.square(offsets).sum()
    return np.sqrt(squared_total)

def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes).

    training: a table with a 'Class' column; every other column
        is treated as an attribute.
    new_point: an iterable of attribute values. It is compared
        position-by-position with each training row, so it is assumed
        to list the attributes in the same order as the training columns.
    """
    attributes = training.drop('Class')
    # new_point is the same for every training row, so convert it to an
    # array once here rather than on every call of the per-row function.
    new_point_array = np.array(list(new_point))
    def distance_from_point(row):
        return distance(new_point_array, np.array(list(row)))
    # With no column labels given, apply passes each whole row to the function.
    return attributes.apply(distance_from_point)

def table_with_distances(training, new_point):
    """Return a copy of the training table with an extra 'Distance' column
    holding each row's distance from new_point"""
    distances = all_distances(training, new_point)
    augmented = training.with_column('Distance', distances)
    return augmented

def closest(training, new_point, k):
    """Return the k training rows nearest to new_point.

    The training table is augmented with a 'Distance' column, sorted by
    that column in increasing order, and the first k rows are kept."""
    return (
        table_with_distances(training, new_point)
        .sort('Distance')
        .take(np.arange(k))
    )

def majority(topkclasses):
    """Return the majority label (1 or 0) in the 'Class' column of topkclasses.

    1 is returned only when rows labelled 1 strictly outnumber rows
    labelled 0; a tie therefore yields 0."""
    def count_of(label):
        # Number of rows whose 'Class' value equals label
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    return 1 if count_of(1) > count_of(0) else 0

def classify(training, new_point, k):
    """Classify new_point by a majority vote over the 'Class' labels
    of its k closest rows in the training table"""
    neighbor_classes = closest(training, new_point, k).select('Class')
    return majority(neighbor_classes)

def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples 
    in the test set.

    training: labelled table used to find nearest neighbors.
    test: labelled table whose rows are classified one at a time;
        its 'Class' column supplies the true labels.
    k: number of nearest neighbors used for each classification.
    """
    test_attributes = test.drop('Class')
    # The true labels do not change during the loop, so read the column once
    # instead of fetching it again for every test row.
    true_classes = test.column('Class')
    num_correct = 0
    for i in np.arange(test.num_rows):
        predicted = classify(training, test_attributes.row(i), k)
        # The comparison is a boolean; adding it counts a correct prediction as 1.
        num_correct = num_correct + (predicted == true_classes.item(i))
    return num_correct / test.num_rows

## Counterfeit Currency

In [3]:
# Load the banknote measurements from banknote.csv (path is relative to the
# working directory) and display the table.
banknotes = Table.read_table('banknote.csv')
banknotes

WaveletVar,WaveletSkew,WaveletCurt,Entropy,Class
3.6216,8.6661,-2.8073,-0.44699,0
4.5459,8.1674,-2.4586,-1.4621,0
3.866,-2.6383,1.9242,0.10645,0
3.4566,9.5228,-4.0112,-3.5944,0
0.32924,-4.4552,4.5718,-0.9888,0
4.3684,9.6718,-3.9606,-3.1625,0
3.5912,3.0129,0.72888,0.56421,0
2.0922,-6.81,8.4636,-0.60216,0
3.2032,5.7588,-0.75345,-0.61251,0
1.5356,9.1772,-2.2718,-0.73535,0


In [4]:
# Count how many rows carry each 'Class' label
banknotes.group('Class')

Class,count
0,762
1,610


### Standardize the data

In [5]:
# Standardize data: convert each attribute used below to standard units so
# the attributes are on comparable scales; 'Class' is copied over unchanged.
# NOTE(review): the 'Entropy' column of banknotes is not carried into
# banknotes_SU, so the classifier uses only three attributes - confirm this is intended.

banknotes_SU = Table().with_columns(
    'WaveletVar SU', standard_units(banknotes.column('WaveletVar')),
    'WaveletSkew SU', standard_units(banknotes.column('WaveletSkew')),
    'WaveletCurt SU', standard_units(banknotes.column('WaveletCurt')),
    'Class', banknotes.column('Class'))

banknotes_SU

WaveletVar SU,WaveletSkew SU,WaveletCurt SU,Class
1.12181,1.14946,-0.97597,0
1.44707,1.06445,-0.895036,0
1.20781,-0.777352,0.122218,0
1.06374,1.29548,-1.2554,0
-0.0367718,-1.08704,0.73673,0
1.3846,1.32087,-1.24365,0
1.11111,0.185881,-0.155217,0
0.583612,-1.48841,1.64002,0
0.974571,0.653913,-0.499268,0
0.387745,1.23657,-0.85168,0


### Split data into training and testing tables

In [6]:
# Total number of rows available to split between training and testing
banknotes.num_rows

1372

In [7]:
# 70%/30% for Training/Testing:
# compute how many rows 70% of the table corresponds to

banknotes.num_rows * 0.70

960.4

In [8]:
# Shuffle all rows, then use the first 70% for training and the rest for testing.
# The split sizes are derived from the table rather than hard-coded, so this
# cell keeps working if the number of rows in the data changes.
# NOTE: the shuffle is random, so the split (and every result that depends
# on it) differs from run to run.
shuffled_banknotes_SU = banknotes_SU.sample(with_replacement=False) 
train_size = int(shuffled_banknotes_SU.num_rows * 0.70)
banknotes_SU_train = shuffled_banknotes_SU.take(np.arange(train_size))
banknotes_SU_test  = shuffled_banknotes_SU.take(np.arange(train_size, shuffled_banknotes_SU.num_rows))

In [9]:
# Check how the two 'Class' labels are distributed within the test set
banknotes_SU_test.group('Class')

Class,count
0,224
1,188


### Classify data

In [10]:
# Attribute columns of the test set (the 'Class' label is removed)
attributes = banknotes_SU_test.drop('Class')

# Classify the test row at index 7 using its 9 nearest training rows
classify(banknotes_SU_train, attributes.row(7), 9)

0

How to choose K? That's complicated...

### Determine the accuracy of the Classifier

In [11]:
# Proportion of test rows classified correctly when using 9 nearest neighbors
evaluate_accuracy(banknotes_SU_train, banknotes_SU_test, 9)

1.0

This accuracy will depend on the randomly selected training and testing datasets. Let's assume that the accuracy is 99%.

### If we test a random bill and classify it as counterfeit, what is the probability that it is _actually_ counterfeit?

P(counterfeit) = 0.0001    
P(Not counterfeit) = 0.9999

__Assume that our accuracy is the same for detecting both counterfeit and non-counterfeit bills__

P(test+ | counterfeit) = 0.99    
P(test- | counterfeit) = 0.01    

P(test+ | not counterfeit) = 0.01    
P(test- | not counterfeit) = 0.99   

In [12]:
# Bayes' rule:
#   P(counterfeit | test+) =
#       P(counterfeit) * P(test+ | counterfeit)
#       / (P(counterfeit) * P(test+ | counterfeit)
#          + P(not counterfeit) * P(test+ | not counterfeit))
(0.0001 * 0.99) / ((0.0001 * 0.99) + (0.9999 * 0.01)) 

0.00980392156862745