In [11]:
# Data Processing Tools
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mping
from collections import Counter

# Scikit Learn Machine Learning Tools
from sklearn.model_selection import train_test_split
from sklearn import svm

# Data Set
from mnist import MNIST

%matplotlib inline

# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

#### Part 1: Reading in MNIST Data

In [12]:
# Directory holding the raw MNIST files; adjust to wherever your copy lives.
MNIST_DATA_DIR = '/Users/johnyang/Desktop/'
# Number of images kept for the experiment (the full set is slow with brute-force kNN).
SAMPLE_SIZE = 5000

mndata = MNIST(MNIST_DATA_DIR)
mnist_images, mnist_labels = mndata.load_testing()

# Create DataFrame with 784 (28x28 Image) columns and 10000 rows
# Column = feature / color of pixel at specific index
# Row = One instance of data
mnist_df = pd.DataFrame(mnist_images)

# Add labels corresponding to images as the FIRST column (loc=0) of the table
mnist_df.insert(loc=0, column='label', value=mnist_labels)

# Separate image pixel values (every column after 'label') and labels.
# `labels` is deliberately kept as a one-column DataFrame (':1'), so it still
# has a 'label' column after the split.
images = mnist_df.iloc[0:SAMPLE_SIZE, 1:]
labels = mnist_df.iloc[0:SAMPLE_SIZE, :1]

# Randomly separate data into testing and training batches (fixed seed => reproducible split)
train_images, test_images, train_labels, test_labels = train_test_split(images, labels, train_size=0.5, random_state=0)

# Every sampled row must land in exactly one of the two halves.
assert len(train_images) + len(test_images) == len(images)

# Cast pandas objects into numpy arrays to make it easier to run computations.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later
# removed from pandas; `.values` is available in old and new versions alike.
train_images_array = train_images.values
test_images_array = test_images.values
train_labels_array = train_labels.values

#### Part 2: K Nearest Neighbors

In [19]:
# Brute-force k-nearest-neighbours classification of the test images,
# processed in batches so the (batch x train) distance table stays small.
test_images_len = test_images.shape[0]
predictions = []

batch_size = 250
# Number of nearest training images that vote on each prediction.
k_neighbors = 3
batches = test_images_len/batch_size

# Loop-invariant work: the squared norm of every training image and the
# flattened training labels do not depend on the batch, so compute them once.
sum_square_train = np.square(train_images_array).sum(axis=1)
train_labels_flat = train_labels_array.flatten()

print("Number of batches: " + str(batches))
# Round up so a final, partially filled batch is classified too
# (truncating would silently drop the trailing test images).
for i in range(int(np.ceil(batches))):
    # Time of batch processing speed
    tick = time.time()

    # Euclidean Distance Calculation, using
    #   ||a - b||^2 = ||a||^2 - 2 * a.b + ||b||^2
    # Slicing past the end of the array is safe, so the last batch may be short.
    test_prediction = test_images_array[(i * batch_size):((i+1) * batch_size)]
    dot_product = np.dot(test_prediction, train_images_array.T)

    sum_square_test = np.square(test_prediction).sum(axis=1)

    # Column vector (batch, 1) broadcasts against the row of train norms;
    # plain ndarray broadcasting replaces the discouraged np.matrix.
    squared_distances = -2 * dot_product + sum_square_train + sum_square_test[:, np.newaxis]
    # Clamp at zero so floating-point round-off can never feed sqrt a negative.
    distances = np.sqrt(np.maximum(squared_distances, 0))

    num_distances = distances.shape[0]

    # Batch Predictions
    label_predictions = np.zeros(num_distances)
    for j in range(num_distances):
        # Training labels ordered from nearest to farthest training image
        calculated_labels = train_labels_flat[np.argsort(distances[j])]

        # Labels of the k closest neighbours
        k_closest_y = calculated_labels[:k_neighbors]

        # Majority vote; on a tie Counter keeps first-seen order, so the
        # label of the nearer neighbour wins.
        counted = Counter(k_closest_y)

        label_predictions[j] = counted.most_common(1)[0][0]

    # In-place extend avoids rebuilding the whole list on every batch.
    predictions.extend(label_predictions)

    tock = time.time()

    print("Completed batch in " + str(tock - tick) + " Seconds.")

Number of batches: 10.0
Completed batch in 3.3738150596618652 Seconds.
Completed batch in 3.284066915512085 Seconds.
Completed batch in 3.172995090484619 Seconds.
Completed batch in 3.1866250038146973 Seconds.
Completed batch in 3.2541251182556152 Seconds.
Completed batch in 3.3960750102996826 Seconds.
Completed batch in 3.3485047817230225 Seconds.
Completed batch in 3.14313006401062 Seconds.
Completed batch in 3.157855987548828 Seconds.
Completed batch in 3.200269937515259 Seconds.


In [23]:
# Accuracy = fraction of test images whose predicted label equals the true label.
prediction_array = np.asarray(predictions)

# True labels in the same row order as the test images the predictions were made from.
actual_labels = test_labels['label'].values.astype(int)

# Fail loudly if the prediction run did not cover every test image; a
# positional comparison would otherwise be meaningless.
if prediction_array.shape[0] != actual_labels.shape[0]:
    raise ValueError(
        "Got " + str(prediction_array.shape[0]) + " predictions for "
        + str(actual_labels.shape[0]) + " test labels"
    )

# Vectorised element-wise comparison instead of a row-by-row iterrows() loop.
correct = int(np.count_nonzero(actual_labels == prediction_array))
count = int(actual_labels.shape[0])

print(float(correct) / float(count))

0.9116
