# Exploring Multiprocessing Using MNIST Dataset

## Import MNIST Data

Source:  http://yann.lecun.com/exdb/mnist/

In [119]:
import numpy as np
import matplotlib.pyplot as plt
import mnist
from datetime import datetime

# Load the MNIST splits: training images/labels first, then test images/labels
images_train, images_train_labels = mnist.train_images(), mnist.train_labels()
images_test, images_test_labels = mnist.test_images(), mnist.test_labels()

# Rescale every pixel value by dividing by 255
images_train = images_train / 255
images_test = images_test / 255

# Report the shape of each array; the extra "\n" separates the training
# lines from the test lines in the printed output.
shape_report = (
    ("Dimensions of training data:", images_train, ()),
    ("Dimensions of training labels:", images_train_labels, ("\n",)),
    ("Dimensions of test images:", images_test, ()),
    ("Dimensions of test labels:", images_test_labels, ()),
)
for caption, array, extra in shape_report:
    print(caption, array.shape, *extra)

Dimensions of training data: (60000, 28, 28)
Dimensions of training labels: (60000,) 

Dimensions of test images: (10000, 28, 28)
Dimensions of test labels: (10000,)


In [120]:
import numpy as np
import logging

class KnnClassifier(object):
    """
    Brute-force k-nearest-neighbours classifier.

    Every method is a static method; the class keeps no state and only
    groups the helper functions under one name.
    """

    @staticmethod
    def euclidean(x, y):
        """
        Input
            x: a numpy array
            y: a numpy array with the same shape as x
        Output
            A single number, the euclidean distance between x and y
        """
        # np.sum reduces over every axis, so inputs with more than one
        # dimension (this notebook passes 28x28 images) are treated as one
        # flat vector.
        return np.sqrt(np.sum((x - y) ** 2))

    @staticmethod
    def many_distances(x, Y):
        """
        Input
            x: a single point as a numpy array
            Y: a numpy array whose first axis indexes points shaped like x
        Output
            a 1-dimensional numpy array holding the euclidean distance
            from x to each point of Y, in the order of Y
        """
        result = np.zeros(Y.shape[0])
        for idx, point in enumerate(Y):
            result[idx] = KnnClassifier.euclidean(x, point)
        return result

    @staticmethod
    def closest_indices(dists, n):
        """
        Input
            dists: a numpy array of distances (1 dimensional)
            n: the number of lowest values to find
        Output
            a numpy array with the indexes in dists where the
            n lowest values are located, closest first.
            If n exceeds len(dists), every index is returned.
        """
        return np.argsort(dists)[0:n]

    @staticmethod
    def get_values_by_index(indices, values):
        """
        Input
            indices: a numpy array of indices
            values: a list of values
        Output
            a numpy array of the elements from values whose indexes
            are the values in indices
        """
        return np.array(values)[indices]

    @staticmethod
    def get_mode(values):
        """
        Input
            values: a non-empty list of values
        Output
            the most common value in the list.
            Ties are resolved in favour of the largest tied value.
        """
        labels, counts = np.unique(values, return_counts=True)
        # np.unique returns labels in ascending order. A stable sort keeps
        # tied counts in that order, so the last position is always the
        # largest label among the most frequent ones (deterministic ties).
        return labels[np.argsort(counts, kind="stable")[-1]]

    @staticmethod
    def knn(ind, test_pts, train_pts, labels, k):
        """
        Input
            ind: a (start, end) pair; only test_pts[start:end] is classified
            test_pts: a numpy array of test points, indexed along its first axis
            train_pts: a numpy array of training points, indexed along its first axis
            labels: a list of labels associated with the training points
            k: the number of nearest training points that vote on each label
        Output
            A list of best guesses for the labels of test_pts[start:end]
        """
        startIdx, endIdx = ind[0], ind[1]
        output = []
        for point in test_pts[startIdx:endIdx]:
            dists = KnnClassifier.many_distances(point, train_pts)
            label_indices = KnnClassifier.closest_indices(dists, k)
            labelSet = KnnClassifier.get_values_by_index(label_indices, labels)
            output.append(KnnClassifier.get_mode(labelSet))
        return output

    @staticmethod
    def multiclass_accuracy(truth, predictions):
        """
        Input
            truth: a list of true labels
            predictions: a list of predicted labels, same length as truth
        Output
            a single value - the multiclass accuracy (fraction of matching
            positions). Returns 0.0 when both inputs are empty.
        Raises
            ValueError: if truth and predictions differ in shape
        """
        truthArr = np.array(truth)
        predArr = np.array(predictions)
        # Refuse mismatched inputs explicitly: a length-1 argument would
        # otherwise be broadcast against the other and yield a bogus score.
        if truthArr.shape != predArr.shape:
            raise ValueError(
                "truth and predictions must have the same shape, got "
                "{} and {}".format(truthArr.shape, predArr.shape))
        # Nothing to score: avoid dividing by zero.
        if truthArr.size == 0:
            return 0.0
        return np.count_nonzero(truthArr == predArr) / truthArr.size

## Utilize ProcessPoolExecutor
Source:  https://docs.python.org/3/library/concurrent.futures.html

## Problem 17

Compute your classifier's accuracy. Remember to compare your output `test_predictions` against only the first 200 elements of `images_test_labels`. Multiclass accuracy is just the proportion of cases that are correctly predicted. In other words, it is the average of the vector `[0,1,0,1,1,1,...]`, where 0 represents "not equal to the true label" and 1 represents "equal to the true label."

In [None]:
# Serial baseline: classify the first `testSize` test images in this process
# with k = 1, so the cell runs on a fresh kernel instead of depending on
# names left over from earlier, since-removed cells.
# NOTE(review): this runs every distance computation in a single process and
# may take several minutes; lower testSize for a quick check.
testSize = 200
predictedLabels = KnnClassifier.knn((0, testSize),
                                    images_test,
                                    images_train,
                                    images_train_labels,
                                    1)

accuracy = KnnClassifier.multiclass_accuracy(images_test_labels[:testSize], predictedLabels)
print("Classifier Accuracy (k = 1):",'{:.1%}'.format(accuracy))

In [144]:
import concurrent.futures
# import math

class MultiProcess(object):
    """
    Run KnnClassifier.knn over slices of the test set in a process pool
    and print timing and accuracy.

    The data sets and k are stored on the class itself, not on the
    instance, because the pool entry point `testMethod` is a static method
    that reads them from `MultiProcess`. Constructing a new instance
    therefore replaces the configuration seen by every existing instance.
    NOTE(review): this presumably relies on the worker processes seeing the
    same class attributes as the parent (e.g. a fork start method) -
    confirm before running on a platform that spawns fresh interpreters.
    """

    def __init__(self, testingImages, testingLabels, trainingImages, trainingLabels, kSize):
        """
        Input
            testingImages: numpy array of test points to classify
            testingLabels: true labels for testingImages
            trainingImages: numpy array of training points
            trainingLabels: labels for trainingImages
            kSize: number of neighbours passed to KnnClassifier.knn
        """
        self.start_time = None
        MultiProcess.testingImages = testingImages
        MultiProcess.testingLabels = testingLabels
        MultiProcess.trainingImages = trainingImages
        MultiProcess.trainingLabels = trainingLabels
        MultiProcess.kSize = kSize

    @staticmethod
    def testMethod(item):
        """
        Input
            item: a (start, end) pair selecting testingImages[start:end]
        Output
            the list of predicted labels for that slice
        """
        return KnnClassifier.knn(item,
                                 MultiProcess.testingImages,
                                 MultiProcess.trainingImages,
                                 MultiProcess.trainingLabels,
                                 MultiProcess.kSize)

    def __main(self, tupList, numProcesses):
        """
        Input
            tupList: list of (start, end) work items
            numProcesses: number of worker processes in the pool
        Output
            a flat list of predicted labels, in the order of tupList
        """
        self.start_time = datetime.now()
        finalResult = []
        with concurrent.futures.ProcessPoolExecutor(max_workers=numProcesses) as executor:
            # executor.map yields results in the order of its inputs, so
            # concatenating the chunks restores test-set order.
            for labels in executor.map(MultiProcess.testMethod, tupList, timeout=None, chunksize=1):
                finalResult.extend(labels)
        return finalResult

    def __getInput(self, size, interval):
        """
        Input
            size: total number of test points
            interval: maximum number of points per work item (>= 1)
        Output
            a list of (start, end) pairs that cover range(size) in order;
            the last pair is shorter when interval does not divide size
        Raises
            ValueError: if interval is smaller than 1
        """
        if interval < 1:
            raise ValueError("intervalSize must be at least 1, got {}".format(interval))
        # Stepping through range() also handles size < interval (one short
        # work item) and size == 0 (no work items).
        return [(start, min(start + interval, size))
                for start in range(0, size, interval)]

    def invokeMultiProcessMethod(self, numProcesses, intervalSize):
        """
        Split the test set into work items, classify them in a pool of
        numProcesses workers and print the results.

        Input
            numProcesses: number of worker processes
            intervalSize: number of test points per work item
        Output
            None; everything is reported through print
        """
        maxNum = MultiProcess.testingImages.shape[0]
        tupList = self.__getInput(maxNum, intervalSize)
        print("Input work items:  ", tupList)
        predictedLabels = self.__main(tupList, numProcesses)
        self.__reportResults(predictedLabels, maxNum, numProcesses)

    def __reportResults(self, predictedLabels, maxNum, numProcesses):
        """
        Print the predictions, the run time since self.start_time and the
        accuracy against MultiProcess.testingLabels.

        Input
            predictedLabels: flat list of predicted labels
            maxNum: number of test points that were classified
            numProcesses: number of worker processes that were used
        """
        # Take the end timestamp first so printing is not counted as run time.
        end_time = datetime.now()
        diff = end_time - self.start_time
        print("Predicted Labels:  ", predictedLabels)
        print("Test Image Set Size: ", maxNum)
        print("Number of Processes:", numProcesses)
        print("Start Time =",
              self.start_time.strftime('%H:%M:%S.%f'),
              "\nEnd Time =", end_time.strftime('%H:%M:%S.%f'),
              "\nTotal Run Time (Seconds) =",
              diff.total_seconds())
        accuracy = KnnClassifier.multiclass_accuracy(MultiProcess.testingLabels, predictedLabels)
        # Report the k that was actually configured rather than a fixed "1".
        print("Classifier Accuracy (k = {}):".format(MultiProcess.kSize),
              '{:.1%}'.format(accuracy), "\n")

In [145]:
# Time the same classification job with 1 through 8 worker processes.
# Each run gets a fresh MultiProcess so its start time is reset; the data
# and k are identical for every run.
myTestSize = 200
for numProcesses in range(1, 9):
    multiProcess = MultiProcess(testingImages=images_test[:myTestSize],
                                testingLabels=images_test_labels[:myTestSize],
                                trainingImages=images_train,
                                trainingLabels=images_train_labels,
                                kSize = 1)
    multiProcess.invokeMultiProcessMethod(numProcesses=numProcesses, intervalSize=2)

Input work items:   [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
Predicted Labels:   [7, 2, 1, 0, 4, 1, 4, 9, 5, 9]
Test Image Set Size:  10
Number of Processes: 1
Start Time = 00:25:17.982653 
End Time = 00:25:33.619946 
Total Run Time (Seconds) = 15.637293
Classifier Accuracy (k = 1): 100.0% 

Input work items:   [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
Predicted Labels:   [7, 2, 1, 0, 4, 1, 4, 9, 5, 9]
Test Image Set Size:  10
Number of Processes: 1
Start Time = 00:25:33.628271 
End Time = 00:25:49.070369 
Total Run Time (Seconds) = 15.442098
Classifier Accuracy (k = 1): 100.0% 

Input work items:   [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
Predicted Labels:   [7, 2, 1, 0, 4, 1, 4, 9, 5, 9]
Test Image Set Size:  10
Number of Processes: 2
Start Time = 00:25:49.075373 
End Time = 00:25:59.147425 
Total Run Time (Seconds) = 10.072052
Classifier Accuracy (k = 1): 100.0% 

Input work items:   [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
Predicted Labels:   [7, 2, 1, 0, 4, 1, 4, 9, 5, 9]
Test 