In [7]:
import operator
import os
from os import listdir

import numpy as np

##### Processing Functions

In [2]:
def img2vector(filename):
    """
    Read a 32*32 text-encoded image and flatten it into one row vector.

    The first 32 characters of each of the first 32 lines are parsed as
    single digits; the character at row i, column j is stored at index
    i*32 + j of the result.

    Input: the image location

    Return: the vectorized image, a (1, 1024) float array

    Raises: ValueError or IndexError when the file has fewer than 32
            rows/columns or contains a character that is not a digit
    """
    vector = np.zeros((1, 32*32))
    # The context manager guarantees the handle is closed, even when a
    # malformed file makes the parsing below raise.
    with open(filename) as file:
        for i in range(32):
            line = file.readline()
            for j in range(32):
                vector[0, i*32+j] = int(line[j])
    return vector
    

In [10]:
def loadData(file_location):
    """
    Load every image file of a directory into a matrix plus its labels.

    Each file name must start with the integer class label followed by an
    underscore (the part before the first '_' is parsed with int()).

    Input: the directory holding the image files, with or without a
           trailing path separator

    Return: (Mat, Labels) where Mat has one 1024-element row per file and
            Labels[i] is the label parsed from the name of the file stored
            in row i. Rows follow the sorted file names.
    """
    # listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore any downstream tie-breaking) reproducible across runs.
    FileList = sorted(listdir(file_location))
    Mat = np.zeros((len(FileList), 32*32))
    Labels = []

    for i, name in enumerate(FileList):
        # os.path.join inserts the separator only when it is missing, so
        # both 'digits' and 'digits/' resolve to the same files.
        Mat[i, :] = img2vector(os.path.join(file_location, name))
        Labels.append(int(name.split('_')[0]))
    return Mat, Labels


##### KNN classifier

In [11]:
def classify(inX, dataSet, labels, k):
    """
    Label a sample by majority vote among its k nearest training rows.

    Distances are Euclidean, computed between inX and every row of dataSet.

    Input: inX     - feature vector to classify (one value per column of
                     dataSet)
           dataSet - 2-D array with one training sample per row
           labels  - sequence where labels[i] is the label of dataSet[i]
           k       - number of neighbours that vote; a value larger than
                     the number of training rows is clamped to that number

    Return: the label with the most votes. On a tied vote the winner is
            the tied label that received its first vote from the closest
            neighbour.

    Raises: ValueError if dataSet has no rows or k is smaller than 1
    """
    m = dataSet.shape[0]
    if m == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Broadcasting subtracts every row from inX without materialising the
    # m-fold copy of inX that np.tile would allocate.
    diffMat = inX - dataSet
    distances = (diffMat**2).sum(axis=1)**0.5

    # A stable sort keeps equally distant rows in dataSet order, so the
    # result does not depend on the sort implementation.
    nearest = distances.argsort(kind="stable")[:min(k, m)]

    classCount = {}
    for index in nearest:
        votedLabel = labels[index]
        classCount[votedLabel] = classCount.get(votedLabel, 0) + 1

    # max() returns the first key reaching the highest count; dict order is
    # insertion order, which here is nearest-neighbour order.
    return max(classCount, key=classCount.get)

##### Handwritten Digit Recognition

In [None]:
# Build the training and test sets from their respective directories.
trainMat, trainLabels = loadData('trainingDigits/')
testMat, testLabels = loadData('testDigits/')

# Run a 3-nearest-neighbour vote on every test image and tally the
# predictions that disagree with the label taken from the file name.
errorCount = 0
for i, expected in enumerate(testLabels):
    result = classify(testMat[i, :], trainMat, trainLabels, 3)
    errorCount += 1 if result != expected else 0

# Report the absolute number of misses and the fraction of the test set.
print("The total number of error is: %d" % errorCount)
print("The error rate is: %f" % (errorCount / float(len(testMat))))
    