## KNN算法实现手写数字的识别

In [2]:
#coding=utf-8

In [10]:
from numpy import *
import operator
import os

#### KNN分类器的设计

In [24]:
# classify using KNN
def kNNClassify(newInput, dataSet, labels, k):
    """Classify ``newInput`` by majority vote among its k nearest samples.

    Distances are Euclidean, measured between ``newInput`` and every row of
    ``dataSet``.

    Args:
        newInput: Feature vector to classify; it must broadcast against the
            rows of ``dataSet``.
        dataSet: 2-D NumPy array holding one training sample per row.
        labels: Sequence of hashable labels, where ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label with the most votes. On a tie, the tied label that was
        seen first while walking the neighbours from nearest to farthest
        wins.

    Raises:
        ValueError: If ``k`` is not positive or ``dataSet`` has no rows.
    """
    numSamples = dataSet.shape[0]
    if numSamples == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k <= 0:
        raise ValueError('k must be a positive integer, got %r' % (k,))
    # Clamp k so that asking for more neighbours than there are samples votes
    # with every sample instead of indexing past the end of the ordering.
    # (A plain comparison is used because this file star-imports numpy, which
    # shadows the builtin min/max.)
    if k > numSamples:
        k = numSamples

    # Broadcasting subtracts newInput from every row; no tiled copy needed.
    diff = dataSet - newInput
    distance = (diff ** 2).sum(axis=1) ** 0.5
    nearest = distance.argsort()[:k]    # indices of the k closest samples

    # Count how many of the k neighbours carry each label.
    classCount = {}
    for idx in nearest:
        label = labels[idx]
        classCount[label] = classCount.get(label, 0) + 1

    # Strict '>' keeps the earliest-inserted label among those with the top
    # count, i.e. the tied label whose first neighbour is the nearest.
    bestLabel = None
    bestCount = 0
    for label, count in classCount.items():
        if count > bestCount:
            bestLabel = label
            bestCount = count

    return bestLabel
    
        

#### 图像转化成向量

In [20]:
# convert image to vector
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int()`` and stored row after row, so the character at (row, col) ends
    up at index ``row * 32 + col``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)`` holding the parsed values.

    Raises:
        IndexError: If a line has fewer than 32 characters (this includes a
            file with fewer than 32 lines).
        ValueError: If a character cannot be parsed by ``int()``.
    """
    rows = 32
    cols = 32
    imgVector = zeros((1, rows * cols))

    # The context manager closes the file even when parsing fails part-way.
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            for col in range(cols):
                imgVector[0, row * cols + col] = int(lineStr[col])

    return imgVector
    
    

#### 加载数据集

In [13]:
#load dataSet
def _loadDigitDir(dirPath):
    """Load every digit file in ``dirPath`` into a feature matrix and labels."""
    fileList = os.listdir(dirPath)
    features = zeros((len(fileList), 1024))
    digitLabels = []

    for i, filename in enumerate(fileList):
        features[i, :] = img2vector(os.path.join(dirPath, filename))
        # The label is the part of the file name before the first '_',
        # e.g. '1_18.txt' -> 1.
        digitLabels.append(int(filename.split('_')[0]))

    return features, digitLabels


def loadDataSet(dataSetDir='C:/Users/James_kk/Documents/GitHub/KDD-example/simplemod/'):
    """Load the training and test digit sets from ``dataSetDir``.

    The directory must contain the sub-directories ``trainingDigits`` and
    ``testDigits``. Every entry in them is read with ``img2vector`` and its
    label is taken from the file-name prefix before the first underscore.

    Args:
        dataSetDir: Directory holding the two sub-directories. Defaults to
            the location that was previously hard-coded here.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. The ``*_x`` values
        are arrays with one 1024-element row per file, and the ``*_y``
        values are lists of integer labels in the same order.
    """
    ## step1: Getting training set
    print ('---Getting training set...')
    train_x, train_y = _loadDigitDir(os.path.join(dataSetDir, 'trainingDigits'))

    ## step2: Getting testing set
    print('---Getting testing set...')
    test_x, test_y = _loadDigitDir(os.path.join(dataSetDir, 'testDigits'))

    return train_x, train_y, test_x, test_y




#### 测试手写数字分类

In [14]:
# test hand writing class
def testHandWritingClass(k=3):
    """Classify every test digit with kNN and print the overall accuracy.

    Loads the data with ``loadDataSet()``, predicts a label for each test
    sample with ``kNNClassify`` and prints the share of correct predictions
    as a percentage.

    Args:
        k: Number of neighbours passed to ``kNNClassify``. Defaults to 3.

    Raises:
        ValueError: If no test samples were loaded, since accuracy would be
            undefined.
    """
    print('step1: load data...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Nothing is fitted here: kNNClassify works directly on the training
    # samples, so this step only logs progress.
    print('step2: training...')

    print('step3: testing...')
    numTestSamples = test_x.shape[0]
    if numTestSamples == 0:
        raise ValueError('no test samples were loaded; cannot compute accuracy')

    matchCount = 0
    for sample, expected in zip(test_x, test_y):
        if kNNClassify(sample, train_x, train_y, k) == expected:
            matchCount += 1

    accuracy = float(matchCount) / numTestSamples

    print('step4: show the result...')
    print('The classify accuracy is: %.2f%%' % (accuracy * 100))
    
    
    
    
    

In [None]:
# Run the whole pipeline once: load the data, classify the test digits, print accuracy.
testHandWritingClass()

step1: load data...
---Getting training set...
---Getting testing set...
step2: training...
step3: testing...
