# In [2]:
# knn实现约会网站对象分类
import numpy as np
import operator
import matplotlib.pyplot as plt


def file2matrix(filename):
    """Load a tab-separated dating data file into a feature matrix and labels.

    Each non-blank line is expected to hold three numeric features followed
    by an integer class label, separated by tabs. The three features are:

    1. frequent-flyer miles earned per year,
    2. percentage of time spent playing video games,
    3. litres of ice cream consumed per week.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(return_mat, class_label_vector)`` where ``return_mat`` is
        a float NumPy array of shape ``(num_rows, 3)`` and
        ``class_label_vector`` is a list of ``int`` labels taken from the
        last field of every row.

    Raises:
        ValueError: If a feature cannot be parsed as a float or the last
            field cannot be parsed as an int.
    """
    # The context manager guarantees the handle is closed even on errors.
    with open(filename) as fr:
        stripped_lines = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at end of file) carry no sample.
    data_lines = [line for line in stripped_lines if line]

    return_mat = np.zeros((len(data_lines), 3))  # matrix returned to caller
    class_label_vector = []
    for index, line in enumerate(data_lines):
        list_from_line = line.split('\t')
        return_mat[index, :] = [float(value) for value in list_from_line[0:3]]
        class_label_vector.append(int(list_from_line[-1]))
    return return_mat, class_label_vector
 
    
def plot(dating_data_mat, class_label_vector):
    """Show a scatter plot of the first two feature columns.

    Column 0 is drawn on the x axis and column 1 on the y axis. Each point's
    marker size and colour value are both ``15.0 * label``, so samples from
    different classes are visually distinguishable.

    Args:
        dating_data_mat: 2-D array of samples; only columns 0 and 1 are used.
        class_label_vector: Sequence of numeric class labels, one per row.
    """
    # One scaled copy of the labels drives both marker size and colour.
    scaled_labels = 15.0 * np.array(class_label_vector)
    x_values = dating_data_mat[:, 0]
    y_values = dating_data_mat[:, 1]

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.scatter(x_values, y_values, s=scaled_labels, c=scaled_labels)
    plt.show()
    
    
def auto_norm(dating_data_mat):
    """Min-max normalise every feature column.

    Applies ``new_value = (old_value - min) / (max - min)`` column by column.
    A column whose values are all equal has a range of zero; such a column is
    mapped to 0.0 instead of producing NaN from a division by zero.

    Args:
        dating_data_mat: 2-D NumPy array with one sample per row and one
            feature per column.

    Returns:
        A tuple ``(normed_dating_data_mat, ranges_vector, min_val_vector)``:
        the normalised matrix, the per-column ``max - min`` ranges (reported
        unmodified, so a constant column has range 0) and the per-column
        minima.
    """
    min_val_vector = dating_data_mat.min(0)  # axis 0: minimum of each column
    max_val_vector = dating_data_mat.max(0)
    ranges_vector = max_val_vector - min_val_vector
    # Divide constant columns by 1 so they become 0 instead of NaN/inf.
    safe_ranges = np.where(ranges_vector == 0, 1, ranges_vector)
    # Broadcasting applies the per-column vectors to every row.
    normed_dating_data_mat = (dating_data_mat - min_val_vector) / safe_ranges
    return normed_dating_data_mat, ranges_vector, min_val_vector


def knn(input_x, dataset, labels, k):
    """Classify one sample with the k-nearest-neighbours rule.

    The Euclidean distance from ``input_x`` to every row of ``dataset`` is
    computed, the ``k`` closest rows vote with their labels, and the label
    with the most votes is returned. When several labels share the top vote
    count, the one met first while walking the neighbours from nearest to
    farthest wins, because the ranking sort is stable.

    Args:
        input_x: Feature vector to classify.
        dataset: 2-D NumPy array of training samples, one per row.
        labels: Labels of the training samples, indexable by row position.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The winning label.
    """
    n_samples = dataset.shape[0]

    # Euclidean distance between the query and every training row.
    offsets = np.tile(input_x, (n_samples, 1)) - dataset
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearest_first = distances.argsort()  # row indices, closest row first

    # Count the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in votes:
            votes[neighbour_label] += 1
        else:
            votes[neighbour_label] = 1

    # Rank labels by descending vote count and return the top label.
    ranking = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    winner = ranking[0][0]
    return winner


def test(normed_dating_data_mat, class_label_vector, ho_ratio=0.10, k=3):
    """Measure the classifier's error rate with a simple hold-out split.

    The first ``ho_ratio`` fraction of the rows is used as the test set and
    all remaining rows form the training set. Every test row is classified
    with ``knn`` and compared against its known label.

    Args:
        normed_dating_data_mat: 2-D array of normalised samples.
        class_label_vector: Known label of every row.
        ho_ratio: Fraction of rows (taken from the top) held out for testing.
        k: Number of neighbours passed to ``knn``.

    Returns:
        The error rate, i.e. misclassified test rows divided by the number
        of test rows. The same value is printed.

    Raises:
        ValueError: If the hold-out split contains no rows, which would make
            the error rate undefined.
    """
    num_of_data = normed_dating_data_mat.shape[0]
    num_of_test_data = int(num_of_data * ho_ratio)
    if num_of_test_data < 1:
        raise ValueError(
            'hold-out split is empty: %d rows with ho_ratio=%r'
            % (num_of_data, ho_ratio)
        )

    # The training slice is the same for every test row, so build it once.
    training_mat = normed_dating_data_mat[num_of_test_data:num_of_data, :]
    training_labels = class_label_vector[num_of_test_data:num_of_data]

    error_count = 0
    for i in range(num_of_test_data):
        classifier_result = knn(normed_dating_data_mat[i, :],
                                training_mat,
                                training_labels,
                                k)
        if classifier_result != class_label_vector[i]:
            error_count += 1

    error_rate = error_count / float(num_of_test_data)
    print('The total error rate is: %f' % error_rate)
    return error_rate
    
    
def main():
    """Load the dating data set, normalise it and report the error rate."""
    data_path = './datasets/datingTestSet2.txt'
    features, labels = file2matrix(data_path)
    # auto_norm also returns the column ranges and minima; unused here.
    normed_features, _ranges, _minima = auto_norm(features)
    test(normed_features, labels)
    
    
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

# Output: The total error rate is: 0.050000
