In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

DF = pd.DataFrame
arr = np.array

前面调参只是一直在调K的值，但是KNN实际除了K这个参数还有距离度量方式这个参数，因此这里便来测试这个新的参数。

# 读取数据

In [2]:
# Build every path with os.path.join so the cell also runs outside Windows
# (the hard-coded "\\" separators only resolve correctly there).
dirPath = os.path.join("data preprocessed", "unnormalized")

# Each CSV is parsed as a purely numeric, comma-separated matrix.
trainSet_origin = np.loadtxt(os.path.join(dirPath, 'train.csv'), delimiter=",")
validateSet_origin = np.loadtxt(os.path.join(dirPath, 'validate.csv'), delimiter=",")
testSet = np.loadtxt(os.path.join(dirPath, 'test.csv'), delimiter=",")

# Show the shape of the three splits as the cell output.
trainSet_origin.shape, validateSet_origin.shape, testSet.shape

((33600, 18), (14400, 18), (12000, 17))

划分数据集特征和标签：

In [4]:
# Stack the training and validation splits into one training matrix.
# np.vstack is used instead of np.row_stack: the latter is only a legacy
# alias of vstack and is deprecated in NumPy 2.0.
trainSet_merge = np.vstack([trainSet_origin, validateSet_origin])
trainSet_merge.shape

# Every column except the last is a feature; the last column is the label.
trainSet, trainSetLabel = trainSet_merge[:, :-1], trainSet_merge[:, -1]
trainSet.shape

(48000, 18)

(48000, 17)

# 实现评测指标

In [5]:
def calcF1(predictLabel, realLabel, printRes=False):
    '''Compute the F1 score of a binary prediction.

    A label equal to 1 is the positive class; any other value is treated as
    negative.

    Parameters
    ----------
    predictLabel : array-like
        Labels produced by the classifier.
    realLabel : array-like
        Ground-truth labels, aligned element-wise with ``predictLabel``.
    printRes : bool, optional
        When True, print the TP / FN / FP counts before returning.

    Returns
    -------
    float
        ``2*TP / (2*TP + FN + FP)``, or ``0.0`` when that denominator is zero
        (no positive label in either the prediction or the ground truth).
    '''
    # np.asarray lets plain Python lists be compared element-wise as well.
    predictedPositive = np.asarray(predictLabel) == 1
    actualPositive = np.asarray(realLabel) == 1

    # count_nonzero counts in C instead of iterating the mask in Python.
    TP = int(np.count_nonzero(predictedPositive & actualPositive))
    FN = int(np.count_nonzero(~predictedPositive & actualPositive))
    FP = int(np.count_nonzero(predictedPositive & ~actualPositive))
    if printRes:
        print("TP FN FP", TP, FN, FP)

    denominator = 2*TP + FN + FP
    # Guard the division: with no positives at all the score is defined as 0.
    return 2*TP / denominator if denominator != 0 else 0.0

# Smoke test 1: one TP, one FN and one FP, so the expected F1 is 0.5.
print('test 1: normal one')
calcF1(arr([1, 1, 0, 0]), arr([1, 0, 1, 0]), True)
# Smoke test 2: one TP, one FN and no FP, so the expected F1 is 2/3.
# NOTE(review): despite its label this input never reaches the
# zero-denominator branch (that needs TP = FN = FP = 0).
print('\ntest 2: may have zero division error')
calcF1(arr([0, 0, 1, 0]), arr([1, 0, 1, 0]), True)

test 1: normal one
TP FN FP 1 1 1


0.5


test 2: may have zero division error
TP FN FP 1 1 0


0.6666666666666666

# 距离度量方式

欧氏距离之前测试过了，这里便测试下其他9种距离。

参考：https://www.cnblogs.com/heaad/archive/2011/03/08/1977733.html

In [6]:
class distanceMetric:
    '''Distance measures between every row of a sample matrix and one query vector.

    The measure is chosen by ``tag`` and evaluated through ``calcDistance``:
    1 = Manhattan, 2 = Chebyshev, 3 = Cosine, 4 = Mahalanobis, and any other
    value = Correlation.  Every implemented measure returns one value per row
    of ``dataSet``, where a smaller value means "closer".
    '''

    def __init__(self, tag):
        # Identifier of the metric that calcDistance dispatches to.
        self.tag = tag

    def Manhattan(self, dataSet, inputVector):
        '''Manhattan (L1) distance: sum of absolute coordinate differences.'''
        diff = np.abs(dataSet - inputVector)
        return diff.sum(axis=1)

    def Chebyshev(self, dataSet, inputVector):
        '''Chebyshev (L-infinity) distance: largest absolute coordinate difference.'''
        diff = np.abs(dataSet - inputVector)
        return np.max(diff, axis=1)

    def Minkowski(self, dataSet, inputVector):
        '''Minkowski distance (not implemented).

        Minkowski is a family of distances with a parameter p rather than a
        single distance: p=1 is Manhattan, p=2 is Euclidean and p -> infinity
        is Chebyshev.

        Raises
        ------
        NotImplementedError
            Always; raising is clearer than silently returning None.
        '''
        raise NotImplementedError("Minkowski distance is not implemented")

    def StandardizedEuclidean(self, dataSet, inputVector):
        '''Standardized Euclidean distance (not implemented).

        It was already evaluated and found unsuitable for this project.

        Raises
        ------
        NotImplementedError
            Always.
        '''
        raise NotImplementedError("Standardized Euclidean distance is not implemented")

    def Mahalanobis(self, dataSet, inputVector):
        '''Mahalanobis distance from every row of ``dataSet`` to ``inputVector``.

        The feature covariance is estimated from the columns of ``dataSet``
        and its pseudo-inverse is used, so a singular covariance matrix is
        tolerated.

        Raises
        ------
        ValueError
            If ``dataSet`` has fewer than two rows, because a covariance
            cannot be estimated from a single sample.
        '''
        dataSet = np.asarray(dataSet, dtype=float)
        if dataSet.shape[0] < 2:
            raise ValueError(
                "Mahalanobis distance needs at least two samples to estimate the covariance")

        # rowvar=False: rows are observations, columns are variables (features).
        # atleast_2d keeps pinv working when there is only one feature.
        covariance = np.atleast_2d(np.cov(dataSet, rowvar=False))
        inverseCov = np.linalg.pinv(covariance)

        diff = dataSet - inputVector
        # Row-wise quadratic form diff_i^T * inverseCov * diff_i.
        squared = np.sum(np.dot(diff, inverseCov) * diff, axis=1)
        # Rounding can push a zero quadratic form slightly below zero.
        return np.sqrt(np.maximum(squared, 0.0))

    def Cosine(self, dataSet, inputVector):
        '''Cosine distance, i.e. ``1 - cos(angle)`` between each row and the query.

        The value is 0 for vectors pointing the same way and 2 for opposite
        vectors.  A distance (not the raw similarity) is returned so that,
        like the other metrics, the nearest neighbour has the smallest value.
        A zero-length vector yields nan.
        '''
        dots = np.dot(dataSet, inputVector.T)
        rowNorms = np.sqrt(np.sum(dataSet**2, axis=1))
        queryNorm = np.sqrt(np.sum(inputVector**2))
        similarity = dots / (rowNorms * queryNorm)
        # Clip rounding overshoot so the distance never turns negative.
        return 1.0 - np.clip(similarity, -1.0, 1.0)

    def Hamming(self, dataSet, inputVector):
        '''Hamming distance (not implemented).

        For two equal-length strings it is the minimum number of
        substitutions turning one into the other, e.g. "1111" vs "1001" is 2.
        It may suit discrete variables but is unlikely to work when applied
        directly to continuous data.

        Raises
        ------
        NotImplementedError
            Always.
        '''
        raise NotImplementedError("Hamming distance is not implemented")

    def JaccardSimilarity(self, dataSet, inputVector):
        '''Jaccard similarity coefficient (not implemented).

        It is the size of the intersection of two sets divided by the size of
        their union; it is probably not applicable to this project either.

        Raises
        ------
        NotImplementedError
            Always.
        '''
        raise NotImplementedError("Jaccard similarity is not implemented")

    def Correlation(self, dataSet, inputVector):
        '''Correlation distance: ``1 - Pearson r`` between each row and the query.

        Computed for all rows at once instead of calling np.corrcoef per row.
        A constant row (zero variance) yields nan, as np.corrcoef would.
        '''
        dataSet = np.asarray(dataSet, dtype=float)
        inputVector = np.asarray(inputVector, dtype=float)

        centredRows = dataSet - dataSet.mean(axis=1, keepdims=True)
        centredQuery = inputVector - inputVector.mean()

        numerator = np.dot(centredRows, centredQuery)
        denominator = np.sqrt(np.sum(centredRows**2, axis=1) * np.sum(centredQuery**2))
        # Clip rounding overshoot so r stays inside [-1, 1].
        return 1.0 - np.clip(numerator / denominator, -1.0, 1.0)

    def calcDistance(self, dataSet, inputVector):
        '''Evaluate the metric selected by ``self.tag``.

        Tags 1-4 select Manhattan, Chebyshev, Cosine and Mahalanobis; any
        other tag falls through to Correlation.
        '''
        if self.tag == 1:
            return self.Manhattan(dataSet, inputVector)
        elif self.tag == 2:
            return self.Chebyshev(dataSet, inputVector)
        elif self.tag == 3:
            return self.Cosine(dataSet, inputVector)
        elif self.tag == 4:
            return self.Mahalanobis(dataSet, inputVector)
        else:
            return self.Correlation(dataSet, inputVector)

############### 函数测试 ##################                
# distanceMetric(1).calcDistance(arr([[1,1,0,0],
#                                     [0,1,1,1],
#                                     [1,0,0,1]]), arr([0,1,1,0]))

# distanceMetric(2).calcDistance(arr([[2,1,0,0],
#                                     [0,1,8,1],
#                                     [1,0,0,6]]), arr([0,1,1,0]))

# distanceMetric(4).calcDistance(arr([[2,1,0,0],
#                                     [0,1,8,1],
#                                     [1,0,0,6]]), arr([0,1,1,0]))
                    
# distanceMetric(3).calcDistance(arr([[2,1,0,0],
#                                     [0,1,8,1],
#                                     [1,0,0,6]]), arr([0,1,1,0]))

# distanceMetric(5).calcDistance(arr([[1,2,3,4]]), arr([3,8,7,6]))

# KNN

In [7]:
def knn_classify(dataSet, labels, k, inputVector, distanceMetricID):
    '''Classify one vector with distance-weighted k-nearest-neighbours.

    Parameters
    ----------
    dataSet : ndarray
        Training samples, one per row.
    labels : array-like
        Label of each training sample, aligned with the rows of ``dataSet``.
    k : int
        Number of neighbours to consult; capped at the number of samples.
    inputVector : ndarray
        The sample to classify.
    distanceMetricID : int
        Tag handed to ``distanceMetric`` to select the distance measure.

    Returns
    -------
    The label whose neighbours have the largest summed weight, or ``np.nan``
    when no neighbour can be consulted (``k`` < 1 or no training samples).
    Each neighbour weighs ``1/distance``; if some neighbours match the query
    exactly (distance 0), only those vote, one vote each.
    '''
    dm = distanceMetric(distanceMetricID)
    distances = np.asarray(dm.calcDistance(dataSet, inputVector))

    # argpartition raises when asked for more neighbours than there are samples.
    k = min(k, distances.shape[0])
    if k < 1:
        return np.nan

    # Indices of the k nearest samples (unordered among themselves).
    kNeighborsIndex = distances.argpartition(k-1)[0:k]
    kDistances = distances[kNeighborsIndex]
    kLabels = np.asarray(labels)[kNeighborsIndex]

    # A zero distance would make 1/distance divide by zero; exact matches are
    # decisive, so let them vote alone with unit weight.
    exactMatch = kDistances == 0
    if exactMatch.any():
        kWeights = exactMatch.astype(float)
    else:
        kWeights = 1 / kDistances

    # np.nan is the canonical spelling; np.NAN is a legacy alias.
    predictLabel = np.nan
    maxWeight = -np.inf
    for candidate in np.unique(kLabels):
        weight = np.sum(kWeights[kLabels == candidate])
        # Strict '>' keeps the smallest label when weights tie.
        if weight > maxWeight:
            maxWeight = weight
            predictLabel = candidate
    return predictLabel

############## smoke test #######################
# Three toy samples, all three used as neighbours (k=3), metric id 2.
knn_classify(
    dataSet=arr([[1, 1, 0, 0],
                 [0, 1, 1, 1],
                 [1, 0, 0, 1]]).copy(),
    labels=arr([1, 0, 1]),
    k=3,
    inputVector=arr([0, 1, 1, 0]),
    distanceMetricID=2,
)
############## smoke test #######################

1

# 调参

所调参数为K和距离度量方式（distanceId）。

In [10]:
def run_knn_classify(dataSet, k, distanceId):
    '''Classify every row of ``dataSet`` with KNN and return the predicted labels.

    The notebook-level ``trainSet`` / ``trainSetLabel`` arrays are used as the
    training data.

    Parameters
    ----------
    dataSet : ndarray
        Samples to classify, one per row.
    k : int
        Number of neighbours passed to ``knn_classify``.
    distanceId : int
        Distance metric identifier passed to ``knn_classify``.

    Returns
    -------
    ndarray
        One predicted label per row of ``dataSet``.
    '''
    # tqdm.tnrange is deprecated and warns on every call; tqdm.notebook.trange
    # is the same progress bar under its current name.
    from tqdm.notebook import trange

    ansLabel = [
        knn_classify(trainSet, trainSetLabel, k, dataSet[i], distanceId)
        for i in trange(len(dataSet), leave=False)
    ]
    return arr(ansLabel)

def getTestF1(k, distanceId):
    '''Return the F1 score obtained on the validation set for one (k, metric) pair.

    NOTE(review): reads the notebook-level names ``validateSet`` and
    ``validateSetLabel``, which no cell shown here defines - confirm they are
    created before this function is called.
    '''
    return calcF1(run_knn_classify(validateSet, k, distanceId), validateSetLabel)

## 使用曼哈顿距离

In [13]:
# Build the output path portably (the original '.\\rank\\...' literal only
# resolves to a sub-directory on Windows) and make sure the folder exists
# before starting the long prediction run.
outputPath = os.path.join('.', 'rank', '47_v1.csv')
os.makedirs(os.path.dirname(outputPath), exist_ok=True)

# k = 1 with metric id 1 (Manhattan distance).
ansLabel = run_knn_classify(testSet, 1, 1)
# One integer label per line.
np.savetxt(outputPath, ansLabel, fmt="%d", delimiter="\n")

