In [11]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
InteractiveShell.ast_node_interactivity = "all"

In [28]:
import pandas as pd

def loadDataSet(filePath):
    '''读取数据集函数'''
    #读取CSV文件
    df = pd.read_csv(filePath)
    #得到数据集标签
    label = list(df['label'].values)  
    #得到数据集
    dataSet = [i.strip().split(' ') for i in list(df['Words'].values)]
    #得到数据集的所有不重复的词
    allWords = list(set([j for i in dataSet for j in i]))
    
    ############输出数据集相关信息###########################
    #输出第一行数据
    print('【data preview】:')
    display(df)
    #输出所有label的分布
    print('【count of all kind of labels】:\n')
    print(df['label'].value_counts())
    #输出所有的词的个数
    print('【number of all words】: ', len(allWords))
    print('【number of texts】: ', len(df))
    ############输出数据集相关信息###########################
    
    return dataSet, label, allWords

'positive'

In [13]:
trainSet, trainSet_label, allWords_trainSet = loadDataSet('.\\data\\train_set.csv')

【data preview】:


Unnamed: 0,Words,label
0,I can not find test,sad
1,hello this is my girlfriend,sad
2,She is luhan's girlfriend,sad
3,I am not his girfriend,sad
4,I can not stand his girlfriend,anger


【count of all kind of labels】:

sad      4
anger    1
Name: label, dtype: int64
【number of all words】:  16
【number of texts】:  5


In [14]:
testSet, _ , allWords_testSet = loadDataSet('.\\data\\test_set.csv')

【data preview】:


Unnamed: 0,Words,label
0,I can not stand I am not luhan's grilfriend,?


【count of all kind of labels】:

?    1
Name: label, dtype: int64
【number of all words】:  7
【number of texts】:  1


In [16]:
allWords_train_test = list(set(allWords_trainSet).union(allWords_testSet))
allWords_train_test

['is',
 'can',
 'girfriend',
 'girlfriend',
 'my',
 'She',
 "luhan's",
 'stand',
 'grilfriend',
 'this',
 'test',
 'find',
 'am',
 'hello',
 'his',
 'not',
 'I']

In [17]:
from collections import Counter

def getTF(dataSet, allWords):
    '''得到输入数据集的TF矩阵'''
    TF=[]
    for index, doc in enumerate(dataSet):
        TF.append([])
        wordCounter = Counter(doc)
        for word in allWords:
            TF[index].append(wordCounter.get(word,0)/len(doc))
    return TF

#得到训练集、测试集的TF矩阵
TF_trainSet = getTF(trainSet, allWords_train_test)
TF_testSet = getTF(testSet, allWords_train_test)

In [26]:
# import numpy as np
# np.array(TF_trainSet)
# np.array(TF_testSet)

In [38]:
import numpy as np
from collections import Counter

def knn_classify(dataSet, labels, k, inputVector, printTopK=True):
    '''使用knn对输入的向量进行分类，使用欧式距离'''
    #得到输入向量与数据集的差值的平方
    diffMat = (np.array(dataSet) - np.array(inputVector))**2
    #计算每一行上元素的和并开方，得到距离向量
    distances = np.sqrt(diffMat.sum(axis=1))
    #得到 K个近邻的下标
    kNeighborsIndex  = distances.argpartition(k-1)[0:k]
    if printTopK:
        print('index:', kNeighborsIndex)
        print('dis: ', distances[kNeighborsIndex])
        print('label:', np.array(labels)[kNeighborsIndex])
    #返回分类结果
    return Counter(np.array(labels)[kNeighborsIndex]).most_common(1)[0][0]

knn_classify([[1,1,0,0],
              [0,1,1,1],
              [1,0,0,1]],['positive','negative','positive'], 3, [0,1,1,0])

index: [1 0 2]
dis:  [ 1.          1.41421356  2.        ]
label: ['negative' 'positive' 'positive']


'positive'

In [39]:
def run_knn_classify(k):
    '''输出基于【欧式距离】+【TF矩阵】的3NN分类的3个最近样本，并输出最终预测结果'''
    for index, wordVector in enumerate(TF_testSet):
        ans = knn_classify(TF_trainSet, trainSet_label, k, wordVector)
        print(ans)
run_knn_classify(3)

index: [4 3 0]
dis:  [ 0.32394177  0.3718489   0.3718489 ]
label: ['anger' 'sad' 'sad']
sad
