# 实验内容

```
data
├─classification_dataset
│      15351234_Sample_KNN_classification.csv
│      15351234_Sample_NB_classification.csv
│      test_set.csv
│      train_set.csv
│      validation_set.csv
│
└─regression_dataset
        15351234_Sample_KNN_regression.csv
        15351234_Sample_NB_regression.csv
        test_set.csv
        train_set.csv
        validation_set.csv
        validation相关度评估.xlsx
```


# 分类

## 数据预处理及分析


### 数据读取函数实现

In [63]:
import pandas as pd

def loadDataSet(filePath):
    '''Load a classification CSV and return its texts, labels and vocabulary.

    The file must contain a ``label`` column and a
    ``Words (split by space)`` column. A short summary of the data is
    printed as a side effect.

    Args:
        filePath: path of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A ``(dataSet, label, allWords)`` tuple: every text as a list of
        words, the values of the ``label`` column, and the sorted list of
        distinct words found in the texts.
    '''
    try:
        # Rich table rendering when running inside IPython / Jupyter.
        from IPython.display import display
    except ImportError:
        # Outside a notebook fall back to plain text output.
        display = print

    # Read the CSV file.
    df = pd.read_csv(filePath)
    # Labels, one per text.
    label = list(df['label'].values)
    # Tokenise every text on single spaces.
    dataSet = [text.strip().split(' ') for text in df['Words (split by space)'].values]
    # Distinct words. They are sorted because the iteration order of a set of
    # strings changes from one interpreter run to the next, which would make
    # the one-hot column order (and anything derived from it) irreproducible.
    allWords = sorted({word for doc in dataSet for word in doc})

    ############ data set summary ###########################
    # First row of the raw table.
    print('【one line\'s data preview】:')
    display(df.head(1))
    # Distribution of the labels.
    print('【count of all kind of labels】:\n')
    print(df['label'].value_counts())
    # Vocabulary size and number of texts.
    print('【number of all words】: ', len(allWords))
    print('【number of texts】: ', len(df))
    ############ data set summary ###########################

    return dataSet, label, allWords

### 读取训练集、验证集、测试集

In [64]:
# Forward slashes resolve on Windows as well as on POSIX systems, whereas the
# backslash-separated form only resolves on Windows.
dataPath = './data/classification_dataset/'
# Load the training split; its vocabulary defines the one-hot columns used for
# every split.
trainSet, trainSet_label, allWords_trainSet = loadDataSet(dataPath+'train_set.csv')

【one line's data preview】:


Unnamed: 0,Words (split by space),label
0,europe retain trophy with big win,joy


【count of all kind of labels】:

joy         222
sad         132
surprise    113
fear         95
anger        41
disgust      20
Name: label, dtype: int64
【number of all words】:  2087
【number of texts】:  623


In [73]:
# Load the validation split; its own vocabulary (third return value) is discarded.
validateSet, validateSet_label, _ = loadDataSet(dataPath+'validation_set.csv')

【one line's data preview】:


Unnamed: 0,Words (split by space),label
0,marijuana helps ease hiv nerve pain study says,surprise


【count of all kind of labels】:

joy         112
sad          65
fear         54
surprise     46
anger        21
disgust      13
Name: label, dtype: int64
【number of all words】:  1235
【number of texts】:  311


In [74]:
# Load the test split; only the texts are kept, its labels and vocabulary are discarded.
testSet, _ , _ = loadDataSet(dataPath+'test_set.csv')

【one line's data preview】:


Unnamed: 0,textid,Words (split by space),label
0,1,senator carl krueger thinks ipods can kill you,?


【count of all kind of labels】:

?    312
Name: label, dtype: int64
【number of all words】:  1273
【number of texts】:  312


### 得到3个数据集的onehot矩阵

In [10]:
def getOneHot(dataSet, allWords):
    '''Build the one-hot (word presence) matrix of a data set.

    Args:
        dataSet: iterable of documents, each a collection of words.
        allWords: the vocabulary; one output column per entry, in this order.

    Returns:
        A list with one row per document; a cell is 1 when the vocabulary
        word occurs in the document and 0 otherwise.
    '''
    return [
        [1 if word in doc else 0 for word in allWords]
        for doc in dataSet
    ]

# Build the one-hot matrices of the training, validation and test sets.
# All three are encoded against the training vocabulary so their columns line up.
oneHot_trainSet = getOneHot(trainSet, allWords_trainSet)
oneHot_validateSet = getOneHot(validateSet, allWords_trainSet)
oneHot_testSet = getOneHot(testSet, allWords_trainSet)

## knn分类算法

### 分类函数实现及简单测试

In [11]:
import numpy as np
from collections import Counter

def knn_classify(dataSet, labels, k, inputVector):
    '''Classify a vector by majority vote of its k nearest neighbours.

    Distances are Euclidean.

    Args:
        dataSet: 2-D array-like of training vectors, one per row.
        labels: sequence holding the label of every row of ``dataSet``.
        k: number of neighbours that vote; values larger than the number
            of training vectors are reduced to that number.
        inputVector: the vector to classify, same width as a row of ``dataSet``.

    Returns:
        The most frequent label among the k nearest training vectors.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``dataSet`` is empty.
    '''
    samples = np.asarray(dataSet)
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))
    if len(samples) == 0:
        raise ValueError('dataSet must contain at least one sample')
    # argpartition rejects a kth index beyond the array, so never ask for
    # more neighbours than there are samples.
    k = min(k, len(samples))

    # Squared differences between the input vector and every sample.
    diffMat = (samples - np.asarray(inputVector))**2
    # Row sums followed by a square root give the distance vector.
    distances = np.sqrt(diffMat.sum(axis=1))
    # Indices of the k nearest neighbours (in no particular order).
    kNeighborsIndex = distances.argpartition(k-1)[0:k]
    # Majority vote.
    return Counter(np.array(labels)[kNeighborsIndex]).most_common(1)[0][0]

# Smoke test: with k=3 all three samples vote, and two of them are 'positive'.
knn_classify([[1,1,0,0],
              [0,1,1,1],
              [1,0,0,1]],['positive','negative','positive'], 3, [0,1,1,0])

'positive'

In [34]:
# Distance metric names, evaluated as bare expressions so the cell just echoes
# them; knn_classify above implements only the Euclidean distance.
'euclidean'
'manhattan'
'cosine'

'euclidean'

'manhattan'

'cosine'

### 使用验证集调参

In [200]:
def run_knn_classify(k):
    '''Print the accuracy of k-NN classification on the validation set.

    Every row of ``oneHot_validateSet`` is classified against the training
    set and compared with the matching entry of ``validateSet_label``.
    Used to tune ``k``.

    Args:
        k: number of neighbours passed to ``knn_classify``.
    '''
    hitNum = 0
    # Iterate over the validation vectors. The previous version looped over
    # an unrelated global named ``dataSet`` while still scoring against the
    # validation labels.
    for index, wordVector in enumerate(oneHot_validateSet):
        ans = knn_classify(oneHot_trainSet, trainSet_label, k, wordVector)
        if ans == validateSet_label[index]:
            hitNum += 1
    print("k = %2d : accuracy: %.5f%%" % (k, 100*hitNum/len(oneHot_validateSet)))
    
# Report the validation accuracy for every k from 1 to 49.
for k in range(1,50):
    run_knn_classify(k)

k =  1 : accuray: 37.29904%
k =  2 : accuray: 25.72347%
k =  3 : accuray: 32.79743%
k =  4 : accuray: 30.54662%
k =  5 : accuray: 32.79743%
k =  6 : accuray: 32.47588%
k =  7 : accuray: 34.08360%
k =  8 : accuray: 36.01286%
k =  9 : accuray: 36.33441%
k = 10 : accuray: 38.26367%
k = 11 : accuray: 38.58521%
k = 12 : accuray: 39.54984%
k = 13 : accuray: 39.22830%
k = 14 : accuray: 42.12219%
k = 15 : accuray: 40.19293%
k = 16 : accuray: 41.15756%
k = 17 : accuray: 38.90675%
k = 18 : accuray: 37.94212%
k = 19 : accuray: 37.29904%
k = 20 : accuray: 38.58521%
k = 21 : accuray: 38.58521%
k = 22 : accuray: 38.90675%
k = 23 : accuray: 37.62058%
k = 24 : accuray: 37.94212%
k = 25 : accuray: 37.62058%
k = 26 : accuray: 37.62058%
k = 27 : accuray: 37.62058%
k = 28 : accuray: 37.62058%
k = 29 : accuray: 37.94212%
k = 30 : accuray: 37.29904%
k = 31 : accuray: 37.29904%
k = 32 : accuray: 37.62058%
k = 33 : accuray: 38.90675%
k = 34 : accuray: 39.54984%
k = 35 : accuray: 39.54984%
k = 36 : accuray: 39

### 对测试集进行分类

In [27]:
# Classify every test vector with the chosen k and write the result to a CSV file.
outputFileName = "15352220_linzecheng_KNN_classification.csv"
bestK = 14
# One (textid, label) pair per test vector; text ids are 1-based row numbers.
textid_and_label = [
    (textid, knn_classify(oneHot_trainSet, trainSet_label, bestK, wordVector))
    for textid, wordVector in enumerate(oneHot_testSet, start=1)
]

res = pd.DataFrame(textid_and_label, columns=['textid','label'])
res.to_csv(outputFileName, index=False)

In [33]:
# Distribution of the labels predicted for the test set.
res['label'].value_counts()

joy         259
fear         32
sad          20
surprise      1
Name: label, dtype: int64

# 回归

## 数据预处理及分析

### 数据读取函数实现

In [61]:
import pandas as pd

def loadDataSet2(filePath):
    '''Load a regression CSV and return its texts, emotion scores and vocabulary.

    The file must contain a ``Words (split by space)`` column and the six
    emotion columns ``anger``, ``disgust``, ``fear``, ``joy``, ``sad`` and
    ``surprise``. A short summary of the data is printed as a side effect.

    Args:
        filePath: path of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A ``(dataSet, label, allWords)`` tuple: every text as a list of
        words, a dict mapping each emotion name to the list of that
        column's values, and the sorted list of distinct words.
    '''
    try:
        # Rich table rendering when running inside IPython / Jupyter.
        from IPython.display import display
    except ImportError:
        # Outside a notebook fall back to plain text output.
        display = print

    emotions = ('anger', 'disgust', 'fear', 'joy', 'sad', 'surprise')

    # Read the CSV file.
    df = pd.read_csv(filePath)
    # One list of scores per emotion column.
    label = {emotion: list(df[emotion].values) for emotion in emotions}
    # Tokenise every text on single spaces.
    dataSet = [text.strip().split(' ') for text in df['Words (split by space)'].values]
    # Distinct words. They are sorted because the iteration order of a set of
    # strings changes from one interpreter run to the next, which would make
    # the one-hot column order (and anything derived from it) irreproducible.
    allWords = sorted({word for doc in dataSet for word in doc})

    ############ data set summary ###########################
    # First row of the raw table.
    print('【one line\'s data preview】:')
    display(df.head(1))
    # Summary statistics of the emotion scores.
    print('【some summary statistics of labels】:')
    print(df[['anger','disgust','fear','joy','sad','surprise']].describe())
    # Vocabulary size and number of texts.
    print('【number of all words】: ', len(allWords))
    print('【number of texts】: ', len(df))
    ############ data set summary ###########################

    return dataSet, label, allWords

### 读取训练集、验证集、测试集

In [69]:
# Forward slashes resolve on Windows as well as on POSIX systems, whereas the
# backslash-separated form only resolves on Windows.
dataPath2 = './data/regression_dataset/'
# Load the training split; its vocabulary defines the one-hot columns used for
# every split.
trainSet2, trainSet_label2, allWords_trainSet2 = loadDataSet2(dataPath2+'train_set.csv')

【one line's data preview】:


Unnamed: 0,Words (split by space),anger,disgust,fear,joy,sad,surprise
0,europe retain trophy with big win,0.0,0.0,0.0,0.8721,0.0,0.1279


【some summary statistics of labels】:
            anger     disgust        fear         joy         sad    surprise
count  623.000000  623.000000  623.000000  623.000000  623.000000  623.000000
mean     0.086573    0.052949    0.157176    0.281344    0.191442    0.230517
std      0.123334    0.090709    0.174959    0.317420    0.206464    0.199993
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.090100
50%      0.022700    0.000000    0.115400    0.130800    0.142900    0.176500
75%      0.144950    0.083300    0.255450    0.566950    0.293450    0.326750
max      0.753400    0.571400    0.895800    1.000000    1.000000    1.000000
【number of all words】:  2087
【number of texts】:  623


In [70]:
# Load the validation split; its own vocabulary (third return value) is discarded.
validateSet2, validateSet_label2, _ = loadDataSet2(dataPath2+'validation_set.csv')

【one line's data preview】:


Unnamed: 0,Words (split by space),anger,disgust,fear,joy,sad,surprise
0,marijuana helps ease hiv nerve pain study says,0.0,0.0,0.0744,0.2727,0.0992,0.5537


【some summary statistics of labels】:
            anger     disgust        fear         joy         sad    surprise
count  311.000000  311.000000  311.000000  311.000000  311.000000  311.000000
mean     0.085478    0.062534    0.151173    0.287755    0.194680    0.218382
std      0.125672    0.110057    0.175016    0.310162    0.208836    0.189515
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.073650
50%      0.012200    0.000000    0.088900    0.177800    0.138900    0.181800
75%      0.150600    0.098400    0.245550    0.553650    0.300900    0.320900
max      0.777800    0.785700    0.815400    1.000000    0.903200    1.000000
【number of all words】:  1235
【number of texts】:  311


In [75]:
# Load the test split; only the texts are kept, its labels and vocabulary are discarded.
testSet2, _ , _ = loadDataSet2(dataPath2+'test_set.csv')

【one line's data preview】:


Unnamed: 0,textid,Words (split by space),anger,disgust,fear,joy,sad,surprise
0,1,senator carl krueger thinks ipods can kill you,?,?,?,?,?,?


【some summary statistics of labels】:
       anger disgust fear  joy  sad surprise
count    312     312  312  312  312      312
unique     1       1    1    1    1        1
top        ?       ?    ?    ?    ?        ?
freq     312     312  312  312  312      312
【number of all words】:  1273
【number of texts】:  312


###  得到3个数据集的onehot矩阵

In [77]:
# Build the one-hot matrices of the training, validation and test sets.
# All three are encoded against the training vocabulary so their columns line up.
oneHot_trainSet2 = getOneHot(trainSet2, allWords_trainSet2)
oneHot_validateSet2 = getOneHot(validateSet2, allWords_trainSet2)
oneHot_testSet2 = getOneHot(testSet2, allWords_trainSet2)

## knn回归算法

### 权值归一化函数实现

不同意PPT里的权值归一化的说法，下面这个函数暂时不会使用。

In [230]:
def weightNormalize(weight, method='min-max'):
    '''Normalise a weight vector.

    Args:
        weight: the weights as a ``numpy.array``.
        method: ``'min-max'`` rescales to the range [0, 1];
            ``'z-score'`` subtracts the mean and divides by the
            standard deviation.

    Returns:
        The normalised weights. A single-element input is returned
        unchanged. When all weights are equal the result is all zeros,
        because the range / standard deviation used as divisor is zero.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    '''
    if len(weight) == 1:
        return weight
    if method == 'min-max':
        lowest = weight.min()
        spread = weight.max() - lowest
        if spread == 0:
            # Constant input: avoid 0/0, which would yield NaN.
            return np.zeros_like(weight, dtype=float)
        return (weight - lowest) / spread
    if method == 'z-score':
        deviation = weight.std()
        if deviation == 0:
            # Constant input: avoid 0/0, which would yield NaN.
            return np.zeros_like(weight, dtype=float)
        return (weight - weight.mean()) / deviation
    # Previously an unknown method fell through and returned None silently.
    raise ValueError("unknown normalisation method: %r" % (method,))

# Quick checks: min-max scaling, z-score scaling, and a single-element input.
weightNormalize(np.array([1,2,3]))
weightNormalize(np.array([1,2,3]),'z-score')

weightNormalize(np.array([5]))

array([ 0. ,  0.5,  1. ])

array([-1.22474487,  0.        ,  1.22474487])

array([5])

### 回归函数实现及简单测试

In [245]:
import numpy as np
from collections import Counter

def knn_regress(dataSet, labels, k, inputVector):
    '''Predict a score distribution for a vector with distance-weighted k-NN.

    Distances are Euclidean. If a training vector coincides with the input
    (distance 0) its scores are copied; otherwise every score is the sum
    over the k nearest neighbours of ``score / distance``. The scores are
    then rescaled so that they sum to 1.

    Args:
        dataSet: 2-D array-like of training vectors, one per row.
        labels: dict mapping each target name to the list of that
            target's values, one per row of ``dataSet``.
        k: number of neighbours used; values larger than the number of
            training vectors are reduced to that number.
        inputVector: the vector to score, same width as a row of ``dataSet``.

    Returns:
        A dict with the same keys as ``labels`` holding the predicted
        scores. If every selected score is 0 the scores are returned as
        zeros, since they cannot be rescaled to sum to 1.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    '''
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))
    samples = np.asarray(dataSet)
    # argpartition rejects a kth index beyond the array, so never ask for
    # more neighbours than there are samples.
    k = min(k, len(samples))

    # Squared differences between the input vector and every sample.
    diffMat = (samples - np.asarray(inputVector))**2
    # Row sums followed by a square root give the distance vector.
    distances = np.sqrt(diffMat.sum(axis=1))

    exactMatches = np.flatnonzero(distances == 0)
    if exactMatches.size:
        # An identical training vector exists: copy the scores of the first
        # one, because 1/distance is undefined for it.
        zeroIndex = exactMatches[0]
        outputLabel = {name: float(values[zeroIndex])
                       for name, values in labels.items()}
    else:
        # Indices of the k nearest neighbours (in no particular order).
        kNeighborsIndex = distances.argpartition(k-1)[0:k]
        # Closer neighbours contribute more.
        weight = 1/distances[kNeighborsIndex]
        outputLabel = {name: float((np.asarray(values)[kNeighborsIndex]*weight).sum())
                       for name, values in labels.items()}

    # Rescale so that the scores sum to 1.
    total = sum(outputLabel.values())
    if total == 0:
        # Dividing by a zero total would turn every score into NaN.
        return outputLabel
    return {name: score/total for name, score in outputLabel.items()}

# knn_regress([[1,1,0,0],[0,1,1,1],[1,0,0,1]],
#             {'happy':[0.4,0.5,0.1], 
#              'sad':[0.2,0.3,0.2], 
#              'calm':[0.1,0.25,0.8]}, 2, [0,1,1,0])

# Try the regression on the first validation vector with k=2.
a = oneHot_validateSet2[0]
knn_regress(oneHot_trainSet2, trainSet_label2, 2, a)

{'anger': 0.078309831097408536,
 'disgust': 0.10168682639590204,
 'fear': 0.45530074296635226,
 'joy': 0.064714169422640258,
 'sad': 0.19830160372179506,
 'surprise': 0.10168682639590204}

### 使用验证集调参

In [240]:
def run_knn_regress(k):
    '''Print how well k-NN regression correlates with the validation labels.

    Every row of ``oneHot_validateSet2`` is scored with ``knn_regress``
    against the training set. For each target the correlation coefficient
    between the predictions and ``validateSet_label2`` is printed, followed
    by the average over all targets. Used to tune ``k``.

    Args:
        k: number of neighbours passed to ``knn_regress``.
    '''
    # One list of predictions per target name.
    ansLabel = {name: [] for name in validateSet_label2.keys()}
    for wordVector in oneHot_validateSet2:
        ans = knn_regress(oneHot_trainSet2, trainSet_label2, k, wordVector)
        # NOTE: a check for NaN values in ``ans`` was placed here while debugging.
        for name, value in ans.items():
            ansLabel[name].append(value)

    print('k =%2d：' % k)
    tot = 0
    for name, predicted in ansLabel.items():
        # Off-diagonal entry of the 2x2 correlation matrix.
        corr = np.corrcoef(predicted, validateSet_label2[name])[0,1]
        tot += corr
        print('%s:%.5f' % (name, corr), end=' ')
    print('\naverage:%.5f' % (tot/len(ansLabel)))
# Report the validation correlations for every k from 1 to 49.
for k in range(1,50):
    run_knn_regress(k)

k = 1
joy:0.22842 surprise:0.26274 fear:0.20246 sad:0.23562 disgust:0.06974 anger:0.17641 
average:0.19590
k = 2
joy:0.21322 surprise:0.26072 fear:0.18911 sad:0.25692 disgust:0.15061 anger:0.21899 
average:0.21493
k = 3
joy:0.36226 surprise:0.30836 fear:0.30815 sad:0.27993 disgust:0.17402 anger:0.26653 
average:0.28321
k = 4
joy:0.26868 surprise:0.29273 fear:0.24991 sad:0.30863 disgust:0.13378 anger:0.24317 
average:0.24948
k = 5
joy:0.28134 surprise:0.26604 fear:0.27038 sad:0.29301 disgust:0.19281 anger:0.21913 
average:0.25378
k = 6
joy:0.26018 surprise:0.26954 fear:0.25520 sad:0.29588 disgust:0.16202 anger:0.21047 
average:0.24221
k = 7
joy:0.24262 surprise:0.25867 fear:0.24043 sad:0.29207 disgust:0.12813 anger:0.21039 
average:0.22872
k = 8
joy:0.27573 surprise:0.25542 fear:0.27874 sad:0.31438 disgust:0.14025 anger:0.18576 
average:0.24171
k = 9
joy:0.28284 surprise:0.26718 fear:0.28126 sad:0.33442 disgust:0.11200 anger:0.19155 
average:0.24487
k =10
joy:0.27750 surprise:0.26591 fe

# 附录

## 参考资料

1.[stackoverflow : how-to-get-indices-of-n-maximum-values-in-a-numpy-array][1]

2.[stackoverflow : show-dataframe-as-table-in-ipython-notebook][2]

3.[Machine Learning-Normalization][3]

4.[为什么一些机器学习模型需要对数据进行归一化？][4]

5.[stackexchange : Standardizing some features in K-Means][5]

[1]:https://stackoverflow.com/questions/6910641/how-to-get-indices-of-n-maximum-values-in-a-numpy-array
[2]:https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook
[3]:http://www.csuldw.com/2015/11/15/2015-11-15%20normalization/?utm_source=tuicool&utm_medium=referral
[4]:http://www.cnblogs.com/LBSer/p/4440590.html
[5]:https://stats.stackexchange.com/questions/223289/standardizing-some-features-in-k-means/223355#223355


## 相关函数测试

 **-------------------------------------------平台配置代码--------------------------------------------**

In [3]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
# Echo the value of every top-level expression in a cell rather than only the
# last one (several cells in this notebook rely on this to show multiple results).
InteractiveShell.ast_node_interactivity = "all"

# from IPython.display import Image
# 其对应的三元顺序表为=Image("./images/1.jpg")
# 稀疏矩阵例子为=Image("./images/2.jpg")