要求：

0、只验收NB分类

1、只需要实现多项式模型。

2、必须实现拉普拉斯平滑，且平滑时分母中所加的 V 为训练集词典的词个数（即训练集中不重复的词数）。

3、输出标准为：两个情感的概率（可以直接使用计算得到的值，不需要归一化使其和为1）以及最终的prediction。

In [7]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every top-level expression in a cell,
# instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [9]:
import pandas as pd
from IPython.display import display

def loadDataSet(filePath):
    """Load a labelled text data set from a CSV file and print a summary.

    The CSV is expected to contain a ``Words`` column (one document per row,
    words separated by whitespace) and a ``label`` column.

    Args:
        filePath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A tuple ``(dataSet, label, allWords)`` where ``dataSet`` is a list of
        token lists (one per row), ``label`` is the list of values of the
        ``label`` column, and ``allWords`` is the sorted list of distinct
        tokens found in ``dataSet``.
    """
    df = pd.read_csv(filePath)
    # Labels, in row order.
    label = list(df['label'].values)
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces or tabs never produce empty-string tokens (split(' ') would).
    dataSet = [text.split() for text in df['Words'].values]
    # Sorted so the vocabulary order is reproducible between runs; iterating a
    # set of strings directly depends on per-process hash randomisation.
    allWords = sorted({word for doc in dataSet for word in doc})

    # ---------------- summary of the loaded data ----------------
    # Show the whole table.
    print('【data preview】:')
    display(df)
    # Number of rows per label.
    print('【count of all kind of labels】:\n')
    print(df['label'].value_counts())
    # Vocabulary size and number of documents.
    print('【number of all words】: ', len(allWords))
    print('【number of texts】: ', len(df))
    # -------------------------------------------------------------

    return dataSet, label, allWords

In [10]:
# Load the training set; the call also prints a preview and summary statistics.
trainSet, trainSet_label, allWords_trainSet = loadDataSet('train_set.csv')

【data preview】:


Unnamed: 0,Words,label
0,easy to finish the AI homework for you,joy
1,hard to finish the other homework for you,sad
2,many homework you should do,joy
3,few AI homework,sad
4,next week have new AI homework,joy


【count of all kind of labels】:

joy    3
sad    2
Name: label, dtype: int64
【number of all words】:  18
【number of texts】:  5


In [13]:
# Load the test set; its label column is discarded (bound to `_`).
testSet, _ , allWords_testSet = loadDataSet('test_set.csv')

【data preview】:


Unnamed: 0,Words,label
0,you like AI homework,?


【count of all kind of labels】:

?    1
Name: label, dtype: int64
【number of all words】:  4
【number of texts】:  1


In [15]:
# Vocabulary used for word lookups at prediction time: every distinct word of
# the training set plus every distinct word of the test set.
allWords_train_test = list(set(allWords_trainSet).union(allWords_testSet))
print(allWords_train_test)

['have', 'new', 'few', 'other', 'AI', 'like', 'week', 'finish', 'next', 'should', 'easy', 'the', 'you', 'homework', 'do', 'to', 'for', 'hard', 'many']


In [28]:
import numpy as np
from collections import Counter

def getFreq(trainSet, trainSet_label, allWordsNum_train,
            allWords_train_other, lambda_):
    """Estimate multinomial Naive Bayes parameters with additive smoothing.

    For every label ``c`` and every word ``w`` in ``allWords_train_other`` the
    smoothed conditional probability is

        (count(w in docs of c) + lambda_)
        / (total words in docs of c + lambda_ * allWordsNum_train)

    Args:
        trainSet: Sequence of documents, each a sequence of word tokens.
        trainSet_label: Sequence of labels, parallel to ``trainSet``.
        allWordsNum_train: Vocabulary size V used in the smoothing denominator.
        allWords_train_other: Words to compute probabilities for; the position
            of a word here is its index in every returned probability list.
        lambda_: Smoothing strength (1 gives Laplace smoothing).

    Returns:
        A tuple ``(freqMat, allLabels, labelPros)``: ``freqMat`` maps each
        label to the list of smoothed conditional word probabilities,
        ``allLabels`` lists the distinct labels in order of first appearance,
        and ``labelPros`` maps each label to its share of the training
        documents.

    Raises:
        ValueError: If ``trainSet`` and ``trainSet_label`` differ in length.
    """
    if len(trainSet) != len(trainSet_label):
        raise ValueError(
            "trainSet and trainSet_label must have the same length "
            "({} != {})".format(len(trainSet), len(trainSet_label)))

    # Group documents by label with plain Python containers. Documents usually
    # have different lengths, and converting such a ragged list with np.array
    # raises ValueError on recent NumPy versions.
    docs_by_label = {}
    for doc, label in zip(trainSet, trainSet_label):
        docs_by_label.setdefault(label, []).append(doc)

    total_docs = len(trainSet)
    # Term added to every denominator by the smoothing.
    smoothing_mass = lambda_ * allWordsNum_train

    allLabels = list(docs_by_label)
    freqMat = dict()    # label -> smoothed P(word | label) per vocabulary word
    labelPros = dict()  # label -> P(label)
    for label, docs in docs_by_label.items():
        # Share of training documents carrying this label.
        labelPros[label] = len(docs) / total_docs
        # Word counts over all documents of this label.
        wordCounter = Counter(word for doc in docs for word in doc)
        denominator = sum(wordCounter.values()) + smoothing_mass
        # Words never seen with this label get a count of 0 and therefore
        # only the smoothing mass.
        freqMat[label] = [(wordCounter.get(word, 0) + lambda_) / denominator
                          for word in allWords_train_other]

    return freqMat, allLabels, labelPros

from functools import reduce

def NB_classify(trainSet, trainSet_label, allWordsNum_train, allWords_train_other,
                otherSet, lambda_):
    """Classify documents with a multinomial Naive Bayes model.

    The model is fitted on ``trainSet`` through ``getFreq``. Each document of
    ``otherSet`` is scored, for every label, as the product of the label
    probability and the smoothed conditional probability of each of its
    words. The label with the largest score is the prediction. Scores are
    printed un-normalised, so they need not sum to 1.

    NOTE: multiplying raw probabilities can underflow to 0 for very long
    documents. The raw product is kept because it is the value that gets
    printed.

    Args:
        trainSet: Training documents, each a sequence of word tokens.
        trainSet_label: Labels parallel to ``trainSet``.
        allWordsNum_train: Vocabulary size V used by the smoothing.
        allWords_train_other: Vocabulary used for lookups; it must contain
            every word that occurs in ``otherSet``.
        otherSet: Documents to classify, each a sequence of word tokens.
        lambda_: Smoothing strength.

    Returns:
        List with one predicted label per document of ``otherSet``.

    Raises:
        IndexError: If a document contains a word that is missing from
            ``allWords_train_other``.
    """
    # Per-label conditional probabilities, the labels and the label probabilities.
    freqMat_, allLabels_, labelPros_ = getFreq(trainSet, trainSet_label,
                    allWordsNum_train, allWords_train_other, lambda_=lambda_)
    allWords_ = np.array(allWords_train_other)
    print("所有词：\n", allWords_)

    # Word -> position of its first occurrence in the vocabulary, built once
    # so each lookup is O(1) instead of a scan over the whole vocabulary.
    word_index = {}
    for position, word in enumerate(allWords_train_other):
        word_index.setdefault(word, position)
    # Convert each probability list to an array once, not once per document.
    probs_by_label = {label: np.array(freqMat_[label]) for label in allLabels_}

    ans = []  # predicted label for every document
    for data in otherSet:
        try:
            # An explicit integer dtype keeps an empty document usable as an
            # index array (an empty array would otherwise default to float).
            matchIndex = np.array([word_index[word] for word in data],
                                  dtype=np.intp)
        except KeyError as missing:
            raise IndexError(
                "word {!r} is not in allWords_train_other".format(
                    missing.args[0])) from None
        temp = []
        print("当前测试文本：", data)
        print("在所有词中下标：", matchIndex)
        for label in allLabels_:
            # The initial value 1.0 leaves the product unchanged and makes an
            # empty document score exactly the label probability.
            likelihood = reduce(lambda x, y: x * y,
                                probs_by_label[label][matchIndex], 1.0)
            p = likelihood * labelPros_[label]
            # Keep the score together with its label.
            temp.append((p, label))
        print("\n分为不同类的概率：", temp)
        # Tuples compare by score first, so max() selects the best label.
        ans.append(max(temp)[1])
    return ans

# Run the NB classifier on the test set. The smoothing term uses the size of
# the training vocabulary only, while word lookups use the train+test vocabulary.
predictLabel = NB_classify(trainSet, trainSet_label, len(allWords_trainSet),
                           allWords_train_test, testSet, lambda_ = 1)
# Bare expression so the notebook displays the predicted labels.
predictLabel   

所有词：
 ['have' 'new' 'few' 'other' 'AI' 'like' 'week' 'finish' 'next' 'should'
 'easy' 'the' 'you' 'homework' 'do' 'to' 'for' 'hard' 'many']
当前测试文本： ['you', 'like', 'AI', 'homework']
在所有词中下标： [12  5  4 13]

分为不同类的概率： [(6.7865530107552721e-06, 'sad'), (1.1525157123640927e-05, 'joy')]


['joy']