In [1]:
import numpy as np

# 数据创建

In [7]:
def loadDataSet():
    """
    函数说明:创建训练数据
    Parameters:
    Return: 
            postingList:句子切分后的的词语
            classVector:每一个句子的类别
    """
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],  #切分的词条
                ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1] #类别标签向量，1代表侮辱性词汇，0代表不是
    return postingList,classVector

In [28]:
def createWordSet(wordLists):
    """
    函数说明:创建无重复的词库表
    Parameters:
            wordList:用来创建词表的所有语句
    Return: 
            wordSet:无重复的词表
    """
    wordSet = set([])
    for wordList in wordLists:
        wordSet = wordSet|set(wordList)
    return list(wordSet)

In [42]:
def createWordVector(wordSet,wordList):
    """
    函数说明:创建一句话对应的词向量
    Parameters:
            wordSet:词库表
            wordList:句子中的词语组成的列表
    Return:
            wordVector:句子对应的词向量表
    """
    wordVectorLength = len(wordSet)
    wordVector = np.zeros([wordVectorLength])
    for index,word in enumerate(wordSet):
        if word in wordList:
            wordVector[index]=1
    return wordVector        

In [43]:
#调用上方函数测试数据
# 创建数据
postingList,classVector = loadDataSet()
# 创建词库表
wordSet = createWordSet(postingList)

In [50]:
def createWordVecors(wordSet,wordLists):
    """
    函数说明:创建所有词语对应的向量矩阵
    Parameters:
            wordSet:词库表
            wordListｓ:句子中的词语组成的列表的集合
    Return:
            wordVector:所有句子对应的词向量表
    """
    wordVectors = []
    for wordList in wordLists:
        wordVector = createWordVector(wordSet,wordList)
        wordVectors.append(wordVector)
    return wordVectors

In [53]:
# 测试
wordVecors=createWordVecors(wordSet,postingList)

In [59]:
def trainNivaBayes(wordVectors,classVector):
    """
    函数说明：计算是侮辱性句子的概率，以及侮辱句子和非侮辱性句子中的词语比例
    Parameters:
            wordVectors:词向量组成的矩阵
            classVector:类别组成的向量
    Return:
            yesVector/numberOfYes：一个词语在侮辱性句子中所占的概率
            noVector/numberOfNo：一个词语在非侮辱性句子中所占的概率
            yesProbability:侮辱性句子的概率
    """
    
    vectorLength = len(wordVectors[0])
    # 是侮辱性句子的概率
    yesProbability = sum(classVector)/float(len(classVector))
    # 总的侮辱性句子中词语的个数
    numberOfYes = 0.0
    # 总的非侮辱性句子中词语的个数
    numberOfNo = 0.0
    # 侮辱性句子的向量
    yesVector = np.zeros([vectorLength])
    #　不是侮辱性句子的向量
    noVector = np.zeros([vectorLength])
    for index,wordVector in enumerate(wordVectors):
        if classVector[index]==1:
            numberOfYes+=sum(wordVector)
            yesVector+=wordVector
        else:
            numberOfNo+=sum(wordVector)
            noVector+=wordVector
        
    return yesVector/numberOfYes,noVector/numberOfNo,yesProbability
    

In [60]:
trainNivaBayes(wordVecors,classVector)[0]

array([0.05263158, 0.        , 0.05263158, 0.15789474, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05263158, 0.05263158, 0.        , 0.        ,
       0.        , 0.05263158, 0.10526316, 0.        , 0.05263158,
       0.05263158, 0.05263158, 0.05263158, 0.        , 0.        ,
       0.05263158, 0.05263158, 0.        , 0.10526316, 0.        ,
       0.        , 0.05263158])

In [34]:
len(postingList[0])

7

In [13]:
np.array(["nn","dd"]).shape

(2,)

In [21]:
for i,j in enumerate([6,5]):
    print(i)

0
1


In [38]:
np.zeros([7])

array([0., 0., 0., 0., 0., 0., 0.])