In [1]:
import numpy as np
import re

In [2]:
def createVocabList(dataSet):
    """
    Build the vocabulary: every distinct token found in any document.

    Args:
        dataSet -- iterable of documents, each an iterable of hashable tokens
    Returns:
        list of the unique tokens; the order is whatever set iteration yields
    """
    uniqueTokens = set()
    for doc in dataSet:
        # Fold this document's tokens into the running set.
        uniqueTokens.update(doc)
    return list(uniqueTokens)

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    """
    Set-of-words model: encode a document as a 0/1 presence vector over the
    vocabulary.

    Args:
        vocabList -- list of vocabulary words; position i of the result
                     corresponds to vocabList[i]
        inputSet -- iterable of the words in the document
    Returns:
        returnVec -- list of len(vocabList) ints, 1 where the vocabulary word
                     occurs in inputSet and 0 elsewhere
    """
    returnVec = [0] * len(vocabList)  # start with "no word present"
    for word in inputSet:
        if word in vocabList:
            # Assignment, not comparison: mark the word as present.
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my VocabLary!"%word)
    # Return only after every word has been processed (also covers empty input).
    return returnVec

In [4]:
def bagOfWords2VecMN(vocabList, inputSet):
    """
    Bag-of-words model: count how many times each vocabulary word occurs in a
    document.

    Args:
        vocabList -- list of vocabulary words
        inputSet -- iterable of the words in the document
    Returns:
        list of len(vocabList) ints; entry i is the number of occurrences of
        vocabList[i] in inputSet. Words not in the vocabulary are ignored.
    """
    counts = [0 for _ in vocabList]
    for token in inputSet:
        if token not in vocabList:
            continue  # out-of-vocabulary words are skipped silently
        slot = vocabList.index(token)
        counts[slot] = counts[slot] + 1
    return counts

In [18]:
def trainNB(trainMatrix, trainCategory):
    """
    Train a two-class naive Bayes classifier on word-count vectors.

    Args:
        trainMatrix -- training matrix, one word vector per document
        trainCategory -- training labels; a label equal to 1 is class 1,
                         any other value is counted as class 0
    Returns:
        p0Vect -- log of the smoothed per-word frequencies in class 0
        p1Vect -- log of the smoothed per-word frequencies in class 1
        pAbusive -- sum of the labels divided by the number of documents
                    (the class-1 prior for 0/1 labels)
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 holds the class-0 statistics, index 1 the class-1 statistics.
    # Word counts start at 1 and totals at 2.0, so a word never seen in a
    # class still gets a non-zero probability (and a finite log).
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    totals = [2.0, 2.0]

    for idx in range(docCount):
        label = 1 if trainCategory[idx] == 1 else 0
        row = trainMatrix[idx]
        wordCounts[label] += row
        totals[label] += sum(row)

    # Logs are returned so the classifier can add terms instead of multiplying.
    p0Vect = np.log(wordCounts[0] / totals[0])
    p1Vect = np.log(wordCounts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive


概率的计算采用贝叶斯公式 
$$p(c_{i}|W)=\frac{p(W|c_{i})p(c_{i})}{p(W)}$$
又根据朴素贝叶斯的条件独立性假设，在给定类别 $c_{i}$ 的条件下，各词汇的出现相互独立，因此 $$p(W|c_{i})=p(w_{0},w_{1},w_{2},...,w_{n}|c_{i})=p(w_{0}|c_{i})\cdot p(w_{1}|c_{i})\cdot p(w_{2}|c_{i})\cdot ...\cdot p(w_{n}|c_{i})$$

In [19]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Score a document under both classes and return the more likely one.

    All probabilities are in log space, so products become sums. Only the
    relative size of the two scores matters, so the shared p(W) term is
    left out.

    Args:
        vec2Classify -- word vector of the text to classify
        p0Vec -- per-word log probabilities for class 0
        p1Vec -- per-word log probabilities for class 1
        pClass1 -- prior probability of class 1
    Returns:
        1 if the class-1 score is strictly greater, otherwise 0
    """
    logPrior1 = np.log(pClass1)
    logPrior0 = np.log(1.0 - pClass1)
    score1 = sum(vec2Classify * p1Vec) + logPrior1
    score0 = sum(vec2Classify * p0Vec) + logPrior0
    # A tie goes to class 0.
    return 1 if score1 > score0 else 0

### 过滤垃圾邮件

In [7]:
def textParse(bigString):
    """
    Tokenize raw text.

    Splits on runs of non-word characters, drops tokens of length 2 or less,
    and lower-cases the rest.

    Args:
        bigString -- the text to tokenize
    Returns:
        list of lower-cased tokens, in order of appearance
    """
    tokens = []
    for piece in re.split(r'\W+', bigString):
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens

In [16]:
def spamTest():
    """
    Hold-out evaluation of the naive Bayes spam filter.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), builds the vocabulary, moves 10 randomly chosen documents into
    a test set, trains on the remaining documents, and prints every
    misclassified document followed by the error rate on the test set.

    The split is drawn with np.random and no seed is set here, so the result
    can differ from run to run.

    Returns:
        None -- results are printed
    """
    docList = []
    classList = []
    for i in range(1, 26):  # 25 spam and 25 ham emails, interleaved
        for pathPattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # The context manager closes each file as soon as it has been read.
            with open(pathPattern % i, encoding='ISO-8859-1') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Document indices. This must be a real list: a range object cannot be
    # mutated, and deleting from a temporary copy would leave the test
    # documents inside the training set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Move the chosen index from the training set to the test set.
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print("the error rate is:", float(errorCount)/len(testSet))

In [20]:
# Run the hold-out experiment: spamTest prints any misclassified documents and
# the test error rate. The split is random and unseeded, so reruns may differ.
spamTest()

the error rate is: 0.0
