In [1]:
import re
import random
from numpy import *

# Tokenization
def textParse(s):
    """Split raw text into lowercase word tokens longer than two characters.

    s -- the text to tokenize (str).
    Returns a list of lowercased tokens in their original order.
    """
    # Split on runs of one or more non-word characters. The previous pattern
    # r'\W*' can match the empty string, which on Python 3.7+ splits between
    # every character and leaves no token long enough to pass the filter below.
    tokens = re.split(r'\W+', s)
    # Lowercase everything and keep only tokens with more than two characters.
    return [tok.lower() for tok in tokens if len(tok) > 2]

# Build the vocabulary: every distinct word, duplicates removed via a set
def createVocab(fullText):
    """Return the distinct items of fullText as a list.

    The order of the returned list is whatever order the intermediate set
    iterates in; callers should not rely on it.
    """
    uniqueWords = set()
    uniqueWords.update(fullText)
    return list(uniqueWords)

# Turn a word list into a presence vector over the vocabulary (Bernoulli-style:
# only whether a word occurs matters, not how many times).
def words2Vec(vocabs, words):
    """Encode words as a 0/1 list aligned with vocabs.

    vocabs -- list of vocabulary entries; position i of the result refers to vocabs[i].
    words  -- iterable of (hashable) words from one document.
    Returns a list of len(vocabs) ints: 1 where the vocabulary entry occurs in
    words, 0 elsewhere. Words that are not in vocabs are ignored.
    """
    # Index the vocabulary once instead of scanning the list for every word.
    # setdefault keeps the first position of a repeated entry, which is the
    # position list.index() would have reported.
    firstPos = {}
    for pos, entry in enumerate(vocabs):
        firstPos.setdefault(entry, pos)
    vec = [0] * len(vocabs)
    for word in words:
        pos = firstPos.get(word)
        if pos is not None:
            vec[pos] = 1
    return vec
        
# Training step
def trainNB(trainMat, trainClasses):
    """Estimate the naive Bayes parameters from word-presence vectors.

    trainMat     -- one row per document, one column per vocabulary word.
    trainClasses -- one label per document; 1 is the spam class, anything
                    else is counted as class 0.
    Returns (p0, p1, pSpam): p0[i] and p1[i] are the natural logs of the
    smoothed ratio for word i in class 0 and class 1, and pSpam is the sum of
    the labels divided by the number of documents (not log-transformed).
    """
    docCount = len(trainMat)
    vocabSize = len(trainMat[0])
    # Prior probability of spam.
    pSpam = sum(trainClasses) / float(docCount)
    # Add-one smoothing: every per-word numerator starts at 1 and every
    # per-class denominator starts at 2.
    wordTotals = {0: ones(vocabSize), 1: ones(vocabSize)}
    classTotals = {0: 2.0, 1: 2.0}
    for rowNo, row in enumerate(trainMat):
        label = 1 if trainClasses[rowNo] == 1 else 0
        # Numerator gains the row itself; the denominator gains the number of
        # set entries in the row (a word count, not a document count).
        wordTotals[label] += row
        classTotals[label] += sum(row)
    # Work in log space so the classifier can add terms instead of multiplying
    # many small probabilities, which would underflow.
    p1 = log(wordTotals[1] / classTotals[1])
    p0 = log(wordTotals[0] / classTotals[0])
    return p0, p1, pSpam

# Classification step: p0vec and p1vec are already natural logs, pSpam is not
def classifyNB(wordvec,p0vec,p1vec,pSpam):
    """Pick the class with the larger log score for one document vector.

    wordvec -- array of per-word indicators for the document.
    p0vec   -- per-word log values for class 0.
    p1vec   -- per-word log values for class 1.
    pSpam   -- prior probability of class 1 (plain probability, not a log).
    Returns 1 when the class-1 score is strictly greater, otherwise 0.
    """
    spamScore = log(pSpam) + sum(wordvec*p1vec)
    hamScore = log(1.0-pSpam) + sum(wordvec*p0vec)
    # Ties go to class 0.
    return 1 if spamScore > hamScore else 0

    
def spamTest():
    """Run one train/test round of the hand-written naive Bayes spam filter.

    Reads email/spam/1.txt..25.txt (label 1) and email/ham/1.txt..25.txt
    (label 0), holds out 10 randomly chosen messages as the test set, trains
    on the remainder and prints the error rate on the held-out messages.
    """
    docs = []
    classes = []
    fullText = []
    # 25 spam and 25 ham messages. docs holds each tokenized message, fullText
    # collects every token, classes holds the label aligned with docs.
    for i in range(1, 26):
        # Context managers close each file as soon as it has been read;
        # a bare open(...).read() leaves the handle open until garbage collection.
        with open('email/spam/%d.txt'%i) as spamFile:
            words = textParse(spamFile.read())
        docs.append(words)
        fullText.extend(words)
        classes.append(1)
        # Same for the ham (negative) example.
        with open('email/ham/%d.txt'%i,encoding='gbk') as hamFile:
            words = textParse(hamFile.read())
        docs.append(words)
        fullText.extend(words)
        classes.append(0)
    # Build the vocabulary.
    vocabs = createVocab(fullText)
    # Randomly move 10 message indices out of the training pool into the test set.
    trainIndex = list(range(len(docs)))
    testIndex = []
    for _ in range(10):
        # NOTE(review): `random` is imported at the top of the file, but the
        # following `from numpy import *` may rebind the name to numpy.random;
        # uniform(low, high) exists in both, so this call works either way --
        # confirm which generator is intended.
        randIndex = int(random.uniform(0,len(trainIndex)))
        testIndex.append(trainIndex[randIndex])
        del trainIndex[randIndex]
    # Vectorize the training messages and collect their labels.
    trainMat = [words2Vec(vocabs, docs[index]) for index in trainIndex]
    trainClasses = [classes[index] for index in trainIndex]
    # Fit the model: per-class log vectors plus the spam prior.
    p0, p1, pSpam = trainNB(array(trainMat),array(trainClasses))
    # Evaluate on the held-out messages.
    errorCount = 0
    for index in testIndex:
        wordvec = words2Vec(vocabs, docs[index])
        if classifyNB(array(wordvec), p0, p1, pSpam) != classes[index]:
            errorCount += 1
    print('error rate is:', float(errorCount)/len(testIndex))
    
# Run the experiment once; the held-out messages are picked at random, so the
# printed error rate can differ from run to run.
spamTest()
        

  return _compile(pattern, flags).split(string, maxsplit)


error rate is: 0.1


In [4]:
# Classify with the scikit-learn toolkit
from sklearn.naive_bayes import MultinomialNB

def spamNBsklearn():
    """Run one train/test round using sklearn's MultinomialNB.

    Reads email/spam/1.txt..25.txt (label 1) and email/ham/1.txt..25.txt
    (label 0), holds out 10 randomly chosen messages, fits MultinomialNB on
    the word-presence vectors of the rest and prints 1 - accuracy on the
    held-out messages.
    """
    # Data preparation: tokenize every message and record its label.
    docs = []
    classes = []
    fullText = []
    for i in range(1, 26):
        # Context managers close each file as soon as it has been read;
        # a bare open(...).read() leaves the handle open until garbage collection.
        with open('email/spam/%d.txt'%i) as spamFile:
            words = textParse(spamFile.read())
        docs.append(words)
        fullText.extend(words)
        classes.append(1)
        # Same for the ham (negative) example.
        with open('email/ham/%d.txt'%i,encoding='gbk') as hamFile:
            words = textParse(hamFile.read())
        docs.append(words)
        fullText.extend(words)
        classes.append(0)
    # Build the vocabulary.
    vocabs = createVocab(fullText)
    # Randomly move 10 message indices out of the training pool into the test set.
    trainIndex = list(range(len(docs)))
    testIndex = []
    for _ in range(10):
        # NOTE(review): `random` may be the stdlib module or numpy.random
        # depending on the file's import order; uniform(low, high) exists in
        # both -- confirm which generator is intended.
        randIndex = int(random.uniform(0,len(trainIndex)))
        testIndex.append(trainIndex[randIndex])
        del trainIndex[randIndex]
    # Vectorize the training messages and collect their labels.
    trainMat = [words2Vec(vocabs, docs[index]) for index in trainIndex]
    trainClasses = [classes[index] for index in trainIndex]
    # Vectorize the test messages and collect their labels.
    testMat = [words2Vec(vocabs, docs[index]) for index in testIndex]
    testClasses = [classes[index] for index in testIndex]
    # Fit sklearn's multinomial naive Bayes model.
    clf = MultinomialNB()
    clf.fit(trainMat, trainClasses)
    # clf.score returns the mean accuracy on the test samples.
    score = clf.score(testMat, testClasses)
    print('error rate is:', 1-score)

# Run the sklearn variant once; the held-out messages are picked at random, so
# the printed error rate can differ from run to run.
spamNBsklearn()


error rate is: 0.09999999999999998


  return _compile(pattern, flags).split(string, maxsplit)
