# 垃圾邮件过滤

In [45]:
import numpy as np
import pandas as pd
import re
import random

In [54]:
# Tokenization
def textParse(input_string):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        input_string: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance. Tokens of
        length two or less (including the empty strings ``re.split``
        produces at the edges of the text) are discarded.
    """
    # Split on runs of non-word characters.
    tokens = re.split(r'\W+', input_string)
    # Filter on the length of each token, not on the length of the token list.
    return [tok.lower() for tok in tokens if len(tok) > 2]

In [47]:
# Build the vocabulary (duplicates removed)
def creatVocablist(doclist):
    """Return a list of the distinct words found across all documents.

    Args:
        doclist: An iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once. The order comes from set
        iteration and is therefore not meaningful.
    """
    unique_words = set()
    for doc_words in doclist:
        # Merge this document's words into the running vocabulary.
        unique_words = unique_words.union(set(doc_words))
    return list(unique_words)

In [48]:
# Vectorize a document
def setOfWord2Vec(vocablist, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocablist: Sequence of vocabulary words; position defines the
            vector index of each word.
        inputSet: Iterable of words occurring in the document.

    Returns:
        A list of ``len(vocablist)`` ints where entry ``i`` is 1 if
        ``vocablist[i]`` occurs in ``inputSet`` and 0 otherwise. Words not
        in the vocabulary are ignored.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary list twice per word. setdefault keeps the first
    # position of a repeated word, matching list.index().
    position_of = {}
    for position, vocab_word in enumerate(vocablist):
        position_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocablist)
    for word in inputSet:
        position = position_of.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec

In [58]:
def trainNB(trainMat, trainClass):
    """Estimate naive Bayes parameters from vectorized training documents.

    Args:
        trainMat: Sequence of document vectors, one row per document.
        trainClass: Sequence of labels aligned with ``trainMat``; a label
            equal to 1 is treated as spam, anything else as ham.

    Returns:
        A tuple ``(p0Vec, p1Vec, p1)``: the per-word log probabilities for
        ham, the per-word log probabilities for spam, and the fraction of
        training documents labelled 1 (the class prior for spam).
    """
    doc_total = len(trainMat)
    vocab_size = len(trainMat[0])

    # Prior probability of the spam class.
    spam_prior = sum(trainClass) / float(doc_total)

    # Per-class word counts start at one and denominators at two, so a word
    # never seen in a class still gets a non-zero probability (no log(0)).
    word_counts = {0: np.ones(vocab_size), 1: np.ones(vocab_size)}
    denominators = {0: 2, 1: 2}

    for row_idx, row in enumerate(trainMat):
        label = 1 if trainClass[row_idx] == 1 else 0
        word_counts[label] += row
        denominators[label] += sum(row)

    # Work in log space so that classification can add terms, not multiply.
    spam_log_probs = np.log(word_counts[1] / denominators[1])
    ham_log_probs = np.log(word_counts[0] / denominators[0])

    return ham_log_probs, spam_log_probs, spam_prior

In [50]:
def classifyNB(wordVec, p0Vec, p1Vec, p1_class):
    """Classify a document vector as ham (0) or spam (1).

    Args:
        wordVec: Document vector, multiplied element-wise with the
            log-probability vectors.
        p0Vec: Per-word log probabilities for class 0.
        p1Vec: Per-word log probabilities for class 1.
        p1_class: Prior probability of class 1.

    Returns:
        0 if the class-0 log score is strictly greater than the class-1
        log score, otherwise 1 (ties go to class 1).
    """
    spam_score = np.log(p1_class) + sum(wordVec * p1Vec)
    ham_score = np.log(1.0 - p1_class) + sum(wordVec * p0Vec)
    return 0 if ham_score > spam_score else 1
    

In [112]:
def spam():
    """Train and evaluate the naive Bayes spam filter on the email corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` and
    ``email/ham/1.txt`` .. ``email/ham/25.txt``, holds out 10 randomly chosen
    documents as a test set, trains on the rest, and prints how many of the
    held-out documents were misclassified.

    Returns:
        None. The result is reported via ``print``.
    """
    # Token list for each email
    doclist = []
    # Class label for each email
    classlist = []

    # Read and tokenize every email. Each file is opened in a context manager
    # so its handle is closed right away instead of being leaked.
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i, "r") as spam_file:
            doclist.append(textParse(spam_file.read()))
        # 1 marks spam
        classlist.append(1)

        with open('email/ham/%d.txt' % i, "r") as ham_file:
            doclist.append(textParse(ham_file.read()))
        # 0 marks ham
        classlist.append(0)

    # Vocabulary over the whole corpus
    vocablist = creatVocablist(doclist)

    # Randomly split the document indices into training and test sets:
    # move 10 randomly picked indices from the training set to the test set.
    trainSet = list(range(len(doclist)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainSet)))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]

    # Vectorize the training documents
    trainMat = [setOfWord2Vec(vocablist, doclist[docIndex]) for docIndex in trainSet]
    trainClass = [classlist[docIndex] for docIndex in trainSet]

    p0Vec, p1Vec, p1 = trainNB(np.array(trainMat), np.array(trainClass))

    # Count misclassified test documents
    errorCount = 0
    for docIndex in testSet:
        wordVec = setOfWord2Vec(vocablist, doclist[docIndex])
        if classifyNB(np.array(wordVec), p0Vec, p1Vec, p1) != classlist[docIndex]:
            errorCount += 1
    print("当前10个样本， 错误了：", errorCount)
    
# Run the spam-filter experiment only when this file is executed as a script.
if __name__ == "__main__":
    spam()

当前10个样本， 错误了： 2
