In [1]:
import numpy as np
import os
import re


def load_data(data_dir='邮件数据', encoding=None):
    """Load the e-mail corpus and encode it as a bag-of-words count matrix.

    The corpus is read from ``data_dir/<category>/<file>``; each file is one
    sample. Files inside the category directory named ``垃圾`` get label 1,
    files in every other category get label 0.

    Each line is stripped and split on non-word characters; tokens shorter
    than two characters are dropped. The vocabulary is the sorted set of all
    remaining tokens, so column order is reproducible between runs.
    Categories and files are visited in sorted name order for the same reason.
    Entries whose name starts with ``.`` (e.g. ``.DS_Store``,
    ``.ipynb_checkpoints``), non-directories at the category level and
    non-files inside a category are skipped.

    Parameters
    ----------
    data_dir : str
        Root directory of the corpus. Defaults to ``'邮件数据'``.
    encoding : str or None
        Forwarded to ``open``. ``None`` keeps the platform default.

    Returns
    -------
    x : numpy.ndarray
        Array of shape ``(n_files, n_words)``; ``x[i, j]`` is the number of
        times vocabulary word ``j`` occurs in file ``i``.
    y : numpy.ndarray
        Array of shape ``(n_files,)`` holding the 0/1 labels.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    token_sep = re.compile(r'\W')

    files = []
    y = []
    for cate in sorted(os.listdir(data_dir)):
        cate_dir = os.path.join(data_dir, cate)
        if cate.startswith('.') or not os.path.isdir(cate_dir):
            continue
        for file in sorted(os.listdir(cate_dir)):
            path = os.path.join(cate_dir, file)
            if file.startswith('.') or not os.path.isfile(path):
                continue
            tokens = []
            with open(path, encoding=encoding) as fr:
                for line in fr:
                    tokens.extend(
                        word for word in token_sep.split(line.strip())
                        if len(word) >= 2
                    )
            files.append(tokens)
            y.append(1 if cate == '垃圾' else 0)

    # Vocabulary: every distinct token, sorted so the columns are deterministic.
    words = sorted({word for tokens in files for word in tokens})
    # Constant-time column lookup instead of a linear list.index() per token.
    column_of = {word: col for col, word in enumerate(words)}

    # Count how often each vocabulary word occurs in each file.
    x = np.zeros((len(files), len(words)))
    for row, tokens in enumerate(files):
        for word in tokens:
            x[row, column_of[word]] += 1

    # 1 marks a spam file, 0 a legitimate one.
    y = np.array(y)
    return x, y


# Build the word-count matrix and the label vector, then display both arrays
# together with their shapes as the cell output.
x, y = load_data()
x, y, x.shape, y.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]),
 (50, 820),
 (50,))

In [2]:
def train(x, y, alpha=1):
    """Fit a two-class multinomial naive Bayes model in log space.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_words)
        Word counts per sample.
    y : array-like, shape (n_samples,)
        Labels; rows with ``y == 1`` form the positive class and rows with
        ``y == 0`` the negative class.
    alpha : number, optional
        Additive smoothing constant added to every word count. The default
        of 1 is Laplace smoothing, so no word ends up with zero probability.

    Returns
    -------
    p1_given_word : numpy.ndarray, shape (n_words,)
        Log of each word's smoothed share of all word occurrences in the
        positive-class samples.
    p0_given_word : numpy.ndarray, shape (n_words,)
        The same quantity for the negative-class samples.
    p1 : float
        Log of the fraction of samples labelled 1.
    p0 : float
        Log of ``1 - fraction labelled 1``.

    Raises
    ------
    ValueError
        If ``y`` is empty (the class priors would be NaN).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(y) == 0:
        raise ValueError('train() needs at least one labelled sample')

    # Class prior: number of positive samples / number of samples.
    p1 = y.sum() / len(y)
    p0 = 1 - p1

    # Work with log-probabilities. When only one class is present the other
    # prior is exactly 0 and its log is -inf; that is the intended value, so
    # the divide-by-zero warning from np.log is silenced here.
    with np.errstate(divide='ignore'):
        p1 = np.log(p1)
        p0 = np.log(p0)

    # Split the samples by label.
    x_1 = x[y == 1]
    x_0 = x[y == 0]

    # Total occurrences of every word across the positive samples (repeated
    # occurrences inside one sample all count), plus the smoothing constant.
    p1_given_word = x_1.sum(axis=0) + alpha

    # Normalise the counts into a distribution over the vocabulary.
    p1_given_word = p1_given_word / p1_given_word.sum()

    # Log-probabilities again.
    p1_given_word = np.log(p1_given_word)

    # Same computation for the negative samples.
    p0_given_word = x_0.sum(axis=0) + alpha
    p0_given_word = p0_given_word / p0_given_word.sum()
    p0_given_word = np.log(p0_given_word)

    return p1_given_word, p0_given_word, p1, p0


# Fit the model on x and y; the four results are kept as module-level names
# because pred() reads them as globals. The cell output shows the shapes of
# the two per-word arrays and the two log priors.
p1_given_word, p0_given_word, p1, p0 = train(x, y)
p1_given_word.shape, p0_given_word.shape, p1, p0

((820,), (820,), -0.6931471805599453, -0.6931471805599453)

In [3]:
#测试
def pred(x):
    """Classify one sample or a batch of samples.

    Reads the module-level ``p1_given_word``, ``p0_given_word``, ``p1`` and
    ``p0``, so those must be assigned before this function is called.

    Parameters
    ----------
    x : array-like
        Either one word-count vector of shape ``(n_words,)`` or a batch of
        shape ``(n_samples, n_words)``.

    Returns
    -------
    int or numpy.ndarray
        For a single vector: ``1`` if the class-1 score is strictly greater
        than the class-0 score, else ``0``. For a batch: an integer array
        with one such 0/1 entry per row.
    """
    x = np.asarray(x)

    # The per-word terms are log-probabilities, so the product over words
    # becomes a count-weighted sum (a dot product) plus the log prior.
    p1_given_x = x.dot(p1_given_word) + p1
    p0_given_x = x.dot(p0_given_word) + p0

    is_positive = p1_given_x > p0_given_x
    if np.ndim(is_positive) == 0:
        # Single sample: keep returning a plain Python int.
        return 1 if is_positive else 0
    # Batch: one label per row.
    return is_positive.astype(int)


# Count the samples whose predicted label equals the stored label, feeding
# pred() one row of x at a time.
correct = sum(1 for sample, label in zip(x, y) if pred(sample) == label)

# Fraction of correctly classified samples, shown as the cell output.
correct / len(x)

1.0