## **演示1102：垃圾邮件分类**

### **问题提出**
* 在【messages.csv】文件中，包含有大量的短信。每行数据包括两个字段：短信内容，以及该短信是否为垃圾信息的标志(1或0)。
* 现在希望将这些数据拆分成训练数据集和测试数据集，根据训练数据集做出一个预测模型；然后在测试数据集上来验证其效果。

### **案例1：使用sklearn.naive_bayes.MultinomialNB**
* MultinomialNB对象的$\alpha$属性，可以用于设置或获取相应的平滑参数值

In [2]:
''' 使用MultinomialNB实现垃圾短信分类，并计算模型效能 '''

import numpy as np
import csv
import re
from sklearn.naive_bayes import MultinomialNB

# Break a message into its distinct lowercase word tokens.
def tokenize(message):
    """Return the set of unique tokens found in ``message``.

    The text is lowercased first. A token is any maximal run of the
    characters a-z, 0-9 or apostrophe. Because the result is a set,
    repeated words are kept only once, e.g. {"aaa", "bbb", ...}.
    """
    return set(re.findall("[a-z0-9']+", message.lower()))

# Read the CSV file and randomly split its rows into training and testing data.
split_ratio = 0.75          # probability that a row lands in the training set
training_data = []
testing_data = []
np.random.seed(0)           # fixed seed so the random draws repeat on every run
# newline='' is what the csv module asks for, so that line breaks embedded in
# a quoted message are parsed as part of the field.
with open('messages.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    data = training_data, testing_data
    next(reader, None)                      # skip the header row
    for row in reader:
        if not row:                         # blank line: nothing to tokenize
            continue
        words = tokenize(row[0])            # row[0] is the text, row[1] the label
        row_data = [words, int(row[1])]     # e.g. [{"aaa", "bbb", ...}, 1]
        # One random draw per row decides which of the two lists receives it.
        data[0 if np.random.random() < split_ratio else 1].append(row_data)
    
# Build the vocabulary from the words that occur in the training data.
temp_dict = set()                       # a set drops duplicate words
for row in training_data:
    temp_dict.update(row[0])            # in-place union: no new set is built per row
# Sorting gives every word a fixed position, so the feature columns are in the
# same order on every run (plain set iteration order of strings is not stable
# across interpreter runs).
word_dict = sorted(temp_dict)
num_features = len(word_dict)

# Build the feature matrix and the label vector for a training or testing set.
# data is the list of [word_set, label] rows produced while reading the CSV file.
def generateMat(data):
    """Convert tokenized rows into a 0/1 feature matrix and a label vector.

    Args:
        data: sequence of rows where row[0] is an iterable of words and
            row[1] is the numeric class label.

    Returns:
        (feature, classify): ``feature`` has shape
        (len(data), num_features) with a 1 in every column whose vocabulary
        word occurs in the row; ``classify`` has shape (len(data),) and holds
        the labels. Uses the module-level ``word_dict`` and ``num_features``.
    """
    # Map each word to its column once so the lookups below are O(1) instead
    # of scanning the vocabulary list for every word. setdefault keeps the
    # first position of a word, which is what list.index would return.
    column_of = {}
    for column, known_word in enumerate(word_dict):
        column_of.setdefault(known_word, column)

    num_samples = len(data)
    feature = np.zeros((num_samples, num_features))
    classify = np.zeros(num_samples)
    for i, data_row in enumerate(data):
        classify[i] = data_row[1]
        for word in data_row[0]:
            column = column_of.get(word)
            # Words missing from the vocabulary (possible in test data) are ignored.
            if column is not None:
                feature[i, column] = 1
    return feature, classify

# Turn both splits into matrices and report their shapes.
training_feature, training_classify = generateMat(training_data)
testing_feature, testing_classify = generateMat(testing_data)
for caption, matrix in (("训练矩阵特征维度：", training_feature),
                        ("测试矩阵特征维度：", testing_feature)):
    print(caption, matrix.shape)

# Fit multinomial naive Bayes (alpha is the smoothing parameter) on the
# training split, then predict a label for every test sample.
model = MultinomialNB(alpha=1.0).fit(training_feature, training_classify)
predict_classify = model.predict(testing_feature)

# Count the four confusion-matrix cells by comparing the true labels with the
# predicted labels (both are expected to be 0 or 1; other values are not counted).
TN = FP = TP = FN = 0
for actual, predicted in zip(testing_classify, predict_classify):
    if actual == 0 and predicted == 0:
        TN += 1
    elif actual == 0 and predicted == 1:
        FP += 1
    elif actual == 1 and predicted == 1:
        TP += 1
    elif actual == 1 and predicted == 0:
        FN += 1

# A metric whose denominator is zero (no predicted positives, no actual
# positives, or an empty test set) is reported as 0.0 instead of raising
# ZeroDivisionError.
total = TN + TP + FN + FP
p = TP / (TP + FP) if TP + FP else 0.0
r = TP / (TP + FN) if TP + FN else 0.0
f1 = 2 * p * r / (p + r) if p + r else 0.0
print("Accuracy：", (TN + TP) / total if total else 0.0)
print("Precision：",p)
print("Recall：", r)
print("F1 Score:", f1)

训练矩阵特征维度： (2418, 3746)
测试矩阵特征维度： (850, 3746)
Accuracy： 0.9423529411764706
Precision： 0.8924731182795699
Recall： 0.680327868852459
F1 Score: 0.7720930232558139


### **案例2：自定义实现分类**
* 检查【naive_bayes_classifier.py】文件中的自定义朴素贝叶斯分类器

In [3]:
''' 自定义贝叶斯分类器，对垃圾短信分类 '''

import numpy as np
import csv
import re
from collections import Counter
import naive_bayes_classifier

# Break a message into its distinct lowercase word tokens.
def tokenize(message):
    """Return the unique tokens of ``message`` as a set.

    Lowercases the text, then collects every maximal run of a-z, 0-9 or
    apostrophe characters. Duplicates collapse because a set is returned,
    e.g. {"aaa", "bbb", ...}.
    """
    lowered = message.lower()
    unique_words = set()
    unique_words.update(re.findall("[a-z0-9']+", lowered))
    return unique_words

# Read the CSV file and randomly split its rows into training and testing data.
split_ratio = 0.75          # probability that a row lands in the training set
training_data = []
testing_data = []
np.random.seed(0)           # fixed seed so the random draws repeat on every run
# newline='' is what the csv module asks for, so that line breaks embedded in
# a quoted message are parsed as part of the field.
with open('messages.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    data = training_data, testing_data
    next(reader, None)                      # skip the header row
    for row in reader:
        if not row:                         # blank line: nothing to tokenize
            continue
        words = tokenize(row[0])            # row[0] is the text, row[1] the label
        row_data = [words, int(row[1])]     # e.g. [{"aaa", "bbb", ...}, 1]
        # One random draw per row decides which of the two lists receives it.
        data[0 if np.random.random() < split_ratio else 1].append(row_data)
    
# Build the vocabulary from the words that occur in the training data.
temp_dict = set()                       # a set drops duplicate words
for row in training_data:
    temp_dict.update(row[0])            # in-place union: no new set is built per row
# Sorting gives every word a fixed position, so the feature columns are in the
# same order on every run (plain set iteration order of strings is not stable
# across interpreter runs).
word_dict = sorted(temp_dict)
num_features = len(word_dict)

# Build the feature matrix and the label vector for a training or testing set.
# data is the list of [word_set, label] rows produced while reading the CSV file.
def generateMat(data):
    """Convert tokenized rows into a 0/1 feature matrix and a label vector.

    Args:
        data: sequence of rows where row[0] is an iterable of words and
            row[1] is the numeric class label.

    Returns:
        (feature, classify): ``feature`` has shape
        (len(data), num_features) with a 1 in every column whose vocabulary
        word occurs in the row; ``classify`` has shape (len(data),) and holds
        the labels. Uses the module-level ``word_dict`` and ``num_features``.
    """
    # Map each word to its column once so the lookups below are O(1) instead
    # of scanning the vocabulary list for every word. setdefault keeps the
    # first position of a word, which is what list.index would return.
    column_of = {}
    for column, known_word in enumerate(word_dict):
        column_of.setdefault(known_word, column)

    num_samples = len(data)
    feature = np.zeros((num_samples, num_features))
    classify = np.zeros(num_samples)
    for i, data_row in enumerate(data):
        classify[i] = data_row[1]
        for word in data_row[0]:
            column = column_of.get(word)
            # Words missing from the vocabulary (possible in test data) are ignored.
            if column is not None:
                feature[i, column] = 1
    return feature, classify

# Turn both splits into matrices and report their shapes.
training_feature, training_classify = generateMat(training_data)
testing_feature, testing_classify = generateMat(testing_data)
for caption, matrix in (("训练矩阵特征维度：", training_feature),
                        ("测试矩阵特征维度：", testing_feature)):
    print(caption, matrix.shape)

# Train the custom classifier from naive_bayes_classifier on the training split.
classifier = naive_bayes_classifier.BinaryNaiveBayesClassifier()
classifier.train(training_feature, training_classify)

# Classify the test samples one row at a time, storing each result in a
# preallocated float array.
predict_classify = np.zeros(len(testing_feature))
for idx, sample in enumerate(testing_feature):
    predict_classify[idx] = classifier.classify(sample)

# Compute the evaluation metrics.
# A classifier output of at least 0.5 is counted as a positive (label 1)
# prediction, anything below 0.5 as a negative (label 0) prediction.
TN = FP = TP = FN = 0
for actual, score in zip(testing_classify, predict_classify):
    if actual == 0 and score < 0.5:
        TN += 1
    elif actual == 0 and score >= 0.5:
        FP += 1
    elif actual == 1 and score >= 0.5:
        TP += 1
    elif actual == 1 and score < 0.5:
        FN += 1

# A metric whose denominator is zero (no predicted positives, no actual
# positives, or an empty test set) is reported as 0.0 instead of raising
# ZeroDivisionError.
total = TN + TP + FN + FP
p = TP / (TP + FP) if TP + FP else 0.0
r = TP / (TP + FN) if TP + FN else 0.0
f1 = 2 * p * r / (p + r) if p + r else 0.0
print("Accuracy：", (TN + TP) / total if total else 0.0)
print("Precision：",p)
print("Recall：", r)
print("F1 Score:", f1)

训练矩阵特征维度： (2418, 3746)
测试矩阵特征维度： (850, 3746)
Accuracy： 0.9423529411764706
Precision： 0.8924731182795699
Recall： 0.680327868852459
F1 Score: 0.7720930232558139
