# 使用朴素贝叶斯分类器对邮件分类

使用数据集：Enron-Spam

In [1]:
import os

# Directory that holds the downloaded archives and their extracted folders.
# Set the ENRON_SPAM_DATA_DIR environment variable to point the notebook at a
# different location; the original hard-coded path is kept as the default.
datadir = os.environ.get(
    'ENRON_SPAM_DATA_DIR',
    '/home/tarena/class/book_and_code/Machine_Learning_for_OpenCV/opencv-machine-learning/notebooks/data/chapter7',
)

In [2]:
# Class labels: legitimate mail (ham) is 0, spam is 1.
HAM, SPAM = 0, 1

# (archive file name, class label) pairs, ham archives first, then spam.
sources = [
    (archive, label)
    for label, archives in (
        (HAM, (
            'beck-s.tar.gz',
            'farmer-d.tar.gz',
            'kaminski-v.tar.gz',
            'kitchen-l.tar.gz',
            'lokay-m.tar.gz',
            'williams-w3.tar.gz',
        )),
        (SPAM, (
            'BG.tar.gz',
            'GP.tar.gz',
            'SH.tar.gz',
        )),
    )
    for archive in archives
]

In [None]:
# Unpack the archives listed in `sources` into `datadir`.
# The whole cell is commented out and has no execution count, so it was not
# run in this session; uncomment it if the archives still need extracting.
# def extract_tar(datafile, extractdir):
#     try:
#         import tarfile
#     except ImportError:
#         raise ImportError("do not have tarfile installed.")
#     tar = tarfile.open(datafile)
#
#     tar.extractall(path=extractdir)
#     tar.close()
#     print("%s successfully extracted to %s" % (datafile, extractdir))
#
#
# for source, _ in sources:
#     # print(source)
#     datafile = '%s/%s' % (datadir, source)
#     # print(datafile)
#     extract_tar(datafile, datadir)

In [3]:
# 定义函数：用于读取文件内容
import os
def read_single_file(filename):
    """
    Read one e-mail file and return its body text.

    Every line up to and including the first blank line is treated as the
    header and skipped; only the lines after it are kept.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(filename, content)`` tuple. ``content`` is the kept lines joined
        with a newline character. It is an empty string when ``filename`` is
        not a regular file or when the file contains no blank line.
    """
    lines = []

    if os.path.isfile(filename):
        # The context manager closes the handle even if reading raises, and
        # nothing is closed when the path is not a file (the original called
        # f.close() unconditionally and failed on an unbound `f`).
        with open(filename, encoding="latin-1") as f:
            past_header = False
            for line in f:
                if past_header:
                    lines.append(line)
                elif line == '\n':
                    # First blank line marks the end of the header
                    past_header = True

    # Each kept line still ends with its own newline, so the join leaves an
    # extra blank line between them; this matches the original output.
    content = '\n'.join(lines)
    return filename, content

def read_files(path):
    """
    Walk the directory tree under path and lazily yield the result of
    read_single_file for every file found.
    """
    for current_dir, _subdirs, names in os.walk(path):
        yield from (
            read_single_file(os.path.join(current_dir, name))
            for name in names
        )

### 使用pandas构建数据矩阵

In [7]:
import pandas as pd

# pandas demo: build a small DataFrame from a dict of equal-length lists,
# one key per column. Each row pairs a naive Bayes variant with the
# constructor named in the 'class' column.
# NOTE(review): 'Multinamial' below looks like a typo for 'Multinomial'. It is
# a runtime string, so fixing it also means re-running the cell to refresh the
# stored output.
pd.DataFrame({
    'model': [
        'Normal Bayes',
        'Multinamial Bayes',
        'Bernoulli Bayes'
    ],
    'class': [
        'cv2.ml.NormalBayesClassifier_create()',
        'sklearn.naive_bayes.MultinomialNB()',
        'sklearn.naive_bayes.BernoulliNB()'
    ]
})

Unnamed: 0,model,class
0,Normal Bayes,cv2.ml.NormalBayesClassifier_create()
1,Multinamial Bayes,sklearn.naive_bayes.MultinomialNB()
2,Bernoulli Bayes,sklearn.naive_bayes.BernoulliNB()


In [9]:
def build_data_frame(extractdir, classification):
    """
    Collect every file under extractdir into a DataFrame.

    The frame is indexed by file name and has two columns: 'text' (the
    content returned by read_files) and 'class' (the given classification,
    repeated on every row).
    """
    records = list(read_files(extractdir))
    file_names = [name for name, _ in records]
    rows = [{'text': body, 'class': classification} for _, body in records]
    return pd.DataFrame(rows, index=file_names)


# Build one DataFrame per source and combine them in a single concat.
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the accumulated frame on every iteration.
frames = []
for source, classification in sources:
    # source[:-7] drops the 7-character '.tar.gz' suffix; the archive is
    # expected to have been extracted into a directory of that name.
    extractdir = '%s/%s' % (datadir, source[:-7])
    frames.append(build_data_frame(extractdir, classification))

# Rows keep their file-name index; columns are 'text' and 'class'.
data = pd.concat(frames)

## 数据预处理
- CountVectorizer 统计各个单词出现的次数

In [10]:
from sklearn import feature_extraction
# Turn every e-mail body into word-count features.
counts = feature_extraction.text.CountVectorizer()  # counts how often each word occurs
# One row per e-mail, one column per vocabulary term
x = counts.fit_transform(data['text'].values)
print(x.shape)  # (52076, 643270)
# Labels taken from the 'class' column (HAM = 0, SPAM = 1)
y = data['class'].values

(52076, 643270)


In [11]:
# Display the repr of the count matrix produced by CountVectorizer
x

<52076x643270 sparse matrix of type '<class 'numpy.int64'>'
	with 8607632 stored elements in Compressed Sparse Row format>

## 训练贝叶斯分类器
scikit-learn将数据保存在一个稀疏矩阵（x）中
- opencv无法处理稀疏矩阵

使用完整的数据集进行训练
- 使用sklearn
- naive_bayes.MultinomialNB 是最适合处理分类数据（比如单词计数）的朴素贝叶斯分类器版本

In [12]:
from sklearn import model_selection as ms
# Hold out 20% of the samples for testing; random_state=42 makes the split
# repeatable.
x_train, x_test, y_train, y_test = ms.train_test_split(
    x, y, test_size=0.2, random_state=42
)
from sklearn import naive_bayes
# Multinomial naive Bayes trained on the raw word counts
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(x_train, y_train)
# Score on the training split, then on the held-out split
print(model_naive.score(x_train, y_train))  # 0.9511281805088814
print(model_naive.score(x_test, y_test))  # 0.9452764976958525

0.9511281805088814
0.9452764976958525


# 使用n-gram提升结果——重复数据分割和训练分类器的过程
- 缺点：对于更长的文本无法有效地评估权重

In [13]:
# Rebuild the features with ngram_range=(1, 2): single words plus two-word
# sequences. This rebinds `counts` and `x` from the earlier cell.
counts = feature_extraction.text.CountVectorizer(ngram_range=(1, 2))  # count 1-grams and 2-grams

x = counts.fit_transform(data['text'].values)
# Same split parameters as before (`ms` and `naive_bayes` come from the
# previous training cell)
x_train, x_test, y_train, y_test = ms.train_test_split(
    x, y, test_size=0.2, random_state=42
)
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(x_train, y_train)
# Score on the training split, then on the held-out split
print(model_naive.score(x_train, y_train))  # 0.9835333653384541
print(model_naive.score(x_test, y_test))  # 0.9713901689708141

0.9835333653384541
0.9713901689708141


In [14]:
from sklearn import metrics
# Confusion matrix for the held-out split: rows are the true classes and
# columns the predicted classes, in label order (0 = HAM, 1 = SPAM).
print(metrics.confusion_matrix(y_test, model_naive.predict(x_test)))

[[3824    6]
 [ 292 6294]]


# 使用TF-IDF提升结果
- 通过计算单词在整个数据集中出现的频率来对单词计数分配权重

In [15]:
# Re-weight the n-gram counts in `x` with TF-IDF.
# NOTE(review): the variable is spelled `tdidf`, although the class is
# TfidfTransformer.
tdidf = feature_extraction.text.TfidfTransformer()

# NOTE(review): the transformer is fitted on all of `x` before the split
# below, so its weights are computed from test rows as well; confirm this is
# acceptable for the reported test score.
x_new = tdidf.fit_transform(x)
x_train, x_test, y_train, y_test = ms.train_test_split(
    x_new, y, test_size=0.2, random_state=42
)

model_naive = naive_bayes.MultinomialNB()
# Last expression of the cell, so the fitted estimator's repr is displayed
model_naive.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Score of the TF-IDF model on the training split, then on the held-out split
print(model_naive.score(x_train, y_train))  # 0.9958233317330772
print(model_naive.score(x_test, y_test))  # 0.9906874039938556

0.9958233317330772
0.9906874039938556


In [17]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test, model_naive.predict(x_test)))
# [[3738   92]    3738 class-0 samples classified correctly; 92 class-0 samples misclassified as class 1
#  [   5 6581]]   5 class-1 samples misclassified as class 0; 6581 class-1 samples classified correctly

[[3738   92]
 [   5 6581]]
