In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tflearn.layers.normalization import local_response_normalization
import os
import numpy as np
import tensorflow as tf
import tflearn
from sklearn.model_selection import train_test_split
from  sklearn import naive_bayes
import sklearn.metrics as metrics

Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
#将整个邮件当成一个字符串处理，其中回车和换行需要过滤掉
def load_one_file(filename):
    x=""
    with open(filename,'r',encoding='latin-1') as f:  #因为一些邮件没有使用统一码，这条语句试图正确解码文件。
        for line in f:
            line=line.strip('\n')
            line=line.strip('\r')
            x+=line
    return x

In [3]:
#遍历指定文件夹下所有文件，加载数据
def load_files_from_dir(rootdir):
    x=[]
    list = os.listdir(rootdir)
    for i in range(0, len(list)):
        path = os.path.join(rootdir, list[i])
        if os.path.isfile(path):
            v=load_one_file(path)
            x.append(v)
    return x

In [4]:
#加载所在的文件夹，正常邮件在ham中，垃圾邮件在spam中。

datadir = './data_6'

def load_all_files():
    ham = []
    spam = []
    for i in range(1,7):
        path = datadir + '/enron%d/ham/' %i
        print("Load %s"%path)
        ham += load_files_from_dir(path)
        
        path = datadir + '/enron%d/spam/' %i
        print("Load %s"%path)
        spam += load_files_from_dir(path)
        
    return ham, spam       ##正常文件：ham；垃圾邮件：spam

In [5]:
#使用词袋模型，向量化邮件样本，ham标记为0，spam标记为1
def get_features_by_wordbag():
    ham, spam=load_all_files()
    x=ham+spam
    y=[0]*len(ham)+[1]*len(spam)
    vectorizer = CountVectorizer(
                                 decode_error='ignore', #处理解码失败的方式
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    print(vectorizer)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    return x,y

#使用词袋模型+TF-IDF
def get_features_by_wordbag_tfidf():
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(
    decode_error = 'ignore',  #处理解码失败的方式
    strip_accents = 'ascii',  #在预处理步骤中移除重音的方式
    max_features = max_features, #词袋特征个数的最大值
    stop_words = 'english',  #判断word结束的方式
    max_df = 1.0,  #df最大值
    min_df = 1,  #df最小值
    binary = True) #默认为False，与TF-IDF结合时需要设置为True
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf = False)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y

In [6]:
#构建贝叶斯模型
def do_nb_wordbag(x_train, x_test, y_train, y_test):
    print("NB and wordbag")
    gnb = naive_bayes.GaussianNB()
    gnb.fit(x_train,y_train)
    y_pred=gnb.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

In [9]:
if __name__ == "__main__":
    max_features=5000
    max_document_length=100
    
    print("Hello spam-mail")
    #print("get_features_by_wordbag")
    #x,y=get_features_by_wordbag()
    print("get_features_by_wordbag_tfidf")
    
    x,y=get_features_by_wordbag_tfidf()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)  #测试集比例为20%
    do_nb_wordbag(x_train, x_test, y_train, y_test)

Hello spam-mail
get_features_by_wordbag_tfidf
Load ./data_6/enron1/ham/
Load ./data_6/enron1/spam/
Load ./data_6/enron2/ham/
Load ./data_6/enron2/spam/
Load ./data_6/enron3/ham/
Load ./data_6/enron3/spam/
Load ./data_6/enron4/ham/
Load ./data_6/enron4/spam/
Load ./data_6/enron5/ham/
Load ./data_6/enron5/spam/
Load ./data_6/enron6/ham/
Load ./data_6/enron6/spam/
NB and wordbag
0.9595195729537367
[[3211   43]
 [ 230 3260]]
