In [1]:
# coding: utf-8
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix
from tqdm import tqdm
from math import *

In [2]:
#文件预处理成tsv格式
def read_file(file_name):
    with open(file_name,'r',encoding='utf-8') as f:
        data = f.readlines()
    label_map = {}
    label_map["ham"] = 0
    label_map["spam"] = 1
    
    
    text = list()
    label = list()
    label_num = list()
    for item in data:
        item = item.strip()
        item = item.split(" ")
        item_len = len(item)
        tmp_list = list()
        if(item[1] in ["ham","spam"]):
            label.append(item[1])
            label_num.append(label_map[item[1]])
            for index in range(2,item_len,2):
                tmp_list.append(item[index])
        text.append(tmp_list)
    return text,label,label_num


train_text,train_label,train_label_num = read_file("../data/train")
test_text,test_label,test_label_num = read_file("../data/test")


train_text = [" ".join(item) for item in train_text]
test_text = [" ".join(item) for item in test_text]



cv = CountVectorizer()
part_fit = cv.fit(train_text) # 以部分句子为参考
train_all_count = cv.transform(train_text) # 对训练集所有邮件统计单词个数
test_all_count = cv.transform(test_text) # 对测试集所有邮件统计单词个数
tfidf = TfidfTransformer()
train_tfidf_matrix = tfidf.fit_transform(train_all_count)
test_tfidf_matrix = tfidf.fit_transform(test_all_count)



print('训练集', train_tfidf_matrix.shape)
print('测试集', test_tfidf_matrix.shape)

#训练数据
train_tfidf_matrix_1 = train_tfidf_matrix.toarray()
#测试数据
test_tfidf_matrix1 = test_tfidf_matrix.toarray()


训练集 (9000, 965)
测试集 (1000, 965)


In [114]:
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import StandardScaler


# PCA
pca = PCA(n_components=0.5)# 保证降维后的数据保持90%的信息
pca.fit(train_tfidf_matrix_1)
pca_train_tfidf_matrix_1 = pca.transform(train_tfidf_matrix_1)
pca_train_tfidf_matrix_1.shape
pca_test_tfidf_matrix_1 = pca.transform(test_tfidf_matrix1)
print(pca_train_tfidf_matrix_1.shape)

(9000, 113)



### 分析：
0. 处理数据排序
1. 建立决策树
2. 根据决策树进行预测


In [212]:
#预处理数据 空间换时间
#输入处理好的训练数据，输出是每个特征均进行排序，
#且train_label变成特征相同的列数
def sub_tree(train_data,train_label,segment_x):
    return train_data[:segment_x+1],train_label[:segment_x+1],train_data[segment_x+1:],train_label[segment_x+1:]
#获取信息增益比i是列，j是行
def get_ratio(i,j,train_data,train_label):
    m = train_data.shape[0]
    s_one = train_label[:j+1].sum()
    b_one = train_label[j+1:].sum()
    a = (j+1)/m*(1-pow(s_one/(j+1),2)*pow((j+1-s_one)/(j+1),2))
    b = (m-j-1)/m*(1-pow(b_one/(m-j-1),2)*pow((m-j-1-b_one)/(m-j-1),2))
    return a+b
    
      
    

'''
    输入：训练数据
    输出：对应信息增益比最大的特征和特征中的分割点
'''
def calculate_f_s_i(train_data,train_label):
    max_ratio = 100
    max_feature = 0
    segment_point = 0
    segment_x = 0
    tmp_data = train_data
    tmp_label = train_label
    n = train_data.shape[1]
    m = train_data.shape[0]
    for i in (range(n)):
        #按照列排序
        index = np.argsort(train_data[:,i])
        train_label = np.array([train_label[inx] for inx in index])
        train_data = np.array([train_data[inx] for inx in index])
        for j in range(m-1):
            ratio = get_ratio(i,j,train_data,train_label)
            if(ratio<max_ratio):
                max_ratio = ratio
                max_feature = i
                segment_point = (train_data[j][i]+train_data[j+1][i])/2
                segment_x = j
                tmp_data = train_data
                tmp_label = train_label
    return max_feature,segment_point,max_ratio,segment_x,tmp_data,tmp_label

def Major_class(train_label):
    s = train_label.sum()
    if(s>len(train_label)/2):
        return 1
    return 0

'''
    输入训练数据和标签，递归返回一个树的结构，每次均是二叉树，连续数据的中间分类
    tree[分割特征][分割浮点数][小于浮点数 or 大于浮点数]
    1. 获取信息增益比最大的特征和分割点
    2. 判断是否可以结束程序
    3. 根据分割的特征和分割点进入下一层树，之前需要对数据进行分割
'''
def create_tree(train_data,train_label,depth):
    #获取信息增益比最大的特征、分割点和对应的信息增益比
    feature,segment_point,inf_gain_ratio,segment_x,tmp_data,tmp_label = calculate_f_s_i(train_data,train_label)
    #结束条件
    train_label_dict = {i for i in train_label}
    if(len(train_label_dict)==1):
        return train_label[0]
    #深度剪枝
    if(depth>=16):
        return Major_class(train_label)
    #广度剪枝
#     if(train_data.shape[0]<5):
#         return Major_class(train_label)
    
    #基尼系数剪枝
    #进入下一层，需要定义一个分割函数
    result = {feature:{segment_point:{}}}
    a,b,c,d = sub_tree(tmp_data,tmp_label,segment_x)
    result[feature][segment_point][0] = create_tree(a,b,depth+1)
    result[feature][segment_point][1] = create_tree(c,d,depth+1)
    return result

#测试数据
# result = {feature:{segment_point:{}}}
def get_test_result(test_data,tree):
    m,n = test_data.shape[0],test_data.shape[1]
    result_list = list()
    tmp = tree
    for i in range(m):
        single_test = test_data[i]
        
        while True:
            if(type(tree).__name__ != 'dict'):
                result_list.append(tree)
                break
            (key, value), = tree.items()
            dataVal = single_test[key]
            (k,v), = value.items()
            if(dataVal<k):
                tree = v[0]
            else:
                tree = v[1]
        tree = tmp
    return result_list

0.889

In [246]:
result_list = list()
for i in range(200):
    split_num = 40
    pca_train_tfidf_matrix_2 = pca_train_tfidf_matrix_1[i*split_num:(i+1)*split_num]
    train_label_num2 = train_label_num[i*split_num:(i+1)*split_num]
    result = create_tree(pca_train_tfidf_matrix_2,train_label_num2,0)
    test_result = get_test_result(pca_test_tfidf_matrix_1,result)
    print(accuracy_score(test_result,test_label_num))
    if(test_result>0.88):
        result_list.append(test_result)
    

0.889
0.823
0.801
0.851
0.844
0.838
0.846
0.843
0.835
0.831
0.849
0.843
0.818
0.824
0.845
0.808
0.848
0.795
0.607
0.814


In [271]:
result_list_np = np.array(result_list)
result_list_n = result_list_np.sum(axis=0)
r = [int(item>=19) for item in result_list_n]
print(accuracy_score(r,test_label_num))

0.817


0.817
