## 全局参数

In [1]:
import os

#数据集目录
BASE_PATH = '/Users/hydeli/workspace/private/keep-ml/homework-1/data/data1/20_newsgroups/'

#分类文件夹
#DATA_FOLDERS = ['alt.atheism', 'misc.forsale']
DATA_FOLDERS = [name for name in os.listdir(BASE_PATH) if not os.path.isfile(BASE_PATH + name)]

#训练数据集样本数
TRAIN_DATA_LIMITATION = 500

#测试数据集样本数
TEST_DATA_LIMITATION = 100

#平滑因子
SMOOTHING = 0.0001


## 导入数据

把所有数据加载到一个两层的字典，第一层为 news_group_name，第二层为文件全路径和文件内容


In [2]:
import os

def open_and_read(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        data = file.read()
        return data
    
def load_ori_data():
    ori_datas = {}
    print ('加载样本文件 :')
    for folder_name in DATA_FOLDERS:
        folder_path = BASE_PATH + folder_name + '/'
        datas_in_group = {}
        for idx, file_name in enumerate(name for name in os.listdir(folder_path) if os.path.isfile(folder_path + name)):
            datas_in_group[file_name] = open_and_read(folder_path + file_name)
            if idx == TRAIN_DATA_LIMITATION+TEST_DATA_LIMITATION-1:
                break
        ori_datas[folder_name] = datas_in_group
        print ('从 [%s] 导入 %s 份样本' % (folder_name, len(datas_in_group)))
    return ori_datas

ori_datas = load_ori_data()

加载样本文件 :
从 [talk.politics.mideast] 导入 600 份样本
从 [rec.autos] 导入 600 份样本
从 [comp.sys.mac.hardware] 导入 600 份样本
从 [alt.atheism] 导入 600 份样本
从 [rec.sport.baseball] 导入 600 份样本
从 [comp.os.ms-windows.misc] 导入 600 份样本
从 [rec.sport.hockey] 导入 600 份样本
从 [sci.crypt] 导入 600 份样本
从 [sci.med] 导入 600 份样本
从 [talk.politics.misc] 导入 600 份样本
从 [rec.motorcycles] 导入 600 份样本
从 [comp.windows.x] 导入 600 份样本
从 [comp.graphics] 导入 600 份样本
从 [comp.sys.ibm.pc.hardware] 导入 600 份样本
从 [sci.electronics] 导入 600 份样本
从 [talk.politics.guns] 导入 600 份样本
从 [sci.space] 导入 600 份样本
从 [soc.religion.christian] 导入 600 份样本
从 [misc.forsale] 导入 600 份样本
从 [talk.religion.misc] 导入 600 份样本


## 样本划分
将数据随机划分为训练集和测试集
每个数据集包含样本数由全局参数 TRAIN_DATA_LIMITATION 和 TEST_DATA_LIMITATION 定义。

In [3]:
import copy
import random

def random_split_data_set(ori_datas):
    train_datas = {}
    test_datas = {}
    for (group, datas_in_group) in ori_datas.items():
        copies = copy.deepcopy(datas_in_group)
        
        train_subset = {}
        while len(train_subset) < TRAIN_DATA_LIMITATION:
            doc_name, txt = random.choice(list(copies.items()))
            train_subset[doc_name] = txt
            copies.pop(doc_name)
        train_datas[group] = train_subset
        
        test_subset = {}
        while len(test_subset) < TEST_DATA_LIMITATION:
            doc_name, txt = random.choice(list(copies.items()))
            test_subset[doc_name] = txt
            copies.pop(doc_name)
        test_datas[group] = test_subset
        
    return train_datas, test_datas

train_datas, test_datas = random_split_data_set(ori_datas)


## 数据预处理
去掉标点符号，转为小写，分词后提取词干，过滤停用词

In [4]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def word_tokenize(txt):
    #将标点符号转置为空格
    translator = str.maketrans(string.punctuation, ' ' * (len(string.punctuation)))
    txt = txt.translate(translator)
    
    #大写转小写
    txt = txt.lower()
    
    #分词
    tokens = nltk.tokenize.word_tokenize(txt)

    sw_set = set(stopwords.words('english'))
    tokenized = []
    for token in tokens:
        #词干提取
        token = PorterStemmer().stem(token)
        #过滤停用词
        if token not in sw_set and not token.isdigit():
            tokenized.append(token)
    return tokenized


tokenized_datas = {}
for (group, datas_in_group) in train_datas.items():
    tokenized_in_group = {}
    for (file_name, txt) in datas_in_group.items():
        tokenized_in_group[file_name] = word_tokenize(txt)
    tokenized_datas[group] = tokenized_in_group

## 计算 tf-idf

tf：词频
idf：逆文档频率
tf-idf：tf * idf 
tf-idf 是重要性调整系数，衡量一个词是不是常见词。如果某个词比较少见，但是它在这篇文章中多次出现，那么它很可能就反映了这篇文章的特性，正是我们所需要的关键词。

In [5]:
import nltk
from math import log


def freq(tokens):
    return nltk.FreqDist(tokens)

def cal_tf_idf(tokenized_datas):

    #计算 tf 和 idf
    tf = {}
    idf = {}
    freq_in_all={}
    file_count = 0
    for (group, datas_in_group) in tokenized_datas.items():
        freq_in_group = nltk.FreqDist()
        for (file_name, tokens) in datas_in_group.items():
            freq_in_file = freq(tokens)
            
            # tf
            freq_in_group = freq_in_group + freq_in_file
            
            #idf
            file_count += 1
            for token in freq_in_file.keys():
               if freq_in_all.get(token) is None:
                   freq_in_all[token] = 1
               else:
                   freq_in_all[token] += 1
                    
        tf[group] = freq_in_group
        
    for (token, count) in freq_in_all.items():
        idf[token] = log(file_count/count+1)
    
    #计算tf-idf
    for (group, tf_in_group) in tf.items():
        for (token, count) in tf_in_group.items():
            tf_in_group[token] = count * idf[token]
    return tf

tf_idf = cal_tf_idf(tokenized_datas)



## 计算朴素贝叶斯的 likelihood

In [6]:
def likelihood(new_tf, group_tf_idf):
    word_count_in_group = group_tf_idf.N()
    lh = 0
    for token in new_tf.keys():
        word_tf_idf = 0 if group_tf_idf.get(token) is None else group_tf_idf.get(token)
        power = new_tf[token]
        lh += log((word_tf_idf + SMOOTHING) / word_count_in_group) * power
    return lh

def max_likelihood(txt, tf_idf):
    tp_list = []
    new_tf = freq(word_tokenize(txt))
    for (group, group_tf_idf) in tf_idf.items():
        tp_list.append((group, likelihood(new_tf, tf_idf[group])))
    max_tp = max(tp_list, key=lambda item: item[1])
    return max_tp

## 测试
验证对测试集的分类准确率

In [7]:

def acc(test_datas, tf_idf):
    total = 0
    error = 0
    tag_stat = []
    for (group, datas_in_group) in test_datas.items():
        tag_total = 0
        tag_error = 0
        for (key, txt) in datas_in_group.items():
            max_tp = max_likelihood(txt, tf_idf)
            total += 1
            tag_total += 1
            if (group != max_tp[0]):
                error += 1
                tag_error += 1
        tag_stat.append((group, tag_error, tag_total, tag_error / tag_total))
        print('tag: %s, error: %s, total:%s, error_rate:%s' % (group, tag_error, tag_total, tag_error / tag_total))
    print('error: %s, total:%s, error_rate:%s' % (error, total, error / total))

acc(test_datas, tf_idf)

tag: talk.politics.mideast, error: 5, total:100, error_rate:0.05
tag: rec.autos, error: 4, total:100, error_rate:0.04
tag: comp.sys.mac.hardware, error: 11, total:100, error_rate:0.11
tag: alt.atheism, error: 15, total:100, error_rate:0.15
tag: rec.sport.baseball, error: 5, total:100, error_rate:0.05
tag: comp.os.ms-windows.misc, error: 44, total:100, error_rate:0.44
tag: rec.sport.hockey, error: 5, total:100, error_rate:0.05
tag: sci.crypt, error: 5, total:100, error_rate:0.05
tag: sci.med, error: 3, total:100, error_rate:0.03
tag: talk.politics.misc, error: 25, total:100, error_rate:0.25
tag: rec.motorcycles, error: 7, total:100, error_rate:0.07
tag: comp.windows.x, error: 14, total:100, error_rate:0.14
tag: comp.graphics, error: 28, total:100, error_rate:0.28
tag: comp.sys.ibm.pc.hardware, error: 21, total:100, error_rate:0.21
tag: sci.electronics, error: 14, total:100, error_rate:0.14
tag: talk.politics.guns, error: 9, total:100, error_rate:0.09
tag: sci.space, error: 6, total:100,