## 0.导入库

### 0.1 常用的库

In [1]:
import os
import numpy as np
import time

### 0.2 需要使用的库

In [2]:
import pandas as pd
import pickle
import re

### 0.3 基本方法

In [3]:
import sys
def print_flush(print_string):
    """Print ``print_string`` terminated by a carriage return, then flush stdout.

    Ending the line with a carriage return instead of a newline puts the cursor
    back at the start of the line, so the next call overwrites this text. The
    notebook uses this to refresh a progress line in place.
    """
    print(print_string, end='\r', flush=True)

# 导入深度学习库tensorflow    
import tensorflow as tf    
def get_session():
    """Create a TensorFlow session whose GPU memory allocation grows on demand."""
    session_config = tf.ConfigProto()
    # allow_growth makes the session claim GPU memory gradually rather than all at once.
    session_config.gpu_options.allow_growth = True
    return tf.Session(config=session_config)

  from ._conv import register_converters as _register_converters


## 1. 文件内容的列表

### 1.1 文本文件路径的列表

In [5]:
def get_txtFilePath_list(root_dirPath):
    """Collect the paths of all files located one level below ``root_dirPath``.

    Only the immediate sub-directories of ``root_dirPath`` are visited, and only
    the files directly inside each of them are returned (no deeper recursion).
    Paths are grouped by sub-directory, in the order ``os.walk`` reports them.
    """
    # The first tuple yielded by os.walk describes the directory itself:
    # (dirpath, sub-directory names, file names).
    _, category_dir_names, _ = next(os.walk(root_dirPath))
    collected_paths = []
    for category_dir_name in category_dir_names:
        category_dir_path = os.path.join(root_dirPath, category_dir_name)
        _, _, file_names = next(os.walk(category_dir_path))
        collected_paths += [os.path.join(category_dir_path, name) for name in file_names]
    return collected_paths


# Root directory whose sub-directories are scanned for text files.
root_dirPath = '../resources/THUCNews/'
txtFilePath_list = get_txtFilePath_list(root_dirPath)
# Report how many file paths were found.
print('文本文件路径的列表长度:', len(txtFilePath_list))

文本文件路径的列表长度: 836075


### 1.2 读取所有文本文件

In [None]:
def get_fileContent(txtFilePath):
    """Return the whole content of the UTF-8 text file at ``txtFilePath`` as one string."""
    with open(txtFilePath, encoding='utf8') as text_file:
        return text_file.read()


# Only the first sequence_length characters of every article are kept.
sequence_length = 600
sample_quantity = len(txtFilePath_list)
startTime = time.time()
content_list = []
for index, txtFilePath in enumerate(txtFilePath_list, 1):
    fileContent = get_fileContent(txtFilePath)
    # Collapse every run of whitespace into one space. The pattern is a raw string:
    # '\s' inside a normal string literal is an invalid escape sequence, which
    # newer Python versions warn about.
    fileContent_1 = re.sub(r'\s+', ' ', fileContent)
    fileContent_2 = fileContent_1[:sequence_length]
    content_list.append(fileContent_2)
    # Refresh the progress line every 100 files and once more at the very end.
    if index % 100 == 0 or index == sample_quantity:
        percent = index / sample_quantity * 100
        # The bar is 50 characters wide, i.e. one '>' per 2 percent.
        half_percent_int = int(percent) // 2
        string_0 = '%d/ %d ' %(index, sample_quantity)
        string_1 = '>' * half_percent_int + ' ' * (50-half_percent_int)
        string_2 = ' 进度百分比:%.2f%%' %percent
        usedTime = time.time() - startTime
        # A coarse clock can report zero elapsed time; avoid dividing by it.
        speed = index / usedTime if usedTime > 0 else 0.0
        string_3 = ' 读取速度:%.2f文件/秒' %speed
        string_4 = ' 总共花费时间:%.2f秒' %(usedTime)
        print_string = string_0 + string_1 + string_2 + string_3 + string_4
        print_flush(print_string)

836075/ 836075 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 进度百分比:100.00% 读取速度:2151.31文件/秒 总共花费时间:388.64秒


### 1.3 把文件内容列表保存为pickle文件

In [7]:
# Cache content_list on disk so the slow file-reading step can be skipped later.
pickleFilePath = '../resources/content_list.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(content_list, file)

### 1.4 从pickle文件加载文件内容列表

In [4]:
# Reload the cached content_list from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
pickleFilePath = '../resources/content_list.pickle'
with open(pickleFilePath, 'rb') as file:
    content_list = pickle.load(file)

## 2. 样本标签的列表

### 2.1 获取样本标签的列表

In [5]:
def get_label_list(root_dirPath):
    """Return one label per file found one level below ``root_dirPath``.

    The label of a file is the name of the sub-directory that contains it.
    Labels are emitted sub-directory by sub-directory, in the order ``os.walk``
    reports the sub-directories, with one entry for every file directly inside.
    """
    _, category_dir_names, _ = next(os.walk(root_dirPath))
    labels = []
    for category_dir_name in category_dir_names:
        category_dir_path = os.path.join(root_dirPath, category_dir_name)
        _, _, file_names = next(os.walk(category_dir_path))
        # Repeat the directory name once for each file it holds.
        labels += [category_dir_name for _ in file_names]
    return labels


root_dirPath = '../resources/THUCNews/'
label_list = get_label_list(root_dirPath)
print('样本标签的列表长度:', len(label_list))
# Per-category sample counts. Series.value_counts() is used because the top-level
# pd.value_counts() function is deprecated in recent pandas releases.
pd.Series(label_list).value_counts()

样本标签的列表长度: 836075


科技    162929
股票    154398
体育    131604
娱乐     92632
时政     63086
社会     50849
教育     41936
财经     37098
家居     32586
游戏     24373
房产     20050
时尚     13368
彩票      7588
星座      3578
dtype: int64

## 3. 字列表

### 3.1 根据文件内容列表，统计计数获得出现次数排名前6999的字
#### 排名7000以后的字统一用'PAD'表示

In [13]:
from collections import Counter

def get_word_list(content_list, size):
    """Build a vocabulary of ``size`` entries from the characters in ``content_list``.

    Every string in ``content_list`` is counted character by character. The
    result starts with the placeholder 'PAD' followed by the ``size - 1`` most
    frequent characters, most frequent first. A progress line is refreshed
    every 1000 strings and after the last one.
    """
    began_at = time.time()
    char_counter = Counter()
    total = len(content_list)
    for done, text in enumerate(content_list, 1):
        char_counter.update(text)
        # Skip the progress report unless this is a multiple of 1000 or the last item.
        if done % 1000 != 0 and done != total:
            continue
        counts_part = '%d/ %d' %(done, total)
        percent_part = ' 进度百分比: %.2f%%' %(done/total*100)
        time_part = ' 花费时间: %.2f秒' %(time.time() - began_at)
        print_flush(counts_part + percent_part + time_part)
    frequent_chars = [char for char, _ in char_counter.most_common(size-1)]
    return ['PAD'] + frequent_chars


# Vocabulary size: the 'PAD' placeholder plus the 6999 most frequent characters.
vocabulary_size = 7000
word_list = get_word_list(content_list, vocabulary_size)

836075/ 836075 进度百分比: 100.00% 花费时间: 79.08秒

### 3.2 把字列表保存为pickle文件

In [14]:
# Cache the vocabulary on disk so it does not have to be recounted.
pickleFilePath = '../resources/word_list.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(word_list, file)

### 3.3 从pickle文件加载字列表

In [6]:
# Reload the cached vocabulary from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
pickleFilePath = '../resources/word_list.pickle'
with open(pickleFilePath, 'rb') as file:
    word_list = pickle.load(file)

## 4.数据准备

### 4.1 get_X

In [7]:
import keras

# Sequence length: how many leading characters of each article are fed to the model.
# The value is chosen empirically; 600 assumes that the first 600 characters of an
# article are enough to judge its category.
sequence_length = 600
# Map every vocabulary entry to its position (id) in word_list.
word2id_dict = {word: word_id for word_id, word in enumerate(word_list)}
    
    
def get_id_list(index):
    """Return the character-id list of one article.

    ``index`` is either the article text itself (a ``str``) or a position in
    ``content_list``. Only the first ``sequence_length`` characters are encoded;
    characters missing from ``word2id_dict`` are mapped to id 0.
    """
    if isinstance(index, str):
        content = index
    else:
        content = content_list[index]
    return [word2id_dict.get(word, 0) for word in content[:sequence_length]]


def get_X(indexes):
    """Return the padded id matrix for several articles.

    Each element of ``indexes`` is forwarded to ``get_id_list``; the resulting
    id lists are then padded by Keras ``pad_sequences`` to ``sequence_length``
    columns using its default padding settings.
    """
    id_lists = list(map(get_id_list, indexes))
    return keras.preprocessing.sequence.pad_sequences(id_lists, sequence_length)

Using TensorFlow backend.


### 4.2 get_Y

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on all sample labels so category names can be converted
# to integer ids and back.
labelEncoder = LabelEncoder()
labelEncoder.fit(label_list)
# Number of distinct categories seen by the encoder.
category_quantity = labelEncoder.classes_.shape[0]


def get_Y(indexes):
    """Return the one-hot label matrix for the samples at ``indexes``."""
    # The encoder turns label names into integer class ids; to_categorical then
    # expands those ids into rows of width category_quantity.
    label_id_1d_array = labelEncoder.transform([label_list[k] for k in indexes])
    return keras.utils.to_categorical(label_id_1d_array, category_quantity)

### 4.3 使用带权重的抽样策略，计算每个样本的权重

In [9]:
def get_probability_list(label_list):
    """Return one sampling probability per sample so that categories are balanced.

    Every category receives the same total probability (1 / number of
    categories), shared equally among its samples. A sample from a rare
    category is therefore more likely to be drawn than one from a common
    category, and all returned values sum to 1.

    Args:
        label_list: sequence of category labels, one per sample.

    Returns:
        A list with the probability of each sample, aligned with ``label_list``.
        An empty input yields an empty list.
    """
    # Without samples there are no categories; return early instead of dividing by zero.
    if len(label_list) == 0:
        return []
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    count_series = pd.Series(label_list).value_counts()
    category_weight = 1 / len(count_series)
    # Probability of a single sample = category share / number of samples in that category.
    sample_weight_series = category_weight / count_series
    label2weights_dict = dict(zip(sample_weight_series.index, sample_weight_series))
    return [label2weights_dict[k] for k in label_list]

### 4.4 批量数据生产者线程

In [10]:
import threading
from sklearn.model_selection import train_test_split

# Split the sample indexes (not the data itself) into a training and a test part.
# random_state is fixed so that the split is the same on every run.
sample_quantity = len(label_list)
index_1d_array = np.arange(sample_quantity)
train_index_1d_array, test_index_1d_array = train_test_split(index_1d_array, random_state=2019)
# Sampling probability of every training sample, derived from the training labels.
train_label_list = [label_list[k] for k in train_index_1d_array]
train_probability_list = get_probability_list(train_label_list)
# Number of samples drawn for each training batch (read by BatchDataThread.run).
batch_size = 128


class BatchDataThread(threading.Thread):
    """Background worker that keeps a shared queue supplied with training batches.

    The thread starts itself on construction. While running, it draws
    ``batch_size`` training indexes according to ``train_probability_list``,
    builds the ``(X, Y)`` arrays with ``get_X`` / ``get_Y`` and puts them on the
    queue whenever fewer than 3 batches are waiting. Call ``stop()`` to end it.
    """

    def __init__(self, queue):
        # Daemon thread: a worker that was never stopped must not keep the
        # interpreter alive at shutdown.
        super(BatchDataThread, self).__init__(daemon=True)
        self.queue = queue
        # Explicit stop signal. Polling the private Thread._is_stopped attribute
        # is not a public API and is not available on every Python version.
        self._stop_event = threading.Event()
        self.start()

    def stop(self):
        """Ask the worker to finish; it exits after its current iteration."""
        self._stop_event.set()

    def run(self):
        """Produce batches until ``stop()`` is called."""
        while not self._stop_event.is_set():
            # Only produce when the queue is running low, so memory use stays bounded.
            if self.queue.qsize() < 3:
                selected_indexes = np.random.choice(
                    train_index_1d_array, size=batch_size, p=train_probability_list)
                batch_X = get_X(selected_indexes)
                batch_Y = get_Y(selected_indexes)
                put_tuple = batch_X.astype('int32'), batch_Y.astype('float32')
                self.queue.put(put_tuple)
            # Short pause between polls; returns early as soon as stop() is called.
            self._stop_event.wait(0.0001)

### 4.5 批量数据生成器类

In [11]:
import queue

class BatchDataGenerator:
    """Endless iterator over the batches produced by background worker threads."""

    def __init__(self, worker_quantity=4):
        self.queue = queue.Queue()
        # Each BatchDataThread starts itself on construction and feeds the shared queue.
        for _ in range(worker_quantity):
            BatchDataThread(self.queue)

    def __iter__(self):
        return self

    def __next__(self):
        # Blocks until a batch is available on the queue.
        return self.queue.get()
    
    
# Create the generator with its default number of workers; the training loop pulls batches from it with next().
batchData_generator = BatchDataGenerator()    

## 5.搭建神经网络

In [12]:
# Clear the default graph before building the network.
tf.reset_default_graph()
# Inputs: a batch of character-id sequences and the matching one-hot labels.
X_holder = tf.placeholder(tf.int32, [None, sequence_length])
Y_holder = tf.placeholder(tf.float32, [None, category_quantity])
data_0 = X_holder
# Embedding table with one embedding_size-dimensional vector per vocabulary id.
# vocabulary_size is the same value (7000) that was passed to get_word_list.
vocabulary_size = 7000
embedding_size = 100
layer_1 = tf.get_variable('embedding', [vocabulary_size, embedding_size])
data_1 = tf.nn.embedding_lookup(layer_1, data_0)
# Three parallel 1-D convolutions, all applied to the embedded sequence data_1,
# with kernel sizes 3, 5 and 7 and filter_quantiy output channels each.
# NOTE(review): no activation argument is passed to conv1d — confirm that is intended.
filter_quantiy = 128
layer_2 = tf.layers.conv1d
data_2 = layer_2(data_1, filter_quantiy, 3, padding='SAME')
layer_3 = tf.layers.conv1d
data_3 = layer_3(data_1, filter_quantiy, 5, padding='SAME')
layer_4 = tf.layers.conv1d
data_4 = layer_4(data_1, filter_quantiy, 7, padding='SAME')
# Join the three convolution outputs along the last axis (axis 2).
layer_5 = tf.concat
data_5 = layer_5([data_2, data_3, data_4], axis=2)
# Take the maximum over axis 1 (the sequence positions), leaving one value per channel.
layer_6 = tf.reduce_max
data_6 = layer_6(data_5, [1])
# Fully connected layer followed by ReLU.
layer_7 = tf.layers.dense
fc1_units = 128
data_7 = layer_7(data_6, fc1_units)
layer_8 = tf.nn.relu
data_8 = layer_8(data_7)
# Output layer: one logit per category (data_9) and the softmax probabilities (data_10).
layer_9 = tf.layers.dense
data_9 = layer_9(data_8, category_quantity)
layer_10 = tf.nn.softmax
data_10 = layer_10(data_9)
# Loss: mean softmax cross-entropy, computed from the logits data_9.
layer_11 = tf.nn.softmax_cross_entropy_with_logits_v2
data_11 = layer_11(labels=Y_holder, logits=data_9)
loss = tf.reduce_mean(data_11)
# Training op: Adam optimizer minimising the loss.
learning_rate = 5e-4
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)
# Accuracy: share of samples whose most probable class equals the labelled class.
isCorrect = tf.equal(tf.argmax(Y_holder, 1), tf.argmax(data_10, 1))
accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))

W0827 18:58:50.795184 13400 deprecation.py:506] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0827 18:58:50.811180 13400 deprecation.py:323] From <ipython-input-12-9d557bfe98ff>:11: conv1d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
W0827 18:58:51.027110 13400 deprecation.py:323] From <ipython-input-12-9d557bfe98ff>:22: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.


## 6.参数初始化

In [14]:
# Create the session and run the initializer op for all global variables of the graph.
init = tf.global_variables_initializer()
session = get_session()
session.run(init)

## 7.模型训练

In [15]:
train_steps = 1000
startTime = time.time()
for step in range(1, train_steps+1):
    batch_X, batch_Y = next(batchData_generator)
    feed_dict = {X_holder:batch_X, Y_holder:batch_Y}
    # One optimisation step on the current batch.
    session.run(train, feed_dict)
    if step % 2 != 0:
        continue
    # Every second step: evaluate the same batch again (after the update) and
    # refresh the progress line.
    loss_value, accuracy_value = session.run([loss, accuracy], feed_dict)
    usedTime = time.time() - startTime
    speed = step / usedTime
    print_string = '步数:%d 损失值:%.4f 准确率:%.4f 训练速度:%.2f步/秒' %(
        step, loss_value, accuracy_value, speed)
    print_flush(print_string)

步数:1000 损失值:0.2639 准确率:0.9141 训练速度:10.21步/秒

## 8.模型测试

In [18]:
import warnings
# Suppress every warning from here on. NOTE(review): this is process-wide and also hides warnings unrelated to this cell.
warnings.filterwarnings("ignore")

def predict(input_content):
    """Return the predicted category label for one piece of text."""
    # get_X encodes and pads a list of inputs, so the single text is wrapped in a list.
    X = get_X([input_content])
    probability_2d_array = session.run(data_10, {X_holder:X})
    label_id_1d_array = np.argmax(probability_2d_array, axis=1)
    # Convert the class id back into its category name.
    return labelEncoder.inverse_transform(label_id_1d_array)[0]

# Pick one random sample from the test indexes and compare its true label
# with the model's prediction.
selected_index = np.random.choice(test_index_1d_array, 1)[0]
selected_content = content_list[selected_index]
true_label = label_list[selected_index]
predict_label = predict(selected_content)
print('选出文本内容为: ', selected_content)
print('真实标签: ', true_label)
print('预测标签: ', predict_label, '\n')
# predict() also accepts arbitrary text that is not part of the dataset.
print('对于任意文本做分类预测，例如:')
input_content = "足球篮球"
print('predict("%s") :' %input_content, predict(input_content))

选出文本内容为:  美国大学生有多牛：“我永远是最好的” 美国的大学生无论考分如何低，都觉得自己是最好的；无论观点如何幼稚，都觉得自己是最重要的。大多数美国大学生在课堂上都自我感觉良好，特把自己当根葱。他们在班上当众发言可以侃侃而谈，面对校外的公众演讲也同样从容不迫。美国教育中似乎的确有一个魔幻环节，让学生们年纪轻轻，就怀有这样的自信和从容。 在我看来，美国教育的最亮点在于：它可以把一个学生的数理化都教得很差很差，却让这个学生相信自己画的画很像毕加索的。这个系统完全就是有预谋地忽略学生的缺点，同时拼命发掘和培养学生的优点。因此很多机关巧妙的鼓励教育被发明出来，但是却很少有像样的严厉的惩罚教育。从小到大，学生从家长和老师那里听到的都是：你是最棒的！(即使不是最棒的)，你也是最独特的(这句话非常保险。就是运用最精密的科学计算，这句话也一点都不错)！所以在美国人心里，“与众不同”是一个很有腔势的褒义词。美国大学生做什么都显得自信满满，因为做的好做得差都十分独特，十分“与众不同。” 学生们毕业的时候，甚至有些在还没有大学毕业的时候，就大体能了解到自己擅长和喜欢做的事。大学的四年里，他们可以选不同专业的课、做不同的课外工作、通过实习来尝试到底什么最适合自己。大学教育的成功并不在于灌输了多少知识，而在于让学生有足够的机会了解和发现自己，并且建立更多的自信。学生选择专业与工作的一个十分重要的标准是看自己是否有兴趣。
真实标签:  教育
预测标签:  教育 

对于任意文本做分类预测，例如:
predict("足球篮球") : 体育


## 9.混淆矩阵

In [19]:
from sklearn.metrics import confusion_matrix

def predict_test(batch_size=100):
    """Predict the category label of every sample in ``test_index_1d_array``.

    The test indexes are processed in consecutive batches, and a progress line
    is refreshed after every batch.

    Args:
        batch_size: number of samples sent through the network per batch.

    Returns:
        An array of predicted label names, aligned with ``test_index_1d_array``.
        An empty array is returned when there are no test samples.
    """
    startTime = time.time()
    test_sample_quantity = len(test_index_1d_array)
    # With no samples there is nothing to stack or argmax over.
    if test_sample_quantity == 0:
        return np.array([])
    predict_Y_list = []
    for i in range(0, test_sample_quantity, batch_size):
        part_index_1d_array = test_index_1d_array[i: i + batch_size]
        # get_X looks the article texts up itself, so only the indexes are needed here.
        batch_X = get_X(part_index_1d_array)
        predict_Y = session.run(data_10, {X_holder:batch_X})
        predict_Y_list.extend(predict_Y)
        usedTime = time.time() - startTime
        print_string = '%d/ %d 花费时间:%.2f秒' %(i, test_sample_quantity, usedTime)
        print_flush(print_string)
    # Final progress line showing the full sample count.
    print_string = '%d/ %d 花费时间:%.2f秒' %(test_sample_quantity, test_sample_quantity, usedTime)
    print_flush(print_string)
    Y = np.array(predict_Y_list)
    # Most probable class id per sample, converted back to label names.
    y = np.argmax(Y, axis=1)
    predict_label_list = labelEncoder.inverse_transform(y)
    return predict_label_list


# True labels of the test samples, in the same order as the predictions.
test_label_list = [label_list[k] for k in test_index_1d_array]
predict_label_list = predict_test()
# Confusion matrix as a table: rows are built from the true labels and columns from
# the predicted labels, both named after the encoder's classes.
pd.DataFrame(confusion_matrix(test_label_list, predict_label_list), 
             columns=labelEncoder.classes_,
             index=labelEncoder.classes_ )

209019/ 209019 花费时间:68.53秒

Unnamed: 0,体育,娱乐,家居,彩票,房产,教育,时尚,时政,星座,游戏,社会,科技,股票,财经
体育,32403,132,20,110,26,35,24,84,3,24,92,25,10,0
娱乐,109,21784,93,4,64,89,261,156,23,42,378,92,6,11
家居,18,78,7780,4,75,32,137,39,7,11,40,59,32,13
彩票,33,2,1,1877,1,0,0,4,1,3,13,2,0,0
房产,0,7,29,6,4780,13,12,19,0,3,30,8,57,14
教育,11,22,18,5,16,9943,41,109,11,12,238,28,12,8
时尚,0,11,22,2,0,4,3225,5,1,5,6,5,0,0
时政,61,69,20,5,84,175,39,14794,0,11,246,87,125,31
星座,1,0,0,0,0,1,1,0,905,0,1,0,0,0
游戏,8,10,11,2,4,14,30,6,1,5773,17,158,15,0


## 10.报告表

In [20]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision / recall / F1 / support report.

    Args:
        y_true: true labels of the evaluated samples.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: class labels to report, in the desired row order.

    Returns:
        A DataFrame with the columns Label, Precision, Recall, F1 and Support.
        It has one row per entry of ``labels`` and a final row (index 999,
        label '总体') with the support-weighted averages and the total support.
    """
    # Pass ``labels`` explicitly so the metric arrays follow exactly the same
    # classes and order as the Label column. Without it the metrics follow the
    # sorted labels found in the data, which may not match ``labels``.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels)
    # Overall scores: average of the per-class scores weighted by class support.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        'Label': labels,
        'Precision': p,
        'Recall': r,
        'F1': f1,
        'Support': s
    })
    res2 = pd.DataFrame({
        'Label': ['总体'],
        'Precision': [tot_p],
        'Recall': [tot_r],
        'F1': [tot_f1],
        'Support': [tot_s]
    })
    # Give the summary row a distinctive index so it stands apart from the class rows.
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

# Report table for the test set; the class names come from the fitted label encoder.
eval_model(test_label_list, predict_label_list, labelEncoder.classes_)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,体育,0.987896,0.982266,0.985073,32988
1,娱乐,0.962531,0.942541,0.952431,23112
2,家居,0.911861,0.934535,0.923059,8325
3,彩票,0.886213,0.969024,0.925771,1937
4,房产,0.811821,0.960225,0.879809,4978
5,教育,0.909532,0.949303,0.928992,10474
6,时尚,0.804239,0.981436,0.884046,3286
7,时政,0.860316,0.939481,0.898157,15747
8,星座,0.947644,0.9956,0.97103,909
9,游戏,0.853868,0.954373,0.901327,6049


## 11.模型保存

In [17]:
# Save the variables of the current session to a checkpoint at ckptFilePath.
# NOTE(review): presumably the '../resources/trained_model/' directory must already exist — confirm.
saver = tf.train.Saver()
ckptFilePath = '../resources/trained_model/textCnn.ckpt'
saver.save(session, ckptFilePath)

'../resources/trained_model/textCnn.ckpt'

## 12.模型加载

In [16]:
# Restore the variables saved at ckptFilePath into the current session.
# The graph must already be built and `session` must exist before this runs.
saver = tf.train.Saver()
ckptFilePath = '../resources/trained_model/textCnn.ckpt'
saver.restore(session, ckptFilePath)