In [None]:
# 回忆使用tensorflow构建神经网络的步骤：
# 构建计算图---LSTM模型:
#     embedding 层
#     LSTM 层
#     fc 全连接层
#     train_op
# 训练流程代码：
# 数据集封装：
#     api: next_batch(batch_size) 
# 词表封装：
#     api: sentence2id(text_sentence): 句子转id
# 类别的封装：
#    api:  category2id(text_category)

import tensorflow as tf
import os
import sys
import numpy as np
import math

# Configure logging: emit messages at INFO level and above through tf.logging.
tf.logging.set_verbosity(tf.logging.INFO)

In [None]:
def get_default_params():
    """Return the default hyper-parameters bundled in a single HParams object.

    Keeping the values in one object (rather than as loose globals) makes them
    easier to manage; callers read them as attributes, e.g. ``hps.batch_size``.
    The defaults are intentionally small so the notebook can run on a laptop;
    increase them for better results.
    """
    defaults = dict(
        num_embedding_size=16,
        num_timesteps=50,        # every sample is cut / padded to this many ids
        num_lstm_nodes=[32, 32],
        num_lstm_layers=2,
        num_fc_nodes=32,
        batch_size=100,
        clip_lstm_grads=1.0,     # global-norm ceiling used when clipping gradients
        learning_rate=0.001,
        num_word_threadhold=10,  # words with a lower frequency are left out of the vocab
    )
    return tf.contrib.training.HParams(**defaults)

hps = get_default_params()  # hyper-parameters, read as hps.<name>

# Segmented train / validation / test data, vocabulary and category tables,
# and the directory that receives the run's outputs.
train_file = './cnews_data/cnews.train.seg.txt'
val_file = './cnews_data/cnews.val.seg.txt'
test_file = './cnews_data/cnews.test.seg.txt'
vocab_file = './cnews_data/cnews.vocab.txt'
category_file = './cnews_data/cnews.category.txt'
output_dir = './cnews_data/run_text_rnn'

# makedirs(..., exist_ok=True) creates missing parent directories too and does
# not fail when the directory already exists (no check-then-create race).
os.makedirs(output_dir, exist_ok=True)

In [None]:
class Vocab:
    """Word-to-id vocabulary loaded from a tab-separated "word, frequency" file.

    Ids are assigned in file order, starting at 0, to every word whose
    frequency is at least ``num_word_threadhold``. The id of the ``<UNK>``
    entry is used for unknown words; if the file has no ``<UNK>`` entry (or it
    is below the threshold) the unknown id stays at -1.
    """

    def __init__(self, file_name, num_word_threadhold):
        """Load the vocabulary.

        Args:
            file_name: path of the UTF-8 vocabulary file, one
                ``word<TAB>frequency`` pair per line.
            num_word_threadhold: minimum frequency for a word to be kept.
        """
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threadhold = num_word_threadhold
        self._read_dict(file_name)

    def _read_dict(self, file_name):
        """Fill the word -> id table from ``file_name``."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line.strip():
                    # Blank lines carry no entry; skip instead of crashing.
                    continue
                word, frequency = line.split('\t')
                if int(frequency) < self._num_word_threadhold:
                    continue
                if word in self._word_to_id:
                    # Keep the first id; re-assigning len(dict) would leave a
                    # gap and can produce an id >= size().
                    continue
                idx = len(self._word_to_id)
                if word == '<UNK>':
                    self._unk = idx
                self._word_to_id[word] = idx

    def word_to_id(self, word):
        """Return the id of ``word``, or the unknown id if it is not in the table."""
        return self._word_to_id.get(word, self._unk)

    @property
    def unk(self):
        """Id used for unknown words (-1 if ``<UNK>`` was never loaded)."""
        return self._unk

    def size(self):
        """Number of words in the vocabulary."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return [self.word_to_id(cur_word) for cur_word in sentence.split()]
    
# Build the vocabulary and log its size as a quick sanity check.
vocab = Vocab(vocab_file, hps.num_word_threadhold)
vocab_size = vocab.size()
tf.logging.info('vocab size: %d' % vocab_size)

# test_str = '和 也 有 我'
# print(vocab.sentence_to_id(test_str))

# Output recorded by the original author:
# INFO:tensorflow:vocab size: 77323
# [10, 12, 13, 18]
# Category (label) dictionary
class CategoryDict:
    """Maps category names to consecutive integer ids (0, 1, 2, ...).

    The category file holds one category name per line; ids follow file order.
    """

    def __init__(self, category_file):
        """Load the categories from the UTF-8 file ``category_file``."""
        self._category_to_id = {}
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(category_file, 'r', encoding='utf-8') as f:
            for line in f:
                category = line.strip('\r\n')
                # Blank lines would add a bogus empty category, and a repeated
                # name would be re-assigned an id that leaves a gap (an id
                # >= size()); skip both so ids stay dense.
                if not category.strip() or category in self._category_to_id:
                    continue
                self._category_to_id[category] = len(self._category_to_id)

    def size(self):
        """Number of distinct categories."""
        return len(self._category_to_id)

    def category_to_id(self, category):
        """Return the id of ``category``.

        Raises:
            Exception: if ``category`` is not in the loaded list.
        """
        if category not in self._category_to_id:
            raise Exception('%s not in our category list.' % category)
        return self._category_to_id[category]

# Load the label dictionary, then log one sample lookup and the class count.
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()

category_name = '时尚'
tf.logging.info(
    'id of %s is %d' % (category_name, category_vocab.category_to_id(category_name)))
tf.logging.info('num_classes: %d' % num_classes)

In [None]:
# Dataset wrapper
class TextDataSet:
    """In-memory dataset of fixed-length word-id sequences with label ids.

    Each line of the input file is ``label<TAB>content`` where content is a
    whitespace-separated sentence. Every sentence is truncated to
    ``num_timesteps`` ids and, if shorter, padded with the vocabulary's
    unknown id. Samples are shuffled once after loading and again each time
    ``next_batch`` wraps around.
    """

    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        """Load and encode ``filename``.

        Args:
            filename: path of the UTF-8 data file.
            vocab: object providing ``sentence_to_id(text)`` and ``unk``.
            category_vocab: object providing ``category_to_id(label)``.
            num_timesteps: fixed length of every encoded sentence.
        """
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # matrix of word ids, one row per sample
        self._inputs = []
        # vector of label ids
        self._outputs = []
        self._indicator = 0  # index of the next sample to serve
        self._parse_file(filename)

    def _parse_file(self, filename):
        """Read the file and turn it into the inputs matrix / outputs vector."""
        tf.logging.info('Loading data from %s' % filename)
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line.strip():
                    # Blank lines hold no sample; skip instead of crashing.
                    continue
                # maxsplit=1: a tab inside the content must not break the
                # label / content unpacking.
                label, content = line.split('\t', 1)
                id_label = self._category_vocab.category_to_id(label)
                id_words = self._vocab.sentence_to_id(content)  # list of ids
                # Truncate to the fixed length ...
                id_words = id_words[0:self._num_timesteps]
                # ... and pad short sentences with the unknown id.
                num_padding = self._num_timesteps - len(id_words)
                id_words = id_words + [self._vocab.unk] * num_padding
                self._inputs.append(id_words)
                self._outputs.append(id_label)

        # Convert to arrays so both can be indexed by one permutation.
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        # Shuffle inputs and outputs in lockstep.
        self._random_shuffle()

    def _random_shuffle(self):
        """Apply the same random permutation to inputs and outputs."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` samples as ``(inputs, outputs)``.

        When fewer than ``batch_size`` samples remain, the leftover samples are
        skipped, the data is reshuffled and serving restarts from the beginning.

        Args:
            batch_size: number of samples per batch.

        Raises:
            Exception: if ``batch_size`` exceeds the number of samples.
        """
        end_indicator = self._indicator + batch_size
        # Reached the end: reshuffle and start a new pass.
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        # Still too large after restarting: the batch can never be filled.
        if end_indicator > len(self._inputs):
            raise Exception('batch size: %d is too large. ' % batch_size)

        batch_inputs = self._inputs[self._indicator : end_indicator]
        batch_outputs = self._outputs[self._indicator : end_indicator]
        # Advance the cursor; without this every call would return the same
        # first batch.
        self._indicator = end_indicator

        return batch_inputs, batch_outputs
    
# Encode the three data splits with the shared vocabulary and category tables.
train_dataset = TextDataSet(train_file, vocab, category_vocab, hps.num_timesteps)
val_dataset = TextDataSet(val_file, vocab, category_vocab, hps.num_timesteps)
test_dataset = TextDataSet(test_file, vocab, category_vocab, hps.num_timesteps)

# Preview one 2-sample batch per split: a (2, num_timesteps) matrix of word
# ids followed by a length-2 vector of category ids.
for _split in (train_dataset, val_dataset, test_dataset):
    print(_split.next_batch(2))


In [None]:
# Build the computation graph
def create_model(hps, vocab_size, num_classes):
    """Build the embedding -> stacked LSTM -> fully-connected classifier graph.

    Args:
        hps: hyper-parameter object; the attributes read here are
            num_timesteps, batch_size, num_embedding_size, num_lstm_nodes,
            num_lstm_layers, num_fc_nodes, clip_lstm_grads and learning_rate.
        vocab_size: number of rows of the embedding matrix.
        num_classes: number of output logits.

    Returns:
        A 3-tuple ``((inputs, outputs, keep_prob), (loss, accuracy),
        (train_op, global_step))``: the placeholders to feed, the metric
        tensors, and the training op with its step counter.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    # Word ids, shape (batch_size, num_timesteps), and label ids, shape (batch_size,).
    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # Dropout keep probability: the fraction kept; 1 - keep_prob is dropped.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # Counts the training steps applied so far.
    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)

    # ---- embedding layer ------------------------------------------------
    # Uniform initialisation in [-1.0, 1.0].
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    # variable_scope (unlike name_scope) can carry a default initializer.
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        # One embedding vector per vocabulary entry:
        # shape (vocab_size, num_embedding_size).
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32
        )
        # Replace every id by its vector:
        # [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)

    # ---- LSTM layers ----------------------------------------------------
    # scale = 1 / sqrt(input size + output size) / 3
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    # Uniform in [-scale, scale]. random_normal_initializer(-scale, scale)
    # would read the arguments as (mean, stddev) and centre weights on -scale.
    lstm_initializer = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_initializer):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = tf.contrib.rnn.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple=True)
            cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                output_keep_prob=keep_prob)
            cells.append(cell)

        # Stack the cells: each layer's output feeds the next layer.
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        initial_state = cell.zero_state(batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, num_lstm_nodes[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell, embed_inputs, initial_state=initial_state)
        # Output of the last time step.
        last = rnn_outputs[:, -1, :]

    # ---- fully-connected layers -----------------------------------------
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        # Keras layers do not pick up the scope's default initializer, so it
        # is passed explicitly; otherwise fc_init would have no effect.
        fc1 = tf.keras.layers.Dense(
                    hps.num_fc_nodes,
                    activation=tf.nn.relu,
                    kernel_initializer=fc_init,
                    name='fc1')(last)
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.keras.layers.Dense(
                    num_classes,
                    kernel_initializer=fc_init,
                    name='fc2')(fc1_dropout)

    # ---- loss and accuracy ----------------------------------------------
    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=logits, labels=outputs)
        loss = tf.reduce_mean(softmax_loss)
        # Predicted class = index of the largest logit (softmax is monotonic,
        # so applying it first would not change the argmax).
        y_pred = tf.argmax(logits, 1, output_type=tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        # Cast the booleans to float and average to get the accuracy.
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # ---- training op ----------------------------------------------------
    with tf.name_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info('variable name: %s' % var.name)
        # Clip gradients by their global norm to guard against explosion.
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), hps.clip_lstm_grads)
        # Apply the clipped gradients to all trainable variables.
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(
                    zip(grads, tvars), global_step=global_step)

    return (
        (inputs, outputs, keep_prob),  # placeholders to feed
        (loss, accuracy),              # metric tensors
        (train_op, global_step)
    )


# Build the graph once and unpack the handles used by the training loop.
placeholders, metrics, others = create_model(hps, vocab_size, num_classes)

inputs, outputs, keep_prob = placeholders  # placeholders to feed
loss, accuracy = metrics  # metric tensors to fetch
train_op, global_step = others  # update op and step counter



In [None]:
# Training loop

init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8 # keep probability fed to keep_prob while training
test_keep_prob_value = 1.0 # no dropout when testing

# Number of training steps to run
num_train_steps = 10000

# Accuracy figures noted by the original author (the code that would measure
# the valid / test numbers is not part of the lines shown here):
# Train: 100%
# valid: 92.7%
# test: 93%
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(hps.batch_size)
        # One optimisation step; also fetch the metrics and the step counter.
        outputs_val = sess.run([loss, accuracy, train_op, global_step],
                          feed_dict = {
                              inputs: batch_inputs,
                              outputs: batch_labels,
                              keep_prob: train_keep_prob_value,
                          })
        loss_val, accuracy_val, _, global_step_val = outputs_val
        # Log every 100 steps
        if global_step_val % 100 == 0:
            tf.logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f' % (global_step_val, loss_val,accuracy_val))
        
        
        