In [None]:
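# Build the computation graph
#     embedding
#     conv1d + global max-pooling (this notebook uses a CNN where the plan originally said LSTM)
#     fc
#     train_op
# Training loop code
# Dataset wrapper
#     api: next_batch(batch_size)
# Vocabulary wrapper:
#     api: sentence_to_id(text_sentence): convert a sentence to a list of word ids
# Category wrapper:
#     api: category_to_id(text_category).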

In [1]:
import os
import tensorflow as tf
import sys
import numpy as np
import math

# Set TensorFlow's log threshold to INFO so the tf.logging.info messages emitted later in this notebook are shown.
tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
def get_default_params():
    """Return the default hyper-parameters as a tf.contrib.training.HParams.

    Fields:
        num_embedding_size: width of each word-embedding vector.
        num_timesteps: number of token ids per sample (sequences are cut or
            padded to this length by TextDataSet).
        num_filters: number of conv1d output filters.
        num_kernel_size: conv1d kernel width.
        num_fc_nodes: units in the hidden dense layer.
        batch_size: samples per training batch.
        learning_rate: learning rate handed to the Adam optimizer.
        num_word_threshold: minimum frequency for a word to be kept by Vocab.
    """
    defaults = dict(
        num_embedding_size = 32,
        num_timesteps = 600,
        num_filters = 256,
        num_kernel_size = 3,
        num_fc_nodes = 64,
        batch_size = 100,
        learning_rate = 0.001,
        num_word_threshold = 10,
    )
    return tf.contrib.training.HParams(**defaults)


# Module-level hyper-parameter object shared by the cells below.
hps = get_default_params()

# Input files for the three data splits plus the vocabulary and category lists.
# NOTE(review): 'trian' in the training filename looks like a typo, but the
# recorded cell output shows this exact path being loaded, so it is kept as-is.
train_file = './cnews/cnews.trian.seg.txt'
val_file = './cnews/cnews.val.seg.txt'
test_file = './cnews/cnews.test.seg.txt'
vocab_file = './cnews/cnews.vocab.txt'
category_file = './cnews/cnews.category.txt'

# Output directory, created if missing. makedirs(exist_ok=True) also creates a
# missing './cnews' parent and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
output_file = './cnews/run_text_rnn'
os.makedirs(output_file, exist_ok=True)


In [3]:
class Vocab:
    """Word-to-id table loaded from a UTF-8 "word<TAB>frequency" file.

    Words whose frequency is below ``num_word_threshold`` are dropped; the
    remaining words receive consecutive ids in file order. The id of the
    '<UNK>' entry (-1 when the file has no such kept entry) is what
    ``word_to_id`` returns for words that are not in the table.
    """

    def __init__(self, filename, num_word_threshold):
        self._num_word_threshold = num_word_threshold
        self._unk = -1
        self._word_to_id = {}
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill the table from ``filename``, one "word<TAB>frequency" per line."""
        with open(filename, 'r', encoding = 'utf-8') as vocab_fh:
            raw_lines = vocab_fh.readlines()

        for raw in raw_lines:
            token, count = raw.strip('\r\n').split('\t')
            if int(count) >= self._num_word_threshold:
                # The next free id is the current table size.
                next_id = len(self._word_to_id)
                if token == '<UNK>':
                    self._unk = next_id
                self._word_to_id[token] = next_id

    def word_to_id(self, word):
        """Return the id of ``word``, or the unknown-word id if it is absent."""
        try:
            return self._word_to_id[word]
        except KeyError:
            return self._unk

    @property
    def unk(self):
        """Id used for unknown words."""
        return self._unk

    def size(self):
        """Number of words kept in the table."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return list(map(self.word_to_id, sentence.split()))
    

class CategoryDict:
    """Category-name to id mapping loaded from a UTF-8 file, one name per line.

    Ids are assigned consecutively (0, 1, 2, ...) in order of first appearance,
    so ``size()`` is always one more than the largest id handed out.
    """

    def __init__(self, filename):
        self._category_to_id = {}
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Read category names from ``filename`` into the mapping."""
        with open(filename, 'r', encoding = 'utf-8') as f:
            for line in f:
                category = line.strip('\t\r\n')
                # Skip blank lines (they would register '' as a class and
                # inflate size()) and repeated names (re-inserting would move
                # the category to a non-contiguous, out-of-range id).
                if not category or category in self._category_to_id:
                    continue
                self._category_to_id[category] = len(self._category_to_id)

    def size(self):
        """Number of distinct categories."""
        return len(self._category_to_id)

    def category_to_id(self, category):
        """Return the id of ``category``.

        Raises:
            ValueError: if ``category`` was not present in the category file.
        """
        if category not in self._category_to_id:
            raise ValueError("%s is not in our category list" % category)
        return self._category_to_id[category]
    
# Load the vocabulary, then log its size and a sample sentence -> ids conversion.
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()
tf.logging.info('vocab_size: %d', vocab_size)
test_str = '的 stes 在'
tf.logging.info(
    "label: %s, id: %s",
    test_str.split(" "),
    vocab.sentence_to_id(test_str))

# Load the category list, then log the class count and a sample lookup.
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('num_classes: %d', num_classes)
test_category = '时尚'
tf.logging.info(
    "label: %s, id: %d",
    test_category,
    category_vocab.category_to_id(test_category))

INFO:tensorflow:vocab_size: 77323
INFO:tensorflow:label: ['的', 'stes', '在'], id: [2, 0, 4]
INFO:tensorflow:num_classes: 10
INFO:tensorflow:label: 时尚, id: 5


In [4]:
class TextDataSet:
    """In-memory text-classification dataset serving fixed-length id batches.

    Every non-blank line of the input file is "label<TAB>content". The label is
    mapped through ``category_vacab.category_to_id`` and the content through
    ``vocab.sentence_to_id``; each id sequence is then truncated or padded
    (using ``vocab.unk`` as the padding id) to exactly ``num_timesteps`` ids.

    Args:
        filename: path of the UTF-8 data file.
        vocab: object providing ``sentence_to_id(str)`` and an ``unk`` id.
        category_vacab: object providing ``category_to_id(str)``.
        num_timesteps: fixed sequence length of every sample.
    """

    def __init__(self, filename, vocab, category_vacab, num_timesteps):
        self._vocab = vocab
        self._category_vacab = category_vacab
        self._num_timesteps = num_timesteps
        # matrix: one row of num_timesteps ids per sample
        self._inputs = []
        # vector: one label id per sample
        self._outputs = []
        # start offset of the next batch
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self, filename):
        """Read ``filename``, convert every sample to ids and shuffle once."""
        tf.logging.info("Loading data from %s" % filename)
        inputs, outputs = [], []
        with open(filename, 'r', encoding = 'utf-8') as f:
            for line in f:
                # Only the line terminator is removed: tabs are structural.
                line = line.rstrip('\r\n')
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                # Split on the first tab only so that tabs inside the content
                # do not break the label/content unpacking.
                label, content = line.split('\t', 1)
                id_label = self._category_vacab.category_to_id(label)
                id_words = self._vocab.sentence_to_id(content)
                # Force a uniform length so samples stack into one matrix.
                id_words = id_words[:self._num_timesteps]
                padding_num = self._num_timesteps - len(id_words)
                id_words = id_words + [self._vocab.unk] * padding_num
                inputs.append(id_words)
                outputs.append(id_label)
        self._inputs = np.asarray(inputs, dtype = np.int32)
        self._outputs = np.asarray(outputs, dtype = np.int32)
        self._random_shuffle()

    def _random_shuffle(self):
        """Apply one random permutation to inputs and labels alike."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def next_batch(self, batch_size):
        """Return the next ``(inputs, labels)`` batch of ``batch_size`` samples.

        When fewer than ``batch_size`` samples remain, the data is reshuffled
        and reading restarts from the beginning; the leftover tail samples are
        not served in that pass.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of samples.
        """
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise ValueError("batch_size: %d is too large" % batch_size)
        batch_inputs = self._inputs[self._indicator:end_indicator]
        batch_outputs = self._outputs[self._indicator:end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs
    
# Build one dataset per split (train, validation, test), in that order.
train_dataset, val_dataset, test_dataset = (
    TextDataSet(split_path, vocab, category_vocab, hps.num_timesteps)
    for split_path in (train_file, val_file, test_file))

# Print a two-sample batch from each split as a quick sanity check.
# Note that this advances each dataset's read position by two samples.
for sample_source in (train_dataset, val_dataset, test_dataset):
    print(sample_source.next_batch(2))

INFO:tensorflow:Loading data from ./cnews/cnews.trian.seg.txt
INFO:tensorflow:Loading data from ./cnews/cnews.val.seg.txt
INFO:tensorflow:Loading data from ./cnews/cnews.test.seg.txt
(array([[   0,   11, 5422, ...,    3, 3574,   18],
       [  15, 2275, 5490, ...,    0,    0,    0]]), array([3, 7]))
(array([[ 2537,  6258,  3906, ...,     0,     0,     0],
       [ 4353,  8040, 25982, ...,     0,     0,     0]]), array([3, 3]))
(array([[ 101,   23,  503, ...,    0,    0,    0],
       [  15, 5077,   72, ...,    0,    0,    0]]), array([3, 1]))


In [5]:
def create_model(hps, vocab_size, num_classes):
    """Build the text-CNN graph: embedding -> conv1d -> max-pool -> fc -> softmax loss.

    Args:
        hps: hyper-parameter object; reads num_timesteps, batch_size,
            num_embedding_size, num_filters, num_kernel_size, num_fc_nodes
            and learning_rate.
        vocab_size: number of rows of the embedding matrix.
        num_classes: number of output logits.

    Returns:
        A 3-tuple ``((inputs, outputs, keep_prob), (loss, accuracy),
        (train_op, global_step))``. ``inputs`` is an int32 placeholder of shape
        (batch_size, num_timesteps) and ``outputs`` an int32 placeholder of
        shape (batch_size,), so every fed batch must hold exactly
        ``hps.batch_size`` samples. ``keep_prob`` is the dropout keep
        probability placeholder.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # Keep probability used by dropout.
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    # Counter of completed training steps. It must be created with the boolean
    # False: the string 'False' is truthy and would register the counter as a
    # trainable variable.
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name = 'global_step', trainable = False)

    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope(
        'embedding', initializer = embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32)
        # [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)

    # Uniform init range derived from the conv layer's input and output widths.
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_filters) / 3.0
    cnn_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('cnn', initializer = cnn_init):
        # embed_inputs: [batch_size, num_timesteps, embed_size]
        # conv1d: [batch_size, num_timesteps - num_kernel_size + 1, num_filters]
        #         (tf.layers.conv1d defaults to 'valid' padding)
        conv1d = tf.layers.conv1d(
            embed_inputs,
            hps.num_filters,
            hps.num_kernel_size,
            activation = tf.nn.relu)
        # Max over the time axis -> [batch_size, num_filters]
        global_maxpooling = tf.reduce_max(conv1d, axis = [1])

    fc_init = tf.uniform_unit_scaling_initializer(factor = 1.0)
    with tf.variable_scope(
        'fc', initializer = fc_init):
        fc1 = tf.layers.dense(global_maxpooling,
                              hps.num_fc_nodes,
                              activation = tf.nn.relu,
                              name = 'fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout,
                                 num_classes,
                                 name = 'fc2')

    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = logits, labels = outputs)
        loss = tf.reduce_mean(softmax_loss)
        # [0, 1, 5, 4, 2] -> argmax: 2
        # Softmax is monotonic, so the argmax of the raw logits is the
        # predicted class; no explicit softmax is needed here.
        y_pred = tf.argmax(logits,
                           1,
                           output_type = tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    with tf.name_scope('train_op'):
        # minimize() increments global_step once per applied update.
        train_op = tf.train.AdamOptimizer(hps.learning_rate).minimize(
            loss, global_step = global_step)

    return ((inputs, outputs, keep_prob),
            (loss, accuracy),
            (train_op, global_step))

# Build the graph once and expose its tensors/ops as notebook-level names.
placeholders, metrics, others = create_model(hps, vocab_size, num_classes)

(inputs, outputs, keep_prob), (loss, accuracy), (train_op, global_step) = (
    placeholders, metrics, others)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [7]:
init_op = tf.global_variables_initializer()
# Dropout keep probabilities: 0.8 while training, 1.0 (no dropout) for evaluation.
train_keep_prob_value = 0.8
test_keep_prob_value = 1.0

num_train_steps = 10000

# Figures noted by the author (presumably accuracy in % -- TODO confirm):
# Train: 100
# Valid: 95.7
# Test: 95.3
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(hps.batch_size)
        train_feed = {
            inputs: batch_inputs,
            outputs: batch_labels,
            keep_prob: train_keep_prob_value,
        }
        # One optimisation step; fetch the metrics and the updated step counter.
        loss_val, accuracy_val, _, global_step_val = sess.run(
            [loss, accuracy, train_op, global_step],
            feed_dict = train_feed)
        # Report progress every 20 steps.
        if global_step_val % 20 != 0:
            continue
        tf.logging.info(
            "Step: %5d, loss: %3.3f, accuracy: %3.5f",
            global_step_val, loss_val, accuracy_val)
            

INFO:tensorflow:Step:    20, loss: 2.252, accuracy: 0.25000
INFO:tensorflow:Step:    40, loss: 2.144, accuracy: 0.29000
INFO:tensorflow:Step:    60, loss: 1.972, accuracy: 0.41000
INFO:tensorflow:Step:    80, loss: 1.693, accuracy: 0.54000
INFO:tensorflow:Step:   100, loss: 1.452, accuracy: 0.53000
INFO:tensorflow:Step:   120, loss: 1.121, accuracy: 0.68000
INFO:tensorflow:Step:   140, loss: 1.202, accuracy: 0.71000
INFO:tensorflow:Step:   160, loss: 0.850, accuracy: 0.78000
INFO:tensorflow:Step:   180, loss: 0.606, accuracy: 0.82000
INFO:tensorflow:Step:   200, loss: 0.743, accuracy: 0.77000
INFO:tensorflow:Step:   220, loss: 0.536, accuracy: 0.85000
INFO:tensorflow:Step:   240, loss: 0.427, accuracy: 0.89000
INFO:tensorflow:Step:   260, loss: 0.556, accuracy: 0.83000
INFO:tensorflow:Step:   280, loss: 0.534, accuracy: 0.87000
INFO:tensorflow:Step:   300, loss: 0.314, accuracy: 0.92000
INFO:tensorflow:Step:   320, loss: 0.487, accuracy: 0.86000
INFO:tensorflow:Step:   340, loss: 0.464

KeyboardInterrupt: 