### Poetry with RNN

##### Introduction and Preprocessing

本节参考 [斗大的熊猫](http://blog.topspeedsnail.com/archives/10542) ，使用 RNN 在唐诗数据集上进行 RNN 学习和生成

代码参考 [char-rnn](https://github.com/karpathy/char-rnn)

使用的数据集：[全唐诗(43030首)](https://pan.baidu.com/s/1o7QlUhO)

In [1]:
from collections import Counter
import codecs

class Poetry(object):
    def __init__(self, poetry_file='poetry.txt'):
        self.poetry_file = poetry_file
        print "parsing poetry file ..."
        self.poetrys = self.parse_datafile()
        print "creating lexicons ..."
        self.create_lexicon()

    def parse_datafile(self):
        poetrys = []
        with codecs.open(self.poetry_file, 'r', 'utf-8') as f:
            for line in f:
                try:
                    title, content = line.strip().split(':')
                    content = content.replace(' ', '')
                    if '_' in content or '(' in content or u'（' in content or u'《' in content or '[' in content:
                        continue
                    if len(content) < 5 or len(content) > 79:
                        continue
                    content = '[' + content + ']'
                    poetrys.append(content)
                except Exception, e:
                    print content
        sorted(poetrys, key=lambda l: len(l))
        print("Total poetrys number: {}".format(len(poetrys)))
        return poetrys

    def create_lexicon(self):
        all_words = []
        for p in self.poetrys:
            # 这里以字符为单位进行拆分，包括了前后缀 '[' 和 ']'
            all_words += [w for w in p]
        counter = Counter(all_words)
        # 按字符计数，按计数倒序
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        # 去按计数倒序排列的 words，得到 lexicon
        words, _ = zip(*count_pairs)

        # words 加上一个 ' ' 空格字符，形如 ['[', 'a', 'f', 'g', 'h', 'i', ']', ' ']
        self.words = words + (' ',)
        # 字符 ==> 索引 id，形如 {'[':0, 'a':1, 'f':2, 'g':3, 'h':4, 'i':5, ']':6, ' ':7}
        self.word_num_map = dict(zip(self.words, range(len(self.words))))
        # 把 poetrys 转为为索引形式: poetrys 形如 ['[afg]', '[aghi]']
        # 得到的结果 poetrys_vector 形如：[[0, 1, 2, 3, 6], [0, 1, 3, 4, 5, 6]]; 不在 map 中的词设为最后一位，对应 ' '
        self.poetrys_vector = [[self.word_num_map.get(w, len(self.words)) for w in p] for p in self.poetrys]

    def split_batch_data(self, batch_size=64):
        # total batch number = total peotrys number // batch size，截断，而不是 4 舍 5 入
        n_chunk = len(self.poetrys_vector) // batch_size
        x_batches = []
        y_batches = []
        # 前面的截断，保证了这里每个 batch 都是 batch_size 个 poetrys
        for i in range(n_chunk):
            start_index = i * batch_size
            end_index = start_index + batch_size
            batch = self.poetrys_vector[start_index: end_index]
            # batch 中最长的一个 poetry 的长度，注意这个不是所有 poetrys 的最长长度，而是 batch 的；故此不同 batch 此值会不同
            length = max(map(len, batch))
            # 长度不足的句子，使用空格对应的 index 来填补
            xdata = np.full((batch_size, length), self.word_num_map[' '], np.int32)
            for i in range(batch_size):
                xdata[i, :len(batch[i])] = batch[i]
            """
            xdata             ydata
            [6,2,4,6,9]       [2,4,6,9,9]
            [1,4,2,8,5]       [4,2,8,5,5]
            ydata 就是 xdata 中的每个句子向后错一位，也就是说通过 xdata 的句子的任何一个字预测其后面一个字
            当然，到了最后一个字，就只能预测它自己了
            """
            ydata = np.copy(xdata)
            ydata[:, :-1] = xdata[:, 1:]
            x_batches.append(xdata)
            y_batches.append(ydata)

        return x_batches, y_batches

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
tf.__version__

'0.9.0'

##### Part I. 探索数据集

In [4]:
poetry = Poetry('data/poetry.txt')
batch_size = 64
x_batches, y_batches = poetry.split_batch_data(batch_size=batch_size)
n_chunk = len(x_batches)
print len(x_batches), len(x_batches[0])
print len(y_batches), len(y_batches[0])
print n_chunk

parsing poetry file ...
[渐老风光不著人，花溪柳陌早逢春。近来行到门前少，趁暖闲眠似病人。]
Total poetrys number: 34646
creating lexicons ...
541 64
541 64
541


看到我们把文件中的有效唐诗分成 541 个 batch；每个 batch 中为 64 个唐诗文本，以及通过这些唐诗文本错位得到的下一个字的结果集

In [5]:
total_words = len(poetry.words)
unique_words = len(poetry.word_num_map)
print poetry.words[0: 10]
print total_words, unique_words, len(poetry.poetrys_vector)

(u'\uff0c', u'\u3002', u']', u'[', u'\u4e0d', u'\u4eba', u'\u5c71', u'\u98ce', u'\u65e5', u'\u65e0')
6110 6110 34646


看到，最多出现的字符为 (u'，', u'。', u']', u'[', u'不', u'人', u'山', u'风', u'日', u'无')

共计 6110 个字，共计 34646 首唐诗。  541 * 64 = 34624，说明由于截断的作用，最后有 22 首唐诗没有被收录


##### Part II. Layer definations

In [6]:
input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

In [9]:
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = tf.nn.rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.nn.rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = tf.nn.rnn_cell.BasicLSTMCell

    # num_units = rnn_size，是 rnn 输入的 length，后面有详解
    # state_is_tuple
    #    - If True, accepted and returned states are 2-tuples of the c_state and m_state. 
    #    - If False, they are concatenated along the column axis.
    # cell = cell_fun(rnn_size, state_is_tuple=True)    # tf version 0.9 will raise exception - LSTMStateTuple invalid type <type 'tuple'>, must be a string or Tensor.
    cell = cell_fun(rnn_size, state_is_tuple=False)
    # 默认为双层的 RNN (stacked RNN)
    # cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=False)
    # 注意第一个参数为 batch_size，就是说为 batch 中的每一条数据都准备一个 state; 这个其实就相当于 RNN 中的隐藏变量
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    scopename = 'rnnlm1'
    with tf.variable_scope(scopename):
        """
        定义一个 softmax 层，核心是根据一个字 (rnn_size 维) 去预测下一个字； rnn_size 下面介绍
        故此 W 为 rnn_size * len(poetry.words) 维度
        """
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(poetry.words)+1])
        softmax_b = tf.get_variable("softmax_b", [len(poetry.words)+1])
        """
        定义一个 embedding 层，前面我们知道词典共计 len(poetry.words) 个词，故此每个词可以表示为 len(poetry.words) 维的 one-hot 矢量
        但是我们想得到的是一个 rnn_size 维度的矢量，故此需要一个 look_up 矩阵，为 len(poetry.words) x rnn_size 维度
        
        然后，input_data 为 batch_size  x  len(max_length_in_batch) 维度的数据，也就是说共计 batch_size 条数据，每个数据都等长，长度为 batch 中最长的句子
        注意 tf.nn.embedding_lookup 要求 input_data 中的数据都是等长的，否则报错
        而且，input_data 中每条数据中的样子类似 [6,2,4,6,9] ，其中每个数字代表数据中的字对应到词典中的 index
        
        这样， tf.nn.embedding_lookup 就根据 index 去词典中 lookup，得到对应字的 embedded_vector
        故此，最后得到的结果为 batch_size x len(max_length_in_batch) x rnn_size 维度
        就是说共计 batch_size 条数据，每个数据中有 len(max_length_in_batch) 个字，每个字对应的嵌入矢量为 rnn_size 维
        
        最后，这里保守的取了 len(poetry.words) + 1，其实最后也用不到，没关系
        """
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [len(poetry.words)+1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)
        
    """
    embedding 层得到 input 接到这里，也就是说接在 RNN 层上
    注意 dynamic_rnn 会动态展开输入数据，提升效率，见 http://jialin114.wang/2016/09/02/dynamic-rnn/
    """
    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope=scopename)
    # 把输出转为 rnn_size 的矢量
    output = tf.reshape(outputs, [-1, rnn_size])
    # 这时才接入 softmax 层
    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

##### Part III. Trainning

In [None]:
logits, last_state, _, _, _ = neural_network()
# 转为 1 维
targets = tf.reshape(output_targets, [-1])
"""
通过 seq2seq 的 sequence_loss_by_example 来得到 loss
logits 为 batch_size x num_labels 的数组的数组，故此在外面加了个 []，就是说这个函数可以实现多个 batch 的同时计算 loss，只不过我们这里只计算一个 batch
targets 为一维序列化的结果，我们看到是在上一步中转化为 1 维的
tf.ones_like(targets, dtype=tf.float32) 是个一维权重，和 targets 有相同的维度 (xxx_like)
"""
loss = tf.nn.seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(poetry.words))
cost = tf.reduce_mean(loss)

learning_rate = tf.Variable(0.0, trainable=False)
# 取出可训练变量
tvars = tf.trainable_variables()
# 修正梯度值，用于控制梯度爆炸的问题
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)    # 这里报错  'NoneType' object has no attribute 'grad_context'
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(grads, tvars))

# optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver(tf.all_variables())
    
    for epoch in range(50):
        sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
        for batch in range(n_chunk):
            train_loss, _, _ = sess.run([cost, last_state, train_op], feed_dict={input_data: x_batches[batch], output_targets: y_batches[batch]})
            print("epoch {}, batch {}, train_loss {}".format(epoch, batch, train_loss))
        if epoch > 0 and epoch % 7 == 0:
            saver.save(sess, 'poetry.module', global_step=epoch)



epoch 0, batch 0, train_loss 8.74556541443
epoch 0, batch 1, train_loss 8.74578857422
epoch 0, batch 2, train_loss 7.99591207504
epoch 0, batch 3, train_loss 7.80856180191
epoch 0, batch 4, train_loss 6.79696178436
epoch 0, batch 5, train_loss 6.39533138275
epoch 0, batch 6, train_loss 7.56495714188
epoch 0, batch 7, train_loss 5.9849395752
epoch 0, batch 8, train_loss 5.21713018417
epoch 0, batch 9, train_loss 6.9343495369
epoch 0, batch 10, train_loss 5.70776367188
epoch 0, batch 11, train_loss 6.16476678848
epoch 0, batch 12, train_loss 5.43673849106
epoch 0, batch 13, train_loss 6.06676721573
epoch 0, batch 14, train_loss 4.97399902344
epoch 0, batch 15, train_loss 5.77471065521
epoch 0, batch 16, train_loss 5.15108203888
epoch 0, batch 17, train_loss 4.95245409012
epoch 0, batch 18, train_loss 4.48413991928
epoch 0, batch 19, train_loss 3.8879468441
epoch 0, batch 20, train_loss 4.79983282089
epoch 0, batch 21, train_loss 4.51987314224
epoch 0, batch 22, train_loss 5.1531496048
ep

拿着保存好的模型来生成

In [None]:
"""
先看看 np.cumsum 的定义 (累积和)
>>> a = np.array([[1,2,3], [4,5,6]])
>>> a
array([[1, 2, 3],
       [4, 5, 6]])
       
>>> np.cumsum(a)
array([ 1,  3,  6, 10, 15, 21])

>>> np.cumsum(a,axis=0)      # sum over rows for each of the 3 columns
array([[1, 2, 3],
       [5, 7, 9]])

>>> np.cumsum(a,axis=1)      # sum over columns for each of the 2 rows
array([[ 1,  3,  6],
       [ 4,  9, 15]])
"""
# 通过概率得到汉字
def to_word(prob):
    # 例如 [0.2, 0.3, 0.5] ==> [0.2, 0.5, 1.0]，这样，如果随机一个数为 0.7，就落在 0.5 和 1.0 之间，得到 1.0 的索引
    t = np.cumsum(prob)     
    s = np.sum(prob)
    sample = int(np.searchsorted(t, np.random.rand(1) * s))    # 找到插入点的索引
    return peotry.words[sample]

def gen_poetry(module_file):
    _, last_state, probs, cell, initial_state = neural_network()
    
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())        # 初始化参数
        saver = tf.train.Saver(tf.all_variables())     # 指定 saver 为所有的变量做保存
        saver.restore(sess, module_file)               # 这样，这个 restore 就是为所有的变量做恢复
        
        state_ = sess.run(cell.zero_state(1, tf.float32))    # 这里 1 为 batch_size，就是说只给一条数据初始化状态，训练时则是 batch_size 条数据初始化状态
        x = np.array([list(map(poetry.word_num_map.get, '['))])    # x 为只有一个元素的batch，这个元素为数组，初始化为起始符号 '[' 在词典中的索引，即 x = array([[$index]])
        # 这里使用前面生成的一个元素的 state_ 来替换 neural_network 函数中定义的 batch_size 个元素的 initial_state
        [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
        # 通过初始值 '[' 预测下一个字，循环，直到出现 ']' 结束符
        word = to_word(probs_)
        poem = ''
        while word != ']':
            poem += word
            x = np.zeros((1,1))
            # 再以 word 对应的索引为初始值，来预测下一个字
            x[0,0] = peotry.word_num_map[word]
            # 这里的 initial_state 已经转移为上一步得到的 state_ 了
            [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
            word = to_word(probs_)
        return poem    

In [None]:
print(gen_poetry('poetry.module-49'))

来个藏头诗看看

In [None]:
def gen_poetry_with_head(head， module_file):
    _, last_state, probs, cell, initial_state = neural_network()
 
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
 
        saver = tf.train.Saver(tf.all_variables())
        saver.restore(sess, module_file)
 
        state_ = sess.run(cell.zero_state(1, tf.float32))
        poem = ''
        i = 0
        for word in head:
            while word != '，' and word != '。':
                poem += word
                x = np.array([list(map(poetry.word_num_map.get, word))])
                [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
            if i % 2 == 0:
                poem += '，'
            else:
                poem += '。'
            i += 1
        return poem

In [None]:
print(gen_poetry_with_head(u'一二三四', 'poetry.module-49'))