### Talking Bot with Seq2Seq RNN

本节参考 [斗大的熊猫](http://blog.topspeedsnail.com/archives/10735)，使用 Seq2Seq RNN 模型在中文对话语料上进行训练和对话生成。

代码参考

- [使用深度学习打造智能聊天机器人](http://blog.csdn.net/malefactor/article/details/51901115)
- [脑洞大开：基于美剧字幕的聊天语料库建设方案](http://www.shareditor.com/blogshow/?blogId=105)
- [Seq2Seq](https://www.tensorflow.org/versions/r0.12/tutorials/seq2seq/index.html)

语料来自 [中文对白语料](https://github.com/rustch3n/dgk_lost_conv)

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
tf.__version__

'0.9.0'

In [None]:
import random
import codecs
from collections import defaultdict as dd

"""
文件格式
E
M 畹/华/吾/侄/
M 你/接/到/这/封/信/的/时/候/
M 不/知/道/大/伯/还/在/不/在/人/世/了/
E
M 咱/们/梅/家/从/你/爷/爷/起/
M 就/一/直/小/心/翼/翼/地/唱/戏/
..........
M 就/因/为/没/穿/红/让/人/赏/咱/一/纸/枷/锁/
M 爷/您/别/给/我/戴/这/纸/枷/锁/呀/
E
..........
"""
# Location of the raw dialogue corpus.
datafile = './data/shooter/dgk_shooter_min.conv'

# Special tokens used to pad and delimit dialogues:
#   PAD / GO  - padding and start markers
#   EOS       - end of a dialogue
#   UNK       - stands in for characters missing from the vocabulary
PAD, GO, EOS, UNK = "__PAD__", "__GO__", "__EOS__", "__UNK__"

# The special tokens occupy the first four vocabulary slots, in this order.
START_VOCABULART = [PAD, GO, EOS, UNK]

# Vocabulary index of each special token (its position in the list above).
PAD_ID, GO_ID, EOS_ID, UNK_ID = range(len(START_VOCABULART))

# Maximum number of vocabulary entries kept, special tokens included.
VOCAB_SIZE = 5000
# Number of question/answer pairs held out as the test set.
TEST_SIZE = 8000

In [None]:
def get_convs(datafile):
    """Parse a dialogue corpus file into a list of conversations.

    The file consists of ``E`` lines, which close the conversation collected
    so far, and ``M <utterance>`` lines, which each carry one utterance whose
    tokens are separated by ``/``.  Every ``/`` is removed, so an utterance
    is returned as a plain string, e.g.::

        [[u'utterance 1', u'utterance 2', u'utterance 3'],
         [u'utterance 4', u'utterance 5'],
         ...]

    Args:
        datafile: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of conversations, each a non-empty list of utterance strings.
        Utterances after the last ``E`` line are not returned, because a
        conversation is only stored when an ``E`` line is reached.
    """
    convs = []
    conv = []
    with codecs.open(datafile, 'r', 'utf-8') as fp:
        for raw in fp:
            line = raw.strip().replace('/', '')
            if not line:
                continue
            marker = line[0]
            if marker == 'E':
                # End of a conversation: store it unless it is empty.
                if conv:
                    convs.append(conv)
                conv = []
            elif marker == 'M':
                # Take everything after the first space, so utterances that
                # themselves contain spaces are kept whole; an "M" line with
                # no payload yields an empty utterance instead of raising.
                conv.append(line.partition(' ')[2])
    print("total conversations: {}".format(len(convs)))
    return convs


In [None]:
def convs_to_qafile(convs):
    """Split conversations into question/answer pairs and write data files.

    Within a conversation, utterances at even positions become questions and
    the utterance following each of them becomes its answer.  This pairing is
    crude, so the quality of the resulting data is limited.  Conversations
    with fewer than two utterances are skipped, and the last utterance of an
    odd-length conversation is dropped.

    Files written under ``./data/shooter/``:

    * ``train.enc`` / ``train.dec`` - training questions / answers, one
      sentence per line.
    * ``test.enc`` / ``test.dec`` - the same for up to ``TEST_SIZE`` randomly
      chosen pairs.
    * ``vocab.enc`` / ``vocab.dec`` - ``START_VOCABULART`` followed by the
      characters of all questions / answers ordered by descending frequency,
      truncated to ``VOCAB_SIZE`` lines in total.

    Args:
        convs: List of conversations, each a list of utterance strings.
    """
    questions = []
    answers = []
    for conv in convs:
        # A single utterance cannot form a question/answer pair.
        if len(conv) < 2:
            continue
        # Ignore the unpaired last utterance of an odd-length conversation.
        paired = len(conv) - len(conv) % 2
        questions.extend(conv[0:paired:2])
        answers.extend(conv[1:paired:2])
    print("Total questions/answers: {}".format(len(questions)))

    words_enc = dd(int)
    words_dec = dd(int)

    # Hold out TEST_SIZE pairs for testing (all of them if fewer exist).
    # A set keeps the per-pair membership check O(1).
    test_count = min(TEST_SIZE, len(questions))
    test_index = set(random.sample(range(len(questions)), test_count))

    with codecs.open('./data/shooter/train.enc', 'w', 'utf-8') as train_enc, \
            codecs.open('./data/shooter/train.dec', 'w', 'utf-8') as train_dec, \
            codecs.open('./data/shooter/test.enc', 'w', 'utf-8') as test_enc, \
            codecs.open('./data/shooter/test.dec', 'w', 'utf-8') as test_dec:
        for i, (question, answer) in enumerate(zip(questions, answers)):
            # Count character frequencies separately for questions/answers.
            for w in question:
                words_enc[w] += 1
            for w in answer:
                words_dec[w] += 1

            # Route the pair to the test or the training files.
            if i in test_index:
                test_enc.write(question + '\n')
                test_dec.write(answer + '\n')
            else:
                train_enc.write(question + '\n')
                train_dec.write(answer + '\n')
            if i % 1000 == 0:
                print("{} qa pairs processed".format(i))

    vocab_targets = [
        (words_enc, './data/shooter/vocab.enc'),
        (words_dec, './data/shooter/vocab.dec'),
    ]
    for words, vocab_path in vocab_targets:
        # Special tokens first, then characters by descending frequency.
        ordered_vocab = START_VOCABULART + sorted(words, key=words.get, reverse=True)
        # Keep only the VOCAB_SIZE most common entries; the file is closed
        # (and therefore flushed) as soon as it has been written.
        with codecs.open(vocab_path, 'w', 'utf-8') as vocabfp:
            for w in ordered_vocab[:VOCAB_SIZE]:
                vocabfp.write(w + '\n')


In [None]:
def convert_to_vector(infile, vocabfile, outfile):
    """Convert each sentence of a text file into a line of vocabulary indices.

    Args:
        infile: UTF-8 text file holding one sentence per line.
        vocabfile: UTF-8 vocabulary file holding one token per line; a
            token's index is its zero-based line number.
        outfile: Path of the file to write.  It receives one line per input
            sentence, holding the space-separated indices of the sentence's
            characters.

    Characters that are not in the vocabulary are written as ``UNK_ID``.
    """
    with codecs.open(vocabfile, 'r', 'utf-8') as f:
        # Remove only the line terminator: a full strip() would reduce
        # whitespace tokens to '' so that they could never be matched.
        vocabs = {line.rstrip('\r\n'): idx for idx, line in enumerate(f)}

    with open(outfile, 'w') as outfp, codecs.open(infile, 'r', 'utf-8') as infp:
        for line in infp:
            # Map every character of the sentence to its vocabulary index.
            vec = [vocabs[w] if w in vocabs else UNK_ID for w in line.strip()]
            # Indices are separated by single spaces.
            outfp.write(" ".join(str(idx) for idx in vec) + '\n')


In [None]:
# Bucket definitions; s2s_read_data allocates one data list per entry.
# NOTE(review): each pair presumably holds (max encoder length,
# max decoder length) - confirm against the model code.
buckets = [
    (5, 10),
    (10, 15),
    (20, 25),
    (40, 50),
]

# Model/training hyperparameters.
# NOTE(review): meanings are inferred from the names only - confirm.
layer_size, num_layers, batch_size = 256, 3, 64

def s2s_read_data(enc_path, dec_path, max_size=None):
    data_set = [[] for _ in buckets]
