In [None]:
import tensorflow as tf
import numpy as np
import os
import sys
stdout = sys.stdout
reload(sys)
sys.stdout = stdout

import cPickle as pkl

In [None]:
class QACNN(object):
    """Pairwise CNN ranking model for question answering.

    A question, a positive answer and a negative answer are embedded with a
    shared embedding table, projected by a shared tanh hidden layer, encoded
    by a bank of convolution + max-pool filters, and compared with cosine
    similarity.  Training minimises a margin (hinge) loss that pushes the
    question/positive cosine above the question/negative cosine.

    ``config`` must provide: ``max_q_length``, ``max_a_length``,
    ``vocab_size``, ``embedding_size``, ``hidden_size``, ``filter_sizes``,
    ``num_filters``, ``l2_reg_lambda``, ``m`` (margin) and ``lr``.

    Regularisation terms and the hinge loss are accumulated in the graph
    collection ``'total_loss'``.
    """

    def __init__(self, config):
        """Build the whole graph: inputs, encoder, loss and train op."""
        self.config = config
        # Inputs.
        self.add_placeholders()
        # Each: [batch, sequence_length, embedding_size]
        q_embed, aplus_embed, aminus_embed = self.add_embeddings()
        # Each: [batch, sequence_length, hidden_size]
        self.h_q, self.h_ap, self.h_am = self.add_hl(q_embed, aplus_embed, aminus_embed)
        # Each: [batch, total_channels]
        real_pool_q, real_pool_ap, real_pool_am = self.add_model(self.h_q, self.h_ap, self.h_am)
        # Each: [batch]
        self.q_ap_cosine, self.q_am_cosine = self.calc_cosine(real_pool_q, real_pool_ap, real_pool_am)
        # Loss and accuracy.
        self.total_loss, self.loss, self.accu = self.add_loss_op(self.q_ap_cosine, self.q_am_cosine)
        # Training op.
        self.train_op = self.add_train_op(self.total_loss)

    def add_placeholders(self):
        """Create the input placeholders and the dynamic batch-size tensor."""
        # Question token ids.
        self.q = tf.placeholder(np.int32,
                shape=[None, self.config.max_q_length],
                name='Question')
        # Positive answer token ids.
        self.aplus = tf.placeholder(np.int32,
                shape=[None, self.config.max_a_length],
                name='PosAns')
        # Negative answer token ids.
        self.aminus = tf.placeholder(np.int32,
                shape=[None, self.config.max_a_length],
                name='NegAns')
        # Dropout keep probability; currently not connected to any op in
        # this class.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # Run-time batch size (a scalar tensor, not a Python int).
        self.batch_size = tf.shape(self.q)[0]

    def add_embeddings(self):
        """Look up word embeddings for the three inputs in a shared table.

        Returns:
            Tuple ``(q_embed, aplus_embed, aminus_embed)``.
        """
        with tf.variable_scope('embedding'):
            embeddings = tf.get_variable(
                'embeddings',
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=tf.uniform_unit_scaling_initializer())
            q_embed = tf.nn.embedding_lookup(embeddings, self.q)
            aplus_embed = tf.nn.embedding_lookup(embeddings, self.aplus)
            aminus_embed = tf.nn.embedding_lookup(embeddings, self.aminus)
            return q_embed, aplus_embed, aminus_embed

    def add_hl(self, q_embed, aplus_embed, aminus_embed):
        """Apply the shared tanh hidden layer to every token embedding.

        The batch dimension is left dynamic (``-1``) so that batches whose
        size differs from ``config.batch_size`` (e.g. the last batch of an
        epoch) can be fed.

        Returns:
            Tuple ``(h_q, h_ap, h_am)``, each reshaped to
            ``[-1, sequence_length, hidden_size]``.
        """
        with tf.variable_scope('HL'):
            W = tf.get_variable(
                'weights',
                shape=[self.config.embedding_size, self.config.hidden_size],
                initializer=tf.uniform_unit_scaling_initializer())
            b = tf.get_variable('biases', initializer=tf.constant(0.1, shape=[self.config.hidden_size]))

            def project(embed, seq_len):
                # Flatten tokens, project, then restore the sequence axis.
                flat = tf.reshape(embed, [-1, self.config.embedding_size])
                hidden = tf.nn.tanh(tf.matmul(flat, W) + b)
                return tf.reshape(hidden, [-1, seq_len, self.config.hidden_size])

            h_q = project(q_embed, self.config.max_q_length)
            h_ap = project(aplus_embed, self.config.max_a_length)
            h_am = project(aminus_embed, self.config.max_a_length)
            # L2 regularisation term for the projection weights.
            tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(W))
            return h_q, h_ap, h_am

    def _conv_pool(self, inputs, seq_len, filter_size, conv_W, conv_b, pool_b):
        """Convolve, ReLU, max-pool over all positions, add bias, tanh."""
        conv = tf.nn.relu(tf.nn.conv2d(inputs, conv_W, [1, 1, 1, 1], padding='VALID') + conv_b)
        # A VALID convolution leaves seq_len - filter_size + 1 positions;
        # pooling over all of them keeps one value per filter.
        pooled = tf.nn.max_pool(conv, [1, seq_len - filter_size + 1, 1, 1], [1, 1, 1, 1], padding='VALID')
        return tf.nn.tanh(pooled + pool_b)

    def add_model(self, h_q, h_ap, h_am):
        """Encode the three sequences with the convolutional filter bank.

        Convolution weights are shared between question and answers; the
        post-pooling bias is separate for questions (``pool_qb``) and
        answers (``pool_ab``).

        Returns:
            Tuple ``(real_pool_q, real_pool_ap, real_pool_am)``, each of
            shape ``[-1, len(filter_sizes) * num_filters]``.
        """
        pool_q = list()
        pool_ap = list()
        pool_am = list()
        # Add a trailing channel axis for conv2d.
        h_q = tf.reshape(h_q, [-1, self.config.max_q_length, self.config.hidden_size, 1])
        h_ap = tf.reshape(h_ap, [-1, self.config.max_a_length, self.config.hidden_size, 1])
        h_am = tf.reshape(h_am, [-1, self.config.max_a_length, self.config.hidden_size, 1])
        for filter_size in self.config.filter_sizes:
            with tf.variable_scope('filter{}'.format(filter_size)):
                # Filter weights and bias.
                conv1_W = tf.get_variable(
                    'W',
                    shape=[filter_size, self.config.hidden_size, 1, self.config.num_filters],
                    initializer=tf.truncated_normal_initializer(.0, .1))
                conv1_b = tf.get_variable('conv_b', initializer=tf.constant(0.1, shape=[self.config.num_filters]))
                # Pooling-layer biases, separate for questions and answers.
                pool_qb = tf.get_variable('pool_qb', initializer=tf.constant(0.1, shape=[self.config.num_filters]))
                pool_ab = tf.get_variable('pool_ab', initializer=tf.constant(0.1, shape=[self.config.num_filters]))

                pool_q.append(self._conv_pool(
                    h_q, self.config.max_q_length, filter_size, conv1_W, conv1_b, pool_qb))
                pool_ap.append(self._conv_pool(
                    h_ap, self.config.max_a_length, filter_size, conv1_W, conv1_b, pool_ab))
                pool_am.append(self._conv_pool(
                    h_am, self.config.max_a_length, filter_size, conv1_W, conv1_b, pool_ab))

                # L2 regularisation term for the filter weights.
                tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(conv1_W))

        total_channels = len(self.config.filter_sizes)*self.config.num_filters

        # tf.concat takes (values, axis) since TensorFlow 1.0; the rest of
        # this file already needs a 1.x release (tf.keras).
        real_pool_q = tf.reshape(tf.concat(values=pool_q, axis=3), [-1, total_channels])
        real_pool_ap = tf.reshape(tf.concat(values=pool_ap, axis=3), [-1, total_channels])
        real_pool_am = tf.reshape(tf.concat(values=pool_am, axis=3), [-1, total_channels])

        return real_pool_q, real_pool_ap, real_pool_am

    def calc_cosine(self, real_pool_q, real_pool_ap, real_pool_am):
        """Return the per-example cosine of the question with each answer.

        Returns:
            Tuple ``(q_ap_cosine, q_am_cosine)``; axis 1 is reduced away.
        """
        def norm(vec):
            # Euclidean length along the feature axis.
            return tf.sqrt(tf.reduce_sum(tf.square(vec), [1]))

        len_pool_q = norm(real_pool_q)
        len_pool_ap = norm(real_pool_ap)
        len_pool_am = norm(real_pool_am)

        # Python operators are used instead of tf.mul / tf.div, which were
        # removed in TensorFlow 1.0.
        q_ap_cosine = tf.reduce_sum(real_pool_q * real_pool_ap, [1]) / (len_pool_q * len_pool_ap)
        q_am_cosine = tf.reduce_sum(real_pool_q * real_pool_am, [1]) / (len_pool_q * len_pool_am)

        return q_ap_cosine, q_am_cosine

    def add_loss_op(self, q_ap_cosine, q_am_cosine):
        """Build the margin loss and the pairwise accuracy.

        The per-example loss is ``max(0, m - cos(q, a+) + cos(q, a-))``.
        The margin and zero are scalars that broadcast over the batch, so
        no constant with a run-time shape is needed.

        Returns:
            Tuple ``(total_loss, loss, accu)``: the sum of everything in the
            ``'total_loss'`` collection, the summed hinge loss alone, and
            the fraction of examples whose hinge loss is exactly zero.
        """
        l = tf.maximum(0., self.config.m - q_ap_cosine + q_am_cosine)
        loss = tf.reduce_sum(l)
        tf.add_to_collection('total_loss', loss)
        total_loss = tf.add_n(tf.get_collection('total_loss'))
        accu = tf.reduce_mean(tf.cast(tf.equal(l, 0.), tf.float32))
        return total_loss, loss, accu

    def add_train_op(self, loss):
        """Create the Adam training op and the ``global_step`` counter."""
        with tf.name_scope('train_op'):
            # Counts training steps; incremented by minimize().
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            opt = tf.train.AdamOptimizer(self.config.lr)
            train_op = opt.minimize(loss, global_step=self.global_step)
            return train_op

In [None]:
class Config(object):
    """Hyper-parameters and session settings for the QA CNN model."""

    def __init__(self, vocab_size):
        """Populate the defaults; only the vocabulary size is supplied."""
        # --- Data dimensions -------------------------------------------
        # Vocabulary size.
        self.vocab_size = vocab_size
        # Maximum question (sentence) length.
        self.max_q_length = 200
        # Maximum answer length.
        self.max_a_length = 200

        # --- Training schedule -----------------------------------------
        # Number of epochs.
        self.num_epochs = 100000
        # Batch size.
        self.batch_size = 100
        # Learning rate (the paper uses 0.01).
        self.lr = 0.01
        # Margin of the ranking loss (the paper uses 0.009).
        self.m = 0.05

        # --- Network shape ---------------------------------------------
        # Word-embedding dimension.
        self.embedding_size = 100
        # Hidden-layer width.
        self.hidden_size = 80
        # Filter widths, analogous to 1-gram, 2-gram, 3-gram and 5-gram.
        self.filter_sizes = [1, 2, 3, 5]
        # Number of filters per filter width.
        self.num_filters = 512

        # --- Regularisation --------------------------------------------
        # L2 coefficient; disabled because it did not help
        # (the paper uses 0.0001).
        self.l2_reg_lambda = 0.
        # Dropout keep probability; disabled because it did not help.
        self.keep_prob = 1.0

        # --- Session ---------------------------------------------------
        # Allow ops without a GPU kernel to fall back to the CPU and keep
        # device-placement logging off.
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        # Use only 20% of the GPU memory.
        session_conf.gpu_options.per_process_gpu_memory_fraction = 0.2
        self.cf = session_conf

In [None]:
# Locations of the raw and the preprocessed WikiQA data, relative to the
# working directory.
raw_data_path = '../data/WikiQA/raw'
processed_data_path = '../data/WikiQA/processed'

# Sequence lengths that questions and answers are padded/truncated to
# below; they also replace the defaults stored in Config.
max_q_length = 40
max_a_length = 40

def padding(data, max_len):
    """Pad or truncate every sequence in ``data`` to ``max_len`` items.

    Both padding and truncation are applied at the end of a sequence
    (``'post'``); every other option keeps the ``pad_sequences`` default.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(data, maxlen=max_len, padding='post', truncating='post')


class Iterator(object):
    """Mini-batch iterator over an array-like data set.

    The data is stored as ``np.asarray(x)`` and batches are slices along
    its first axis.
    """

    def __init__(self, x):
        """Store the data and record the number of samples."""
        self.x = np.asarray(x)
        self.sample_num = self.x.shape[0]

    def next_batch(self, batch_size):
        """Return one contiguous slice starting at a random offset.

        If ``batch_size`` exceeds the number of samples, the whole data set
        is returned instead of failing inside ``np.random.randint``.
        """
        batch_size = min(batch_size, self.sample_num)
        l = np.random.randint(0, self.sample_num - batch_size + 1)
        r = l + batch_size
        return self.x[l:r]

    def next(self, batch_size, shuffle=True):
        """Yield consecutive batches that together cover the data once.

        Args:
            batch_size: Maximum number of samples per batch; the final
                batch may be smaller.
            shuffle: When true (the default), shuffle the stored data in
                place before iterating.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))
        if shuffle:
            np.random.shuffle(self.x)
        # Stepping the start offset guarantees that the generator ends; the
        # slice end is clipped automatically for the last, shorter batch.
        for l in range(0, self.sample_num, batch_size):
            yield self.x[l:l + batch_size]


# NOTE: unpickling can execute arbitrary code; only load corpus and
# vocabulary files that come from a trusted source.
# Pickle files are binary, so they are opened in 'rb' mode.
with open(os.path.join(processed_data_path, 'pairwise_corpus.pkl'), 'rb') as fr:
    train_corpus, val_corpus, test_corpus = pkl.load(fr)

with open(os.path.join(processed_data_path, 'vocab.pkl'), 'rb') as fr:
    word2id, id2word = pkl.load(fr)

# Split the (question, positive answer, negative answer) triples into
# columns and pad each column to its fixed length.
train_q, train_ap, train_an = zip(*train_corpus)

train_q = padding(train_q, max_q_length)
train_ap = padding(train_ap, max_a_length)
train_an = padding(train_an, max_a_length)


config = Config(len(word2id))
config.max_q_length = max_q_length
config.max_a_length = max_a_length

# Materialise the zip so np.asarray inside Iterator always receives a list.
train_corpus = list(zip(train_q, train_ap, train_an))

iterator = Iterator(train_corpus)

with tf.Session(config=config.cf) as sess:
    model = QACNN(config)

    sess.run(tf.global_variables_initializer())
    # num_epochs is an int, so it has to be wrapped in range() to iterate.
    for epoch in range(config.num_epochs):
        count = 0
        for batch_x in iterator.next(config.batch_size):
            # zip(*...) yields tuples; convert each column back to an
            # array before feeding it.
            batch_q, batch_ap, batch_an = [np.asarray(part) for part in zip(*batch_x)]
            _, loss = sess.run([model.train_op, model.total_loss],
                               feed_dict={model.q: batch_q,
                                          model.aplus: batch_ap,
                                          model.aminus: batch_an,
                                          model.keep_prob: config.keep_prob})
            count += 1
            if count % 100 == 0:
                print('Loss:{}'.format(loss))
    