diff --git a/._download.sh b/._download.sh
new file mode 100755
index 0000000..c830264
Binary files /dev/null and b/._download.sh differ
diff --git a/.gitignore b/.gitignore
index 1a31245..f3cd12a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,4 +103,7 @@ ENV/
 code/models/*
 code/logs/*
 data/*
-
+*/models/
+models/
+*.log
+*.DS_Store
diff --git a/README.md b/README.md
index 2192901..d1b5f93 100755
--- a/README.md
+++ b/README.md
@@ -9,7 +9,11 @@ WikiQA, TrecQA, InsuranceQA
 
 #### data preprocess on WikiQA
 
-`run preprocess_wiki.ipynb`
+
+```
+bash download.sh
+python preprocess_wiki.py
+```
 
 ### Pointwise Style
 
@@ -17,6 +21,8 @@ WikiQA, TrecQA, InsuranceQA
 This model is a simple implementation of a Siamese NN QA model in a pointwise way.
 
+[To this repo](./siamese_nn)
+
 ##### train model
 
 `python siamese.py --train --model NN`
 
@@ -29,6 +35,8 @@ This model is a simple complementation of a Siamese CNN QA model with a pointwise
 This model is a simple implementation of a Siamese CNN QA model in a pointwise way.
 
+[To this repo](./siamese_cnn)
+
 ##### train model
 
 `python siamese.py --train --model CNN`
 
@@ -41,6 +49,8 @@ This model is a simple complementation of a Siamese CNN QA model with a pointwis
 This model is a simple implementation of a Siamese RNN/LSTM/GRU QA model in a pointwise way.
 
+[To this repo](./siamese_rnn)
+
 ##### train model
 
 `python siamese.py --train --model RNN`
 
@@ -60,6 +70,10 @@ All these three models above are based on the vanilla siamese structure. You can
 Given a question, a positive answer and a negative answer, this pairwise model learns to rank the positive answer above the negative one.
 
+Refer to *Applying Deep Learning to Answer Selection: A Study and an Open Task*
+
+[To this repo](./qacnn)
+
 ##### train model
 
 `python qacnn.py --train`
 
@@ -68,11 +82,49 @@ Given a question, a positive answer and a negative answer, this pairwise model c
 `python qacnn.py --test`
 
-### Listwise Style
+### Listwise Style (can also be transformed to pointwise style)
 
-#### Compare-Aggregate model
+#### Decomposable Attention Model
 
-To be done
+Refer to *A Decomposable Attention Model for Natural Language Inference*
+
+[To this repo](./decomposable_att_model)
+
+##### train model
+
+`python decomp_att.py --train`
+
+##### test model
+
+`python decomp_att.py --test`
+
+#### Compare-Aggregate Model with Multi-Compare
+
+Refer to *A Compare-Aggregate Model for Matching Text Sequences*
+
+[To this repo](./seq_match_seq)
+
+##### train model
+
+`python seq_match_seq.py --train`
+
+##### test model
+
+`python seq_match_seq.py --test`
+
+#### BiMPM
+
+Refer to *Bilateral Multi-Perspective Matching for Natural Language Sentences*
+
+[To this repo](./bimpm)
+
+##### train model
+
+`python bimpm.py --train`
+
+##### test model
+
+`python bimpm.py --test`
 
 ## Machine Reading Comprehension
 
@@ -104,6 +156,12 @@ SQuAD, MS MARCO
 To be done
 
+#### QANet
+
+Refer to *QANet: Combining Local Convolution with Global Self-Attention for Reading Comprehension*
+
+[To this repo](./QANet)
+
 ### Answer Selection Style
 
 #### Dataset
 
@@ -112,4 +170,4 @@ RACE dataset
 
 ## Information
 
-For more information, please visit http://skyhigh233.com/blog/2018/04/26/cqa-intro/.
\ No newline at end of file
+For more information, please visit http://skyhigh233.com/blog/2018/04/26/cqa-intro/.
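The pointwise Siamese models above score a single question–answer pair with a contrastive loss over the cosine similarity of the two encodings, while the pairwise QACNN scores (question, positive answer, negative answer) triples with a max-margin ranking loss; both objectives can be seen in the (removed) `code/models.py` further down in this diff. The following is a minimal NumPy sketch of the two objectives, not part of the patch itself; `pos_weight` and `margin` stand in for the repository's `config.pos_weight` and `config.m`, and the default values used here are placeholders.

```
import numpy as np

def contrastive_loss(cos_qa, labels, pos_weight=0.25):
    # Pointwise objective: pull correct Q-A pairs toward cosine 1,
    # push incorrect pairs toward cosine <= 0.
    # pos_weight=0.25 is a placeholder for config.pos_weight.
    l_pos = pos_weight * np.square(1.0 - cos_qa)
    l_neg = np.square(np.maximum(cos_qa, 0.0))
    return np.mean(labels * l_pos + (1.0 - labels) * l_neg)

def pairwise_hinge_loss(cos_q_pos, cos_q_neg, margin=0.1):
    # Pairwise (QACNN-style) objective: the positive answer should beat
    # the negative answer by at least `margin` in cosine similarity.
    # margin=0.1 is a placeholder for config.m.
    return np.sum(np.maximum(0.0, margin - cos_q_pos + cos_q_neg))

# Toy batch of cosine similarities.
cos_qa = np.array([0.9, 0.2])
labels = np.array([1.0, 0.0])
print(contrastive_loss(cos_qa, labels))                        # small loss: both pairs scored correctly
print(pairwise_hinge_loss(np.array([0.9]), np.array([0.2])))   # 0.0: margin already satisfied
```

The listwise models added in this patch (Decomposable Attention, Compare-Aggregate, BiMPM) reuse the same data pipeline but predict a per-candidate relevance score with a softmax cross-entropy head, which is why the README notes they can also be trained in a pointwise fashion.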
diff --git a/bimpm/README.me b/bimpm/README.me new file mode 100755 index 0000000..edc608a --- /dev/null +++ b/bimpm/README.me @@ -0,0 +1,23 @@ +# 复现《Bilateral Multi-Perspective Matching for Natural Language Sentences》中的模型完成问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. +python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/bimpm/bimpm.py b/bimpm/bimpm.py new file mode 100755 index 0000000..f135df1 --- /dev/null +++ b/bimpm/bimpm.py @@ -0,0 +1,175 @@ +# -*- encoding:utf8 -*- +import tensorflow as tf +import numpy as np +import os +import sys +from copy import deepcopy +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import cPickle as pkl +from utils import * +from models import BiMPM + + +class BiMPMConfig(object): + def __init__(self, vocab_size, embeddings=None): + # 输入问题(句子)长度 + self.max_q_length = 200 + # 输入答案长度 + self.max_a_length = 200 + # 循环数 + self.num_epochs = 100 + # batch大小 + self.batch_size = 128 + # 词表大小 + self.vocab_size = vocab_size + # 词向量大小 + self.embeddings = embeddings + self.embedding_size = 100 + if self.embeddings is not None: + self.embedding_size = embeddings.shape[1] + # keep_prob=1-dropout + self.keep_prob = 0.6 + # 学习率 + self.lr = 0.0003 + self.grad_clip = 1 + + self.reg = 0 + self.mem_dim = 128 + self.cov_dim = 128 + self.filter_sizes = [2, 3, 4, 5] + self.comp_type = 'mul' + + self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 + + +def train(train_corpus, config, val_corpus, eval_train_corpus=None): + iterator = Iterator(train_corpus) + + with tf.Session(config=config.cf) as sess: + model = BiMPM(config) + saver = tf.train.Saver() + sess.run(tf.initialize_all_variables()) + for epoch in xrange(config.num_epochs): + count = 0 + for batch_x in iterator.next(config.batch_size, shuffle=True): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + labels = np.asarray(labels).astype(np.int32) + _, loss = sess.run([model.train_op, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:config.keep_prob}) + count += 1 + if count % 10 == 0: + print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) + if eval_train_corpus is not None: + train_res = evaluate(sess, model, eval_train_corpus, config) + print('[train] ' + train_res) + if val_corpus is not None: + val_res = evaluate(sess, model, val_corpus, config) + print('[eval] ' + val_res) + + +def evaluate(sess, model, corpus, config): + iterator = Iterator(corpus) + + count = 0 + total_qids = [] + total_aids = [] + total_pred = [] + total_labels = [] + total_loss = 0. 
+ for batch_x in iterator.next(config.batch_size, shuffle=False): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + y_hat, loss = sess.run([model.y_hat, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:1.}) + y_hat = np.argmax(y_hat, axis=-1) + total_loss += loss + count += 1 + total_qids.append(batch_qids) + total_aids.append(batch_aids) + total_pred.append(y_hat) + total_labels.append(labels) + # print(batch_qids[0], [id2word[_] for _ in batch_q[0]], + # batch_aids[0], [id2word[_] for _ in batch_ap[0]]) + total_qids = np.concatenate(total_qids, axis=0) + total_aids = np.concatenate(total_aids, axis=0) + total_pred = np.concatenate(total_pred, axis=0) + total_labels = np.concatenate(total_labels, axis=0) + MAP, MRR = eval_map_mrr(total_qids, total_aids, total_pred, total_labels) + # print('Eval loss:{}'.format(total_loss / count)) + return 'MAP:{}, MRR:{}'.format(MAP, MRR) + + +def test(corpus, config): + with tf.Session(config=config.cf) as sess: + model = BiMPM(config) + saver = tf.train.Saver() + saver.restore(sess, tf.train.latest_checkpoint(model_path)) + print('[test] ' + evaluate(sess, model, corpus, config)) + + +def main(args): + max_q_length = 25 + max_a_length = 90 + + with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'r') as fr: + train_corpus, val_corpus, test_corpus = pkl.load(fr) + + embeddings = build_embedding(embedding_path, word2id) + + train_qids, train_q, train_aids, train_ap, train_labels = zip(*train_corpus) + train_q = padding(train_q, max_q_length) + train_ap = padding(train_ap, max_a_length) + train_corpus = zip(train_qids, train_q, train_aids, train_ap, train_labels) + + + val_qids, val_q, val_aids, val_ap, labels = zip(*val_corpus) + val_q = padding(val_q, max_q_length) + val_ap = padding(val_ap, max_a_length) + val_corpus = zip(val_qids, val_q, val_aids, val_ap, labels) + + + test_qids, test_q, test_aids, test_ap, labels = zip(*test_corpus) + test_q = padding(test_q, max_q_length) + test_ap = padding(test_ap, max_a_length) + test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) + + config = BiMPMConfig(max(word2id.values()) + 1, embeddings=embeddings) + config.max_q_length = max_q_length + config.max_a_length = max_a_length + if args.train: + train(deepcopy(train_corpus), config, val_corpus, deepcopy(train_corpus)) + elif args.test: + test(test_corpus, config) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--train", help="whether to train", action='store_true') + parser.add_argument("--test", help="whether to test", action='store_true') + args = parser.parse_args() + + raw_data_path = '../data/WikiQA/raw' + processed_data_path = '../data/WikiQA/processed' + embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' + + with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: + word2id, id2word = pkl.load(fr) + main(args) diff --git a/bimpm/models.py b/bimpm/models.py new file mode 100755 index 0000000..804d6a1 --- /dev/null +++ b/bimpm/models.py @@ -0,0 +1,754 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +from tensorflow.python.ops import nn_ops +import numpy as np + + +class BiMPM(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + # 上下文编码 + q_encode, 
a_encode = self.context_encoding(q_embed, a_embed) + # attention层 + h_a = self.attend(q_encode, a_encode) + # compose层 + t = self.compare(a_encode, h_a) + # aggregate层 + agg_out = self.aggregate(t) + pred = self.soft_out(agg_out) + # 预测概率分布与损失 + self.y_hat, self.total_loss = self.add_loss_op(pred) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.int32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, + name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', + shape=[self.config.vocab_size, self.config.embedding_size], + initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + return q_embed, a_embed + + def context_encoding(self, q, a): + """ + q: [batch_size, q_length, embedding_dim] + a: [batch_size, a_length, embedding_dim] + """ + with tf.variable_scope('context_encoding') as scope: + q_encode = self.proj_layer(q, 'proj_layer', reuse=None) + a_encode = self.proj_layer(a, 'proj_layer', reuse=True) + return q_encode, a_encode + + + def attend(self, q, a): + """ + q: [batch_size, q_length, represent_dim] + a: [batch_size, a_length, represent_dim] + """ + q_proj = self.mlp(q, self.config.mem_dim, 1, None, + 'att_q_proj', reuse=None) + # [batch_size, q_length, a_length] + att_inner_product = tf.matmul(q_proj, tf.transpose(a, (0, 2, 1))) + # [batch_size, a_length, q_length] + q_weights = tf.nn.softmax( + tf.transpose( + att_inner_product, (0, 2, 1)), dim=-1) + output_a = tf.matmul(q_weights, q) + return output_a + + def compare(self, a, h_a): + """ + a: [batch_size, a_length, mem_dim] + a_att: [batch_size, a_length, mem_dim] + """ + if self.config.comp_type == 'mul': + out = a * h_a + else: + raise ValueError('{} method is not implemented!'.format( + self.config.comp_type)) + + return out + + def aggregate(self, t): + """ + t: [batch_size, a_length, mem_dim] + """ + pool_t = [] + for i, filter_size in enumerate(self.config.filter_sizes): + with tf.variable_scope('filter{}'.format(filter_size)): + # 卷积 + out_t = tf.layers.Conv1D(self.config.cov_dim, + filter_size, + strides=1, + padding='valid', + activation=tf.nn.relu, name='conv')(t) + # 池化 + out_t = tf.layers.MaxPooling1D( + self.config.max_a_length - filter_size + 1, + 1, name='max_pool')(out_t) + out_t = tf.reshape(out_t, + (tf.shape(out_t)[0], out_t.get_shape().as_list()[2])) + pool_t.append(out_t) + # [batch_size, n * mem_dim] + out = tf.concat(pool_t, axis=-1) + # [batch_size, mem_dim] + out = self.mlp(out, self.config.mem_dim, 1, + tf.nn.tanh, 'pre_out', use_dropout=False, reuse=None) + return out + + def soft_out(self, x): + out = self.mlp(x, 2, 1, None, + 'soft_out', use_dropout=False, reuse=None) + return out + + def mlp(self, bottom, size, layer_num, activation, name, use_dropout=True, reuse=None): + """ + bottom: 上层输入 + size: 神经元大小 + layer_num: 神经网络层数 + name: mlp的名称 + reuse: 是否复用层 + """ + now = bottom + if use_dropout: + now = tf.nn.dropout(now, 
keep_prob=self.keep_prob) + for i in xrange(layer_num): + now = tf.layers.dense(now, size, + activation=activation, + name=name + '_{}'.format(i), + reuse=reuse) + return now + + def proj_layer(self, seq, name, reuse=None): + out1 = self.mlp(seq, self.config.mem_dim, 1, + tf.nn.sigmoid, name + '_sigmoid', reuse=reuse) + out2 = self.mlp(seq, self.config.mem_dim, 1, + tf.nn.tanh, name + '_tanh', reuse=reuse) + out = out1 * out2 + return out + + def add_loss_op(self, pred): + """ + 损失节点 + """ + # [batch_size, 2] + y_hat = tf.nn.softmax(pred, dim=-1) + loss = tf.reduce_mean( + tf.losses.sparse_softmax_cross_entropy(self.y, pred)) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return y_hat, total_loss + + def add_train_op(self, loss): + """ + 训练节点 + """ + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, + name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + # train_op = opt.minimize(loss, self.global_step) + train_variables = tf.trainable_variables() + grads_vars = opt.compute_gradients(loss, train_variables) + for i, (grad, var) in enumerate(grads_vars): + grads_vars[i] = ( + tf.clip_by_norm(grad, self.config.grad_clip), var) + train_op = opt.apply_gradients( + grads_vars, global_step=self.global_step) + return train_op + + + + + + +# 以下代码参考https://github.com/zhiguowang/BiMPM/blob/master/src/layer_utils.py + +def my_lstm_layer(input_reps, lstm_dim, input_lengths=None, scope_name=None, reuse=False, is_training=True, + dropout_rate=0.2, use_cudnn=True): + ''' + :param inputs: [batch_size, seq_len, feature_dim] + :param lstm_dim: + :param scope_name: + :param reuse: + :param is_training: + :param dropout_rate: + :return: + ''' + input_reps = dropout_layer(input_reps, dropout_rate, is_training=is_training) + with tf.variable_scope(scope_name, reuse=reuse): + if use_cudnn: + inputs = tf.transpose(input_reps, [1, 0, 2]) + lstm = tf.contrib.cudnn_rnn.CudnnLSTM(1, lstm_dim, direction="bidirectional", + name="{}_cudnn_bi_lstm".format(scope_name), dropout=dropout_rate if is_training else 0) + outputs, _ = lstm(inputs) + outputs = tf.transpose(outputs, [1, 0, 2]) + f_rep = outputs[:, :, 0:lstm_dim] + b_rep = outputs[:, :, lstm_dim:2*lstm_dim] + else: + context_lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim) + context_lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim) + if is_training: + context_lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_fw, output_keep_prob=(1 - dropout_rate)) + context_lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_bw, output_keep_prob=(1 - dropout_rate)) + context_lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_fw]) + context_lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_bw]) + + (f_rep, b_rep), _ = tf.nn.bidirectional_dynamic_rnn( + context_lstm_cell_fw, context_lstm_cell_bw, input_reps, dtype=tf.float32, + sequence_length=input_lengths) # [batch_size, question_len, context_lstm_dim] + outputs = tf.concat(axis=2, values=[f_rep, b_rep]) + return (f_rep,b_rep, outputs) + +def dropout_layer(input_reps, dropout_rate, is_training=True): + if is_training: + output_repr = tf.nn.dropout(input_reps, (1 - dropout_rate)) + else: + output_repr = input_reps + return output_repr + +def cosine_distance(y1,y2, cosine_norm=True, eps=1e-6): + # cosine_norm = True + # y1 [....,a, 1, d] + # y2 [....,1, b, d] + cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1) + if not cosine_norm: + return 
tf.tanh(cosine_numerator) + y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps)) + y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps)) + return cosine_numerator / y1_norm / y2_norm + +def euclidean_distance(y1, y2, eps=1e-6): + distance = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1 - y2), axis=-1), eps)) + return distance + +def cross_entropy(logits, truth, mask=None): + # logits: [batch_size, passage_len] + # truth: [batch_size, passage_len] + # mask: [batch_size, passage_len] + if mask is not None: logits = tf.multiply(logits, mask) + xdev = tf.subtract(logits, tf.expand_dims(tf.reduce_max(logits, 1), -1)) + log_predictions = tf.subtract(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev),-1)),-1)) + result = tf.multiply(truth, log_predictions) # [batch_size, passage_len] + if mask is not None: result = tf.multiply(result, mask) # [batch_size, passage_len] + return tf.multiply(-1.0,tf.reduce_sum(result, -1)) # [batch_size] + +def projection_layer(in_val, input_size, output_size, activation_func=tf.tanh, scope=None): + # in_val: [batch_size, passage_len, dim] + input_shape = tf.shape(in_val) + batch_size = input_shape[0] + passage_len = input_shape[1] +# feat_dim = input_shape[2] + in_val = tf.reshape(in_val, [batch_size * passage_len, input_size]) + with tf.variable_scope(scope or "projection_layer"): + full_w = tf.get_variable("full_w", [input_size, output_size], dtype=tf.float32) + full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) + outputs = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b)) + outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) + return outputs # [batch_size, passage_len, output_size] + +def highway_layer(in_val, output_size, activation_func=tf.tanh, scope=None): + # in_val: [batch_size, passage_len, dim] + input_shape = tf.shape(in_val) + batch_size = input_shape[0] + passage_len = input_shape[1] +# feat_dim = input_shape[2] + in_val = tf.reshape(in_val, [batch_size * passage_len, output_size]) + with tf.variable_scope(scope or "highway_layer"): + highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32) + highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32) + full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32) + full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) + trans = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b)) + gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b)) + outputs = tf.add(tf.multiply(trans, gate), tf.multiply(in_val, tf.subtract(1.0, gate)), "y") + outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) + return outputs + +def multi_highway_layer(in_val, output_size, num_layers, activation_func=tf.tanh, scope_name=None, reuse=False): + with tf.variable_scope(scope_name, reuse=reuse): + for i in xrange(num_layers): + cur_scope_name = scope_name + "-{}".format(i) + in_val = highway_layer(in_val, output_size,activation_func=activation_func, scope=cur_scope_name) + return in_val + +def collect_representation(representation, positions): + # representation: [batch_size, node_num, feature_dim] + # positions: [batch_size, neigh_num] + return collect_probs(representation, positions) + +def collect_final_step_of_lstm(lstm_representation, lengths): + # lstm_representation: [batch_size, passsage_length, dim] + # lengths: [batch_size] + lengths = tf.maximum(lengths, tf.zeros_like(lengths, dtype=tf.int32)) + + batch_size = 
tf.shape(lengths)[0] + batch_nums = tf.range(0, limit=batch_size) # shape (batch_size) + indices = tf.stack((batch_nums, lengths), axis=1) # shape (batch_size, 2) + result = tf.gather_nd(lstm_representation, indices, name='last-forwar-lstm') + return result # [batch_size, dim] + +def collect_probs(probs, positions): + # probs [batch_size, chunks_size] + # positions [batch_size, pair_size] + batch_size = tf.shape(probs)[0] + pair_size = tf.shape(positions)[1] + batch_nums = tf.range(0, limit=batch_size) # shape (batch_size) + batch_nums = tf.reshape(batch_nums, shape=[-1, 1]) # [batch_size, 1] + batch_nums = tf.tile(batch_nums, multiples=[1, pair_size]) # [batch_size, pair_size] + + indices = tf.stack((batch_nums, positions), axis=2) # shape (batch_size, pair_size, 2) + pair_probs = tf.gather_nd(probs, indices) + # pair_probs = tf.reshape(pair_probs, shape=[batch_size, pair_size]) + return pair_probs + + +def calcuate_attention(in_value_1, in_value_2, feature_dim1, feature_dim2, scope_name='att', + att_type='symmetric', att_dim=20, remove_diagnoal=False, mask1=None, mask2=None, is_training=False, dropout_rate=0.2): + input_shape = tf.shape(in_value_1) + batch_size = input_shape[0] + len_1 = input_shape[1] + len_2 = tf.shape(in_value_2)[1] + + in_value_1 = dropout_layer(in_value_1, dropout_rate, is_training=is_training) + in_value_2 = dropout_layer(in_value_2, dropout_rate, is_training=is_training) + with tf.variable_scope(scope_name): + # calculate attention ==> a: [batch_size, len_1, len_2] + atten_w1 = tf.get_variable("atten_w1", [feature_dim1, att_dim], dtype=tf.float32) + if feature_dim1 == feature_dim2: atten_w2 = atten_w1 + else: atten_w2 = tf.get_variable("atten_w2", [feature_dim2, att_dim], dtype=tf.float32) + atten_value_1 = tf.matmul(tf.reshape(in_value_1, [batch_size * len_1, feature_dim1]), atten_w1) # [batch_size*len_1, feature_dim] + atten_value_1 = tf.reshape(atten_value_1, [batch_size, len_1, att_dim]) + atten_value_2 = tf.matmul(tf.reshape(in_value_2, [batch_size * len_2, feature_dim2]), atten_w2) # [batch_size*len_2, feature_dim] + atten_value_2 = tf.reshape(atten_value_2, [batch_size, len_2, att_dim]) + + + if att_type == 'additive': + atten_b = tf.get_variable("atten_b", [att_dim], dtype=tf.float32) + atten_v = tf.get_variable("atten_v", [1, att_dim], dtype=tf.float32) + atten_value_1 = tf.expand_dims(atten_value_1, axis=2, name="atten_value_1") # [batch_size, len_1, 'x', feature_dim] + atten_value_2 = tf.expand_dims(atten_value_2, axis=1, name="atten_value_2") # [batch_size, 'x', len_2, feature_dim] + atten_value = atten_value_1 + atten_value_2 # + tf.expand_dims(tf.expand_dims(tf.expand_dims(atten_b, axis=0), axis=0), axis=0) + atten_value = nn_ops.bias_add(atten_value, atten_b) + atten_value = tf.tanh(atten_value) # [batch_size, len_1, len_2, feature_dim] + atten_value = tf.reshape(atten_value, [-1, att_dim]) * atten_v # tf.expand_dims(atten_v, axis=0) # [batch_size*len_1*len_2, feature_dim] + atten_value = tf.reduce_sum(atten_value, axis=-1) + atten_value = tf.reshape(atten_value, [batch_size, len_1, len_2]) + else: + atten_value_1 = tf.tanh(atten_value_1) + # atten_value_1 = tf.nn.relu(atten_value_1) + atten_value_2 = tf.tanh(atten_value_2) + # atten_value_2 = tf.nn.relu(atten_value_2) + diagnoal_params = tf.get_variable("diagnoal_params", [1, 1, att_dim], dtype=tf.float32) + atten_value_1 = atten_value_1 * diagnoal_params + atten_value = tf.matmul(atten_value_1, atten_value_2, transpose_b=True) # [batch_size, len_1, len_2] + + # normalize + if remove_diagnoal: + 
diagnoal = tf.ones([len_1], tf.float32) # [len1] + diagnoal = 1.0 - tf.diag(diagnoal) # [len1, len1] + diagnoal = tf.expand_dims(diagnoal, axis=0) # ['x', len1, len1] + atten_value = atten_value * diagnoal + if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1)) + if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1)) + atten_value = tf.nn.softmax(atten_value, name='atten_value') # [batch_size, len_1, len_2] + if remove_diagnoal: atten_value = atten_value * diagnoal + if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1)) + if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1)) + + return atten_value + +def weighted_sum(atten_scores, in_values): + ''' + :param atten_scores: # [batch_size, len1, len2] + :param in_values: [batch_size, len2, dim] + :return: + ''' + return tf.matmul(atten_scores, in_values) + +def cal_relevancy_matrix(in_question_repres, in_passage_repres): + in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim] + in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim] + relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len] + return relevancy_matrix + +def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask): + # relevancy_matrix: [batch_size, passage_len, question_len] + # question_mask: [batch_size, question_len] + # passage_mask: [batch_size, passsage_len] + if question_mask is not None: + relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1)) + relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2)) + return relevancy_matrix + +def compute_gradients(tensor, var_list): + grads = tf.gradients(tensor, var_list) + return [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(var_list, grads)] + + + + + +# 以下代码参考https://github.com/zhiguowang/BiMPM/blob/master/src/match_utils.py + +eps = 1e-6 +def cosine_distance(y1,y2): + # y1 [....,a, 1, d] + # y2 [....,1, b, d] + cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1) + y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps)) + y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps)) + return cosine_numerator / y1_norm / y2_norm + +def cal_relevancy_matrix(in_question_repres, in_passage_repres): + in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim] + in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim] + relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len] + return relevancy_matrix + +def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask): + # relevancy_matrix: [batch_size, passage_len, question_len] + # question_mask: [batch_size, question_len] + # passage_mask: [batch_size, passsage_len] + relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1)) + relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2)) + return relevancy_matrix + +def multi_perspective_expand_for_3D(in_tensor, decompose_params): + in_tensor = tf.expand_dims(in_tensor, axis=2) #[batch_size, passage_len, 'x', dim] + decompose_params = tf.expand_dims(tf.expand_dims(decompose_params, axis=0), axis=0) # [1, 1, 
decompse_dim, dim] + return tf.multiply(in_tensor, decompose_params)#[batch_size, passage_len, decompse_dim, dim] + +def multi_perspective_expand_for_2D(in_tensor, decompose_params): + in_tensor = tf.expand_dims(in_tensor, axis=1) #[batch_size, 'x', dim] + decompose_params = tf.expand_dims(decompose_params, axis=0) # [1, decompse_dim, dim] + return tf.multiply(in_tensor, decompose_params) # [batch_size, decompse_dim, dim] + + +def cal_maxpooling_matching(passage_rep, question_rep, decompose_params): + # passage_representation: [batch_size, passage_len, dim] + # qusetion_representation: [batch_size, question_len, dim] + # decompose_params: [decompose_dim, dim] + + def singel_instance(x): + p = x[0] + q = x[1] + # p: [pasasge_len, dim], q: [question_len, dim] + p = multi_perspective_expand_for_2D(p, decompose_params) # [pasasge_len, decompose_dim, dim] + q = multi_perspective_expand_for_2D(q, decompose_params) # [question_len, decompose_dim, dim] + p = tf.expand_dims(p, 1) # [pasasge_len, 1, decompose_dim, dim] + q = tf.expand_dims(q, 0) # [1, question_len, decompose_dim, dim] + return cosine_distance(p, q) # [passage_len, question_len, decompose] + elems = (passage_rep, question_rep) + matching_matrix = tf.map_fn(singel_instance, elems, dtype=tf.float32) # [batch_size, passage_len, question_len, decompse_dim] + return tf.concat(axis=2, values=[tf.reduce_max(matching_matrix, axis=2), tf.reduce_mean(matching_matrix, axis=2)])# [batch_size, passage_len, 2*decompse_dim] + +def cross_entropy(logits, truth, mask): + # logits: [batch_size, passage_len] + # truth: [batch_size, passage_len] + # mask: [batch_size, passage_len] + +# xdev = x - x.max() +# return xdev - T.log(T.sum(T.exp(xdev))) + logits = tf.multiply(logits, mask) + xdev = tf.sub(logits, tf.expand_dims(tf.reduce_max(logits, 1), -1)) + log_predictions = tf.sub(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev),-1)),-1)) +# return -T.sum(targets * log_predictions) + result = tf.multiply(tf.multiply(truth, log_predictions), mask) # [batch_size, passage_len] + return tf.multiply(-1.0,tf.reduce_sum(result, -1)) # [batch_size] + +def highway_layer(in_val, output_size, scope=None): + # in_val: [batch_size, passage_len, dim] + input_shape = tf.shape(in_val) + batch_size = input_shape[0] + passage_len = input_shape[1] +# feat_dim = input_shape[2] + in_val = tf.reshape(in_val, [batch_size * passage_len, output_size]) + with tf.variable_scope(scope or "highway_layer"): + highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32) + highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32) + full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32) + full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) + trans = tf.nn.tanh(tf.nn.xw_plus_b(in_val, full_w, full_b)) + gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b)) + outputs = trans * gate + in_val * (1.0 - gate) + outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) + return outputs + +def multi_highway_layer(in_val, output_size, num_layers, scope=None): + scope_name = 'highway_layer' + if scope is not None: scope_name = scope + for i in xrange(num_layers): + cur_scope_name = scope_name + "-{}".format(i) + in_val = highway_layer(in_val, output_size, scope=cur_scope_name) + return in_val + +def cal_max_question_representation(question_representation, atten_scores): + atten_positions = tf.argmax(atten_scores, axis=2, output_type=tf.int32) # [batch_size, passage_len] + 
max_question_reps = layer_utils.collect_representation(question_representation, atten_positions) + return max_question_reps + +def multi_perspective_match(feature_dim, repres1, repres2, is_training=True, dropout_rate=0.2, + options=None, scope_name='mp-match', reuse=False): + ''' + :param repres1: [batch_size, len, feature_dim] + :param repres2: [batch_size, len, feature_dim] + :return: + ''' + input_shape = tf.shape(repres1) + batch_size = input_shape[0] + seq_length = input_shape[1] + matching_result = [] + with tf.variable_scope(scope_name, reuse=reuse): + match_dim = 0 + if options.with_cosine: + cosine_value = layer_utils.cosine_distance(repres1, repres2, cosine_norm=False) + cosine_value = tf.reshape(cosine_value, [batch_size, seq_length, 1]) + matching_result.append(cosine_value) + match_dim += 1 + + if options.with_mp_cosine: + mp_cosine_params = tf.get_variable("mp_cosine", shape=[options.cosine_MP_dim, feature_dim], dtype=tf.float32) + mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0) + mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0) + repres1_flat = tf.expand_dims(repres1, axis=2) + repres2_flat = tf.expand_dims(repres2, axis=2) + mp_cosine_matching = layer_utils.cosine_distance(tf.multiply(repres1_flat, mp_cosine_params), + repres2_flat,cosine_norm=False) + matching_result.append(mp_cosine_matching) + match_dim += options.cosine_MP_dim + + matching_result = tf.concat(axis=2, values=matching_result) + return (matching_result, match_dim) + + +def match_passage_with_question(passage_reps, question_reps, passage_mask, question_mask, passage_lengths, question_lengths, + context_lstm_dim, scope=None, + with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True, + is_training=True, options=None, dropout_rate=0, forward=True): + passage_reps = tf.multiply(passage_reps, tf.expand_dims(passage_mask,-1)) + question_reps = tf.multiply(question_reps, tf.expand_dims(question_mask,-1)) + all_question_aware_representatins = [] + dim = 0 + with tf.variable_scope(scope or "match_passage_with_question"): + relevancy_matrix = cal_relevancy_matrix(question_reps, passage_reps) + relevancy_matrix = mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask) + # relevancy_matrix = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim, + # scope_name="fw_attention", att_type=options.att_type, att_dim=options.att_dim, + # remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) + + all_question_aware_representatins.append(tf.reduce_max(relevancy_matrix, axis=2,keep_dims=True)) + all_question_aware_representatins.append(tf.reduce_mean(relevancy_matrix, axis=2,keep_dims=True)) + dim += 2 + if with_full_match: + if forward: + question_full_rep = layer_utils.collect_final_step_of_lstm(question_reps, question_lengths - 1) + else: + question_full_rep = question_reps[:,0,:] + + passage_len = tf.shape(passage_reps)[1] + question_full_rep = tf.expand_dims(question_full_rep, axis=1) + question_full_rep = tf.tile(question_full_rep, [1, passage_len, 1]) # [batch_size, pasasge_len, feature_dim] + + (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, + passage_reps, question_full_rep, is_training=is_training, dropout_rate=options.dropout_rate, + options=options, scope_name='mp-match-full-match') + all_question_aware_representatins.append(attentive_rep) + dim += match_dim + + if with_maxpool_match: + maxpooling_decomp_params = 
tf.get_variable("maxpooling_matching_decomp", + shape=[options.cosine_MP_dim, context_lstm_dim], dtype=tf.float32) + maxpooling_rep = cal_maxpooling_matching(passage_reps, question_reps, maxpooling_decomp_params) + all_question_aware_representatins.append(maxpooling_rep) + dim += 2*options.cosine_MP_dim + + if with_attentive_match: + atten_scores = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim, + scope_name="attention", att_type=options.att_type, att_dim=options.att_dim, + remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) + att_question_contexts = tf.matmul(atten_scores, question_reps) + (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, + passage_reps, att_question_contexts, is_training=is_training, dropout_rate=options.dropout_rate, + options=options, scope_name='mp-match-att_question') + all_question_aware_representatins.append(attentive_rep) + dim += match_dim + + if with_max_attentive_match: + max_att = cal_max_question_representation(question_reps, relevancy_matrix) + (max_attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, + passage_reps, max_att, is_training=is_training, dropout_rate=options.dropout_rate, + options=options, scope_name='mp-match-max-att') + all_question_aware_representatins.append(max_attentive_rep) + dim += match_dim + + all_question_aware_representatins = tf.concat(axis=2, values=all_question_aware_representatins) + return (all_question_aware_representatins, dim) + +def bilateral_match_func(in_question_repres, in_passage_repres, + question_lengths, passage_lengths, question_mask, passage_mask, input_dim, is_training, options=None): + + question_aware_representatins = [] + question_aware_dim = 0 + passage_aware_representatins = [] + passage_aware_dim = 0 + + # ====word level matching====== + (match_reps, match_dim) = match_passage_with_question(in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths, + question_lengths, input_dim, scope="word_match_forward", + with_full_match=False, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) + question_aware_representatins.append(match_reps) + question_aware_dim += match_dim + + (match_reps, match_dim) = match_passage_with_question(in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths, + passage_lengths, input_dim, scope="word_match_backward", + with_full_match=False, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) + passage_aware_representatins.append(match_reps) + passage_aware_dim += match_dim + + with tf.variable_scope('context_MP_matching'): + for i in xrange(options.context_layer_num): # support multiple context layer + with tf.variable_scope('layer-{}'.format(i)): + # contextual lstm for both passage and question + in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1)) + in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1)) + (question_context_representation_fw, question_context_representation_bw, + in_question_repres) = 
layer_utils.my_lstm_layer( + in_question_repres, options.context_lstm_dim, input_lengths= question_lengths,scope_name="context_represent", + reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) + (passage_context_representation_fw, passage_context_representation_bw, + in_passage_repres) = layer_utils.my_lstm_layer( + in_passage_repres, options.context_lstm_dim, input_lengths=passage_lengths, scope_name="context_represent", + reuse=True, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) + + # Multi-perspective matching + with tf.variable_scope('left_MP_matching'): + (match_reps, match_dim) = match_passage_with_question(passage_context_representation_fw, + question_context_representation_fw, passage_mask, question_mask, passage_lengths, + question_lengths, options.context_lstm_dim, scope="forward_match", + with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) + question_aware_representatins.append(match_reps) + question_aware_dim += match_dim + (match_reps, match_dim) = match_passage_with_question(passage_context_representation_bw, + question_context_representation_bw, passage_mask, question_mask, passage_lengths, + question_lengths, options.context_lstm_dim, scope="backward_match", + with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) + question_aware_representatins.append(match_reps) + question_aware_dim += match_dim + + with tf.variable_scope('right_MP_matching'): + (match_reps, match_dim) = match_passage_with_question(question_context_representation_fw, + passage_context_representation_fw, question_mask, passage_mask, question_lengths, + passage_lengths, options.context_lstm_dim, scope="forward_match", + with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) + passage_aware_representatins.append(match_reps) + passage_aware_dim += match_dim + (match_reps, match_dim) = match_passage_with_question(question_context_representation_bw, + passage_context_representation_bw, question_mask, passage_mask, question_lengths, + passage_lengths, options.context_lstm_dim, scope="backward_match", + with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, + with_attentive_match=options.with_attentive_match, + with_max_attentive_match=options.with_max_attentive_match, + is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) + passage_aware_representatins.append(match_reps) + passage_aware_dim += match_dim + + question_aware_representatins = tf.concat(axis=2, values=question_aware_representatins) # [batch_size, passage_len, question_aware_dim] + passage_aware_representatins = tf.concat(axis=2, values=passage_aware_representatins) # [batch_size, question_len, question_aware_dim] + + if is_training: + question_aware_representatins = 
tf.nn.dropout(question_aware_representatins, (1 - options.dropout_rate)) + passage_aware_representatins = tf.nn.dropout(passage_aware_representatins, (1 - options.dropout_rate)) + + # ======Highway layer====== + if options.with_match_highway: + with tf.variable_scope("left_matching_highway"): + question_aware_representatins = multi_highway_layer(question_aware_representatins, question_aware_dim, + options.highway_layer_num) + with tf.variable_scope("right_matching_highway"): + passage_aware_representatins = multi_highway_layer(passage_aware_representatins, passage_aware_dim, + options.highway_layer_num) + + #========Aggregation Layer====== + aggregation_representation = [] + aggregation_dim = 0 + + qa_aggregation_input = question_aware_representatins + pa_aggregation_input = passage_aware_representatins + with tf.variable_scope('aggregation_layer'): + for i in xrange(options.aggregation_layer_num): # support multiple aggregation layer + qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1)) + (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( + qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths, scope_name='left_layer-{}'.format(i), + reuse=False, is_training=is_training, dropout_rate=options.dropout_rate,use_cudnn=options.use_cudnn) + fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1) + bw_rep = bw_rep[:, 0, :] + aggregation_representation.append(fw_rep) + aggregation_representation.append(bw_rep) + aggregation_dim += 2* options.aggregation_lstm_dim + qa_aggregation_input = cur_aggregation_representation# [batch_size, passage_len, 2*aggregation_lstm_dim] + + pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1)) + (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( + pa_aggregation_input, options.aggregation_lstm_dim, + input_lengths=question_lengths, scope_name='right_layer-{}'.format(i), + reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) + fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1) + bw_rep = bw_rep[:, 0, :] + aggregation_representation.append(fw_rep) + aggregation_representation.append(bw_rep) + aggregation_dim += 2* options.aggregation_lstm_dim + pa_aggregation_input = cur_aggregation_representation# [batch_size, passage_len, 2*aggregation_lstm_dim] + + aggregation_representation = tf.concat(axis=1, values=aggregation_representation) # [batch_size, aggregation_dim] + + # ======Highway layer====== + if options.with_aggregation_highway: + with tf.variable_scope("aggregation_highway"): + agg_shape = tf.shape(aggregation_representation) + batch_size = agg_shape[0] + aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim]) + aggregation_representation = multi_highway_layer(aggregation_representation, aggregation_dim, options.highway_layer_num) + aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim]) + + return (aggregation_representation, aggregation_dim) + diff --git a/bimpm/run.sh b/bimpm/run.sh new file mode 100755 index 0000000..66f1f68 --- /dev/null +++ b/bimpm/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python bimpm.py --train + + +echo "test model" +python bimpm.py --test diff --git a/code/utils.py b/bimpm/utils.py similarity index 100% rename from code/utils.py rename to bimpm/utils.py diff --git 
a/code/models.py b/code/models.py deleted file mode 100755 index ebb652f..0000000 --- a/code/models.py +++ /dev/null @@ -1,488 +0,0 @@ -# -*- encoding:utf-8 -*- -import tensorflow as tf -import numpy as np - -class SiameseNN(object): - def __init__(self, config): - self.config = config - # 输入 - self.add_placeholders() - # [batch_size, sequence_size, embed_size] - q_embed, a_embed = self.add_embeddings() - with tf.variable_scope('siamese') as scope: - self.q_trans = self.network(q_embed) - scope.reuse_variables() - self.a_trans = self.network(a_embed) - # 损失和精确度 - self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) - # 训练节点 - self.train_op = self.add_train_op(self.total_loss) - - # 输入 - def add_placeholders(self): - # 问题 - self.q = tf.placeholder(tf.int32, - shape=[None, self.config.max_q_length], - name='Question') - # 回答 - self.a = tf.placeholder(tf.int32, - shape=[None, self.config.max_a_length], - name='Ans') - self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') - # drop_out - self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') - self.batch_size = tf.shape(self.q)[0] - - # word embeddings - def add_embeddings(self): - with tf.variable_scope('embedding'): - if self.config.embeddings is not None: - embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) - else: - embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) - q_embed = tf.nn.embedding_lookup(embeddings, self.q) - a_embed = tf.nn.embedding_lookup(embeddings, self.a) - q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) - a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) - return q_embed, a_embed - - def network(self, x): - # (batch_size * max_len, embed_size) - max_len = tf.shape(x)[1] - x = tf.reshape(x, (-1, x.get_shape()[-1])) - fc1 = self.fc_layer(x, self.config.hidden_size, "fc1") - ac1 = tf.nn.relu(fc1) - fc2 = self.fc_layer(ac1, self.config.hidden_size, "fc2") - ac2 = tf.nn.relu(fc2) - # (batch_size, max_len, embed_size) - ac3 = tf.reshape(ac2, (self.batch_size, max_len, ac2.get_shape()[1])) - # (batch_size, embed_size) - ac3 = tf.reduce_mean(ac3, axis=1) - fc3 = self.fc_layer(ac3, self.config.output_size, "fc3") - return fc3 - - def fc_layer(self, bottom, n_weight, name): - assert len(bottom.get_shape()) == 2 - n_prev_weight = bottom.get_shape()[1] - initer = tf.truncated_normal_initializer(stddev=0.01) - W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) - b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.01, shape=[n_weight], dtype=tf.float32)) - fc = tf.nn.bias_add(tf.matmul(bottom, W), b) - return fc - - # 损失节点 - def add_loss_op(self, o1, o2): - # 此处用cos距离 - norm_o1 = tf.nn.l2_normalize(o1, dim=1) - norm_o2 = tf.nn.l2_normalize(o2, dim=1) - self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) - - loss = self.contrastive_loss(self.q_a_cosine, self.y) - tf.add_to_collection('total_loss', loss) - total_loss = tf.add_n(tf.get_collection('total_loss')) - return total_loss - - def contrastive_loss(self, Ew, y): - l_1 = self.config.pos_weight * tf.square(1 - Ew) - l_0 = tf.square(tf.maximum(Ew, 0)) - loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) - return loss - - # 训练节点 - def add_train_op(self, loss): - with tf.name_scope('train_op'): - # 记录训练步骤 - self.global_step = tf.Variable(0, name='global_step', trainable=False) - opt = tf.train.AdamOptimizer(self.config.lr) - 
train_op = opt.minimize(loss, self.global_step) - return train_op - - -class SiameseCNN(object): - def __init__(self, config): - self.config = config - # 输入 - self.add_placeholders() - # [batch_size, sequence_size, embed_size] - q_embed, a_embed = self.add_embeddings() - with tf.variable_scope('siamese') as scope: - self.q_trans = self.network(q_embed, reuse=False) - scope.reuse_variables() - self.a_trans = self.network(a_embed, reuse=True) - # 损失和精确度 - self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) - # 训练节点 - self.train_op = self.add_train_op(self.total_loss) - - # 输入 - def add_placeholders(self): - # 问题 - self.q = tf.placeholder(tf.int32, - shape=[None, self.config.max_q_length], - name='Question') - # 回答 - self.a = tf.placeholder(tf.int32, - shape=[None, self.config.max_a_length], - name='Ans') - self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') - # drop_out - self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') - self.batch_size = tf.shape(self.q)[0] - - # word embeddings - def add_embeddings(self): - with tf.variable_scope('embedding'): - if self.config.embeddings is not None: - embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) - else: - embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) - q_embed = tf.nn.embedding_lookup(embeddings, self.q) - a_embed = tf.nn.embedding_lookup(embeddings, self.a) - q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) - a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) - return q_embed, a_embed - - def network(self, x, reuse=False): - # (batch_size, conv_size) - conv1 = self.conv_layer(x, reuse=reuse) - # (batch_size, hidden_size) - fc1 = self.fc_layer(conv1, self.config.hidden_size, "fc1") - ac1 = tf.nn.relu(fc1) - # (batch_size, output_size) - fc2 = self.fc_layer(ac1, self.config.output_size, "fc2") - return fc2 - - def fc_layer(self, bottom, n_weight, name): - assert len(bottom.get_shape()) == 2 - n_prev_weight = bottom.get_shape()[1] - initer = tf.truncated_normal_initializer(stddev=0.01) - W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) - b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.0, shape=[n_weight], dtype=tf.float32)) - fc = tf.nn.bias_add(tf.matmul(bottom, W), b) - return fc - - def conv_layer(self, h, reuse=False): - pool = list() - max_len = h.get_shape()[1] - h = tf.reshape(h, [-1, max_len, h.get_shape()[2], 1]) - for i, filter_size in enumerate(self.config.filter_sizes): - with tf.variable_scope('filter{}'.format(filter_size)): - conv1_W = tf.get_variable('conv_W', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .01)) - conv1_b = tf.get_variable('conv_b', initializer=tf.constant(0.0, shape=[self.config.num_filters])) - # pooling层的bias,Q和A分开 - pool_b = tf.get_variable('pool_b', initializer=tf.constant(0.0, shape=[self.config.num_filters])) - # 卷积 - out = tf.nn.relu((tf.nn.conv2d(h, conv1_W, [1,1,1,1], padding='VALID')+conv1_b)) - # 池化 - out = tf.nn.max_pool(out, [1,max_len-filter_size+1,1,1], [1,1,1,1], padding='VALID') - out = tf.nn.tanh(out+pool_b) - pool.append(out) - # 加入正则项 - if not reuse: - tf.add_to_collection('total_loss', 0.5 * self.config.l2_reg_lambda * tf.nn.l2_loss(conv1_W)) - - total_channels = len(self.config.filter_sizes) * self.config.num_filters - real_pool = 
tf.reshape(tf.concat(pool, 3), [self.batch_size, total_channels]) - return real_pool - - # 损失节点 - def add_loss_op(self, o1, o2): - # 此处用cos距离 - norm_o1 = tf.nn.l2_normalize(o1, dim=1) - norm_o2 = tf.nn.l2_normalize(o2, dim=1) - self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) - - loss = self.contrastive_loss(self.q_a_cosine, self.y) - tf.add_to_collection('total_loss', loss) - total_loss = tf.add_n(tf.get_collection('total_loss')) - return total_loss - - def contrastive_loss(self, Ew, y): - l_1 = self.config.pos_weight * tf.square(1 - Ew) - l_0 = tf.square(tf.maximum(Ew, 0)) - loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) - return loss - - # 训练节点 - def add_train_op(self, loss): - with tf.name_scope('train_op'): - # 记录训练步骤 - self.global_step = tf.Variable(0, name='global_step', trainable=False) - opt = tf.train.AdamOptimizer(self.config.lr) - train_op = opt.minimize(loss, self.global_step) - return train_op - - -class SiameseRNN(object): - def __init__(self, config): - self.config = config - # 输入 - self.add_placeholders() - # [batch_size, sequence_size, embed_size] - q_embed, a_embed = self.add_embeddings() - with tf.variable_scope('siamese') as scope: - self.q_trans = self.network(q_embed) - tf.get_variable_scope().reuse_variables() - self.a_trans = self.network(a_embed) - # 损失和精确度 - self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) - # 训练节点 - self.train_op = self.add_train_op(self.total_loss) - - # 输入 - def add_placeholders(self): - # 问题 - self.q = tf.placeholder(tf.int32, - shape=[None, self.config.max_q_length], - name='Question') - # 回答 - self.a = tf.placeholder(tf.int32, - shape=[None, self.config.max_a_length], - name='Ans') - self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') - # drop_out - self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') - self.batch_size = tf.shape(self.q)[0] - - # word embeddings - def add_embeddings(self): - with tf.variable_scope('embedding'): - if self.config.embeddings is not None: - embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) - else: - embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) - q_embed = tf.nn.embedding_lookup(embeddings, self.q) - a_embed = tf.nn.embedding_lookup(embeddings, self.a) - q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) - a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) - return q_embed, a_embed - - def network(self, x): - sequence_length = x.get_shape()[1] - # (batch_size, time_step, embed_size) -> (time_step, batch_size, embed_size) - inputs = tf.transpose(x, [1, 0, 2]) - inputs = tf.reshape(inputs, [-1, self.config.embedding_size]) - inputs = tf.split(inputs, sequence_length, 0) - # (batch_size, rnn_output_size) - rnn1 = self.rnn_layer(inputs) - # (batch_size, hidden_size) - fc1 = self.fc_layer(rnn1, self.config.hidden_size, "fc1") - ac1 = tf.nn.relu(fc1) - # (batch_size, output_size) - fc2 = self.fc_layer(ac1, self.config.output_size, "fc2") - return fc2 - - def fc_layer(self, bottom, n_weight, name): - assert len(bottom.get_shape()) == 2 - n_prev_weight = bottom.get_shape()[1] - initer = tf.truncated_normal_initializer(stddev=0.01) - W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) - b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.01, shape=[n_weight], dtype=tf.float32)) - fc = tf.nn.bias_add(tf.matmul(bottom, W), b) - return fc - - def 
rnn_layer(self, h): - if self.config.cell_type == 'lstm': - birnn_fw, birnn_bw = self.bi_lstm(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) - else: - birnn_fw, birnn_bw = self.bi_gru(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) - outputs_x1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(birnn_fw, birnn_bw, h, dtype=tf.float32) - # (time_step, batch_size, 2*rnn_size) -> (batch_size, 2*rnn_size) - output_x1 = tf.reduce_mean(outputs_x1, 0) - return output_x1 - - def bi_lstm(self, rnn_size, layer_size, keep_prob): - - # forward rnn - with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): - lstm_fw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] - lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) - - # backward rnn - with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): - lstm_bw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] - lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) - - return lstm_fw_cell_m, lstm_bw_cell_m - - def bi_gru(self, rnn_size, layer_size, keep_prob): - - # forward rnn - with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): - gru_fw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] - gru_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_fw_cell_list), output_keep_prob=keep_prob) - - # backward rnn - with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): - gru_bw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] - gru_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_bw_cell_list), output_keep_prob=keep_prob) - - return gru_fw_cell_m, gru_bw_cell_m - - # 损失节点 - def add_loss_op(self, o1, o2): - # 此处用cos距离 - norm_o1 = tf.nn.l2_normalize(o1, dim=1) - norm_o2 = tf.nn.l2_normalize(o2, dim=1) - self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) - - loss = self.contrastive_loss(self.q_a_cosine, self.y) - tf.add_to_collection('total_loss', loss) - total_loss = tf.add_n(tf.get_collection('total_loss')) - return total_loss - - def contrastive_loss(self, Ew, y): - l_1 = self.config.pos_weight * tf.square(1 - Ew) - l_0 = tf.square(tf.maximum(Ew, 0)) - loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) - return loss - - # 训练节点 - def add_train_op(self, loss): - with tf.name_scope('train_op'): - # 记录训练步骤 - self.global_step = tf.Variable(0, name='global_step', trainable=False) - opt = tf.train.AdamOptimizer(self.config.lr) - train_op = opt.minimize(loss, self.global_step) - return train_op - - -class QACNN(object): - """ - pairwise学习模型 - """ - def __init__(self, config): - self.config = config - # 输入 - self.add_placeholders() - # [batch_size, sequence_size, embed_size] - q_embed, aplus_embed, aminus_embed = self.add_embeddings() - # [batch_size, sequence_size, hidden_size, 1] - self.h_q, self.h_ap, self.h_am = self.add_hl(q_embed, aplus_embed, aminus_embed) - # [batch_size, total_channels] - real_pool_q, real_pool_ap, real_pool_am = self.add_model(q_embed, aplus_embed, aminus_embed) - # [batch_size, 1] - self.q_ap_cosine, self.q_am_cosine = self.calc_cosine(real_pool_q, real_pool_ap, real_pool_am) - # 损失和精确度 - self.total_loss, self.loss, self.accu = self.add_loss_op(self.q_ap_cosine, self.q_am_cosine) - # 训练节点 - self.train_op = self.add_train_op(self.total_loss) - - - # 输入 - def add_placeholders(self): - # 问题 - self.q = 
tf.placeholder(tf.int32, - shape=[None, self.config.max_q_length], - name='Question') - # 正向回答 - self.aplus = tf.placeholder(tf.int32, - shape=[None, self.config.max_a_length], - name='PosAns') - # 负向回答 - self.aminus = tf.placeholder(tf.int32, - shape=[None, self.config.max_a_length], - name='NegAns') - # drop_out - self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') - self.batch_size = tf.shape(self.q)[0] - - # word embeddings - def add_embeddings(self): - with tf.variable_scope('embedding'): - if self.config.embeddings is not None: - embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) - else: - embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) - q_embed = tf.nn.embedding_lookup(embeddings, self.q) - aplus_embed = tf.nn.embedding_lookup(embeddings, self.aplus) - aminus_embed = tf.nn.embedding_lookup(embeddings, self.aminus) - q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) - aplus_embed = tf.nn.dropout(aplus_embed, keep_prob=self.keep_prob) - aminus_embed = tf.nn.dropout(aminus_embed, keep_prob=self.keep_prob) - return q_embed, aplus_embed, aminus_embed - - # Hidden Layer - def add_hl(self, q_embed, aplus_embed, aminus_embed): - with tf.variable_scope('HL'): - W = tf.get_variable('weights', shape=[self.config.embedding_size, self.config.hidden_size], initializer=tf.uniform_unit_scaling_initializer()) - b = tf.get_variable('biases', initializer=tf.constant(0.1, shape=[self.config.hidden_size])) - h_q = tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(q_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_q_length, self.config.hidden_size]) - h_ap = tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(aplus_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_a_length, self.config.hidden_size]) - h_am = tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(aminus_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_a_length, self.config.hidden_size]) - tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(W)) - return h_q, h_ap, h_am - - # CNN层 - def add_model(self, h_q, h_ap, h_am): - pool_q = list() - pool_ap = list() - pool_am = list() - h_q = tf.reshape(h_q, [-1, self.config.max_q_length, self.config.embedding_size, 1]) - h_ap = tf.reshape(h_ap, [-1, self.config.max_a_length, self.config.embedding_size, 1]) - h_am = tf.reshape(h_am, [-1, self.config.max_a_length, self.config.embedding_size, 1]) - for i, filter_size in enumerate(self.config.filter_sizes): - with tf.variable_scope('filter{}'.format(filter_size)): - conv1_W = tf.get_variable('W_q', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .1)) - conv2_W = tf.get_variable('W_a', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .1)) - conv1_b = tf.get_variable('conv_qb', initializer=tf.constant(0.1, shape=[self.config.num_filters])) - conv2_b = tf.get_variable('conv_ab', initializer=tf.constant(0.1, shape=[self.config.num_filters])) - # pooling层的bias,Q和A分开 - pool_qb = tf.get_variable('pool_qb', initializer=tf.constant(0.1, shape=[self.config.num_filters])) - pool_ab = tf.get_variable('pool_ab', initializer=tf.constant(0.1, shape=[self.config.num_filters])) - # 卷积 - out_q = tf.nn.relu((tf.nn.conv2d(h_q, conv1_W, [1,1,1,1], padding='VALID')+conv1_b)) - # 池化 - out_q = 
tf.nn.max_pool(out_q, [1,self.config.max_q_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') - out_q = tf.nn.tanh(out_q+pool_qb) - pool_q.append(out_q) - - out_ap = tf.nn.relu((tf.nn.conv2d(h_ap, conv2_W, [1,1,1,1], padding='VALID')+conv2_b)) - out_ap = tf.nn.max_pool(out_ap, [1,self.config.max_a_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') - out_ap = tf.nn.tanh(out_ap+pool_ab) - pool_ap.append(out_ap) - - out_am = tf.nn.relu((tf.nn.conv2d(h_am, conv2_W, [1,1,1,1], padding='VALID')+conv2_b)) - out_am = tf.nn.max_pool(out_am, [1,self.config.max_a_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') - out_am = tf.nn.tanh(out_am+pool_ab) - pool_am.append(out_am) - - # 加入正则项 - tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(conv1_W)) - tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(conv2_W)) - - total_channels = len(self.config.filter_sizes)*self.config.num_filters - real_pool_q = tf.reshape(tf.concat(pool_q, 3), [-1, total_channels]) - real_pool_ap = tf.reshape(tf.concat(pool_ap, 3), [-1, total_channels]) - real_pool_am = tf.reshape(tf.concat(pool_am, 3), [-1, total_channels]) - - return real_pool_q, real_pool_ap, real_pool_am - - # 计算cosine - def calc_cosine(self, real_pool_q, real_pool_ap, real_pool_am): - normalized_q_h_pool = tf.nn.l2_normalize(real_pool_q, dim=1) - normalized_pos_h_pool = tf.nn.l2_normalize(real_pool_ap, dim=1) - normalized_neg_h_pool = tf.nn.l2_normalize(real_pool_am, dim=1) - q_ap_cosine = tf.reduce_sum(tf.multiply(normalized_q_h_pool, normalized_pos_h_pool), 1) - q_am_cosine = tf.reduce_sum(tf.multiply(normalized_q_h_pool, normalized_neg_h_pool), 1) - - return q_ap_cosine, q_am_cosine - - # 损失节点 - def add_loss_op(self, q_ap_cosine, q_am_cosine): - original_loss = self.config.m - q_ap_cosine + q_am_cosine - l = tf.maximum(tf.zeros_like(original_loss), original_loss) - loss = tf.reduce_sum(l) - tf.add_to_collection('total_loss', loss) - total_loss = tf.add_n(tf.get_collection('total_loss')) - accu = tf.reduce_mean(tf.cast(tf.equal(0., l), tf.float32)) - return total_loss, loss, accu - - # 训练节点 - def add_train_op(self, loss): - with tf.name_scope('train_op'): - # 记录训练步骤 - self.global_step = tf.Variable(0, name='global_step', trainable=False) - opt = tf.train.AdamOptimizer(self.config.lr) - train_op = opt.minimize(loss, self.global_step) - return train_op diff --git a/decomposable_att_model/README.me b/decomposable_att_model/README.me new file mode 100755 index 0000000..bd58fa0 --- /dev/null +++ b/decomposable_att_model/README.me @@ -0,0 +1,23 @@ +# 复现《A Decomposable Attention Model for Natural Language Inference》中的模型完成问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. 
+python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/decomposable_att_model/decomp_att.py b/decomposable_att_model/decomp_att.py new file mode 100755 index 0000000..bf3ea38 --- /dev/null +++ b/decomposable_att_model/decomp_att.py @@ -0,0 +1,176 @@ +# -*- encoding:utf8 -*- +import tensorflow as tf +import numpy as np +import os +import sys +from copy import deepcopy +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import cPickle as pkl +from utils import * +from models import DecompAtt + + +class DecompAttConfig(object): + def __init__(self, vocab_size, embeddings=None): + # 输入问题(句子)长度 + self.max_q_length = 200 + # 输入答案长度 + self.max_a_length = 200 + # 循环数 + self.num_epochs = 100 + # batch大小 + self.batch_size = 128 + # 词表大小 + self.vocab_size = vocab_size + # 词向量大小 + self.embeddings = embeddings + self.embedding_size = 100 + if self.embeddings is not None: + self.embedding_size = embeddings.shape[1] + # RNN单元类型和大小与堆叠层数 + self.cell_type = 'GRU' + self.rnn_size = 128 + self.layer_size = 1 + # 隐层大小 + self.hidden_size = 128 + self.output_size = 128 + # keep_prob=1-dropout + self.keep_prob = 0.6 + # 学习率 + self.lr = 0.0003 + self.grad_clip = 1. + + self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 + + +def train(train_corpus, config, val_corpus, eval_train_corpus=None): + iterator = Iterator(train_corpus) + + with tf.Session(config=config.cf) as sess: + model = DecompAtt(config) + saver = tf.train.Saver() + sess.run(tf.initialize_all_variables()) + for epoch in xrange(config.num_epochs): + count = 0 + for batch_x in iterator.next(config.batch_size, shuffle=True): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + labels = np.asarray(labels).astype(np.int32) + _, loss = sess.run([model.train_op, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:config.keep_prob}) + count += 1 + if count % 10 == 0: + print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) + if eval_train_corpus is not None: + train_res = evaluate(sess, model, eval_train_corpus, config) + print('[train] ' + train_res) + if val_corpus is not None: + val_res = evaluate(sess, model, val_corpus, config) + print('[eval] ' + val_res) + + +def evaluate(sess, model, corpus, config): + iterator = Iterator(corpus) + + count = 0 + total_qids = [] + total_aids = [] + total_pred = [] + total_labels = [] + total_loss = 0. 
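# --- Illustrative sketch (not part of the original patch) ------------------
# The loop below collects (qid, aid, prediction, label) tuples batch by batch
# and finally scores them with eval_map_mrr() from utils.py. For reference,
# this is a minimal, self-contained way to compute AP and MRR for a single
# question's ranked answers; the helper name _sketch_ap_mrr and its argument
# are invented for this sketch and do not exist in the repository.
def _sketch_ap_mrr(score_label_pairs):
    # score_label_pairs: [(model_score, gold_label), ...] for one question
    ranked = sorted(score_label_pairs, key=lambda p: p[0], reverse=True)
    labels = [lab for _, lab in ranked]
    if sum(labels) == 0:
        return None, None  # questions without a positive answer are skipped
    # MRR contribution: reciprocal rank of the first relevant answer
    first_hit = next(i for i, lab in enumerate(labels) if lab == 1)
    mrr = 1.0 / (first_hit + 1)
    # AP: average of precision@k taken at every relevant position
    hits, precisions = 0, []
    for k, lab in enumerate(labels, start=1):
        if lab == 1:
            hits += 1
            precisions.append(float(hits) / k)
    ap = sum(precisions) / hits
    return ap, mrr
# Averaging ap / mrr over all questions that have at least one positive answer
# yields the MAP / MRR figures printed by evaluate().
# ----------------------------------------------------------------------------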
+ for batch_x in iterator.next(config.batch_size, shuffle=False): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + y_hat, loss = sess.run([model.y_hat, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:1.}) + y_hat = np.argmax(y_hat, axis=-1) + total_loss += loss + count += 1 + total_qids.append(batch_qids) + total_aids.append(batch_aids) + total_pred.append(y_hat) + total_labels.append(labels) + # print(batch_qids[0], [id2word[_] for _ in batch_q[0]], + # batch_aids[0], [id2word[_] for _ in batch_ap[0]]) + total_qids = np.concatenate(total_qids, axis=0) + total_aids = np.concatenate(total_aids, axis=0) + total_pred = np.concatenate(total_pred, axis=0) + total_labels = np.concatenate(total_labels, axis=0) + MAP, MRR = eval_map_mrr(total_qids, total_aids, total_pred, total_labels) + # print('Eval loss:{}'.format(total_loss / count)) + return 'MAP:{}, MRR:{}'.format(MAP, MRR) + + +def test(corpus, config): + with tf.Session(config=config.cf) as sess: + model = DecompAtt(config) + saver = tf.train.Saver() + saver.restore(sess, tf.train.latest_checkpoint(model_path)) + print('[test] ' + evaluate(sess, model, corpus, config)) + + +def main(args): + max_q_length = 25 + max_a_length = 90 + + with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'r') as fr: + train_corpus, val_corpus, test_corpus = pkl.load(fr) + + embeddings = build_embedding(embedding_path, word2id) + + train_qids, train_q, train_aids, train_ap, train_labels = zip(*train_corpus) + train_q = padding(train_q, max_q_length) + train_ap = padding(train_ap, max_a_length) + train_corpus = zip(train_qids, train_q, train_aids, train_ap, train_labels) + + + val_qids, val_q, val_aids, val_ap, labels = zip(*val_corpus) + val_q = padding(val_q, max_q_length) + val_ap = padding(val_ap, max_a_length) + val_corpus = zip(val_qids, val_q, val_aids, val_ap, labels) + + + test_qids, test_q, test_aids, test_ap, labels = zip(*test_corpus) + test_q = padding(test_q, max_q_length) + test_ap = padding(test_ap, max_a_length) + test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) + + config = DecompAttConfig(max(word2id.values()) + 1, embeddings=embeddings) + config.max_q_length = max_q_length + config.max_a_length = max_a_length + if args.train: + train(deepcopy(train_corpus), config, val_corpus, deepcopy(train_corpus)) + elif args.test: + test(test_corpus, config) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--train", help="whether to train", action='store_true') + parser.add_argument("--test", help="whether to test", action='store_true') + args = parser.parse_args() + + raw_data_path = '../data/WikiQA/raw' + processed_data_path = '../data/WikiQA/processed' + embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' + + with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: + word2id, id2word = pkl.load(fr) + main(args) diff --git a/decomposable_att_model/models.py b/decomposable_att_model/models.py new file mode 100755 index 0000000..e9fac8b --- /dev/null +++ b/decomposable_att_model/models.py @@ -0,0 +1,203 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + + +class DecompAtt(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + # 
上下文编码 + q_encode, a_encode = self.context_encoding(q_embed, a_embed) + # attention层 + q_attend, a_attend = self.attend(q_encode, a_encode) + # compose层 + q_comp, a_comp = self.compare(q_encode, a_encode, q_attend, a_attend) + # aggregate层 + pred = self.aggregate(q_comp, a_comp) + # 预测概率分布与损失 + self.y_hat, self.total_loss = self.add_loss_op(pred) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + + def attend(self, q, a): + """ + q: [batch_size, q_length, represent_dim] + a: [batch_size, a_length, represent_dim] + """ + + q = tf.nn.dropout(q, keep_prob=self.keep_prob) + a = tf.nn.dropout(a, keep_prob=self.keep_prob) + q_map = tf.layers.dense(q, 128, activation=tf.nn.relu, name='embed_map') + a_map = tf.layers.dense(a, 128, activation=tf.nn.relu, name='embed_map', reuse=True) + # [batch_size, q_length, a_length] + att_inner_product = tf.matmul( + q_map, + tf.transpose(a_map, [0, 2, 1])) + # [batch_size, a_length, q_length] + q_weights = tf.nn.softmax( + tf.transpose( + att_inner_product, (0, 2, 1)), dim=-1) + # [batch_size, q_length, a_length] + a_weights = tf.nn.softmax(att_inner_product, dim=-1) + + output_a = tf.matmul(q_weights, q) + output_q = tf.matmul(a_weights, a) + + return output_q, output_a + + def compare(self, q, a, q_att, a_att): + """ + q: [batch_size, q_length, represent_dim] + a: [batch_size, a_length, represent_dim] + q_att: [batch_size, q_length, represent_dim] + a_att: [batch_size, a_length, represent_dim] + """ + q_combine = tf.concat([q, q_att], axis=-1) + a_combine = tf.concat([a, a_att], axis=-1) + q_combine = tf.nn.dropout(q_combine, keep_prob=self.keep_prob) + a_combine = tf.nn.dropout(a_combine, keep_prob=self.keep_prob) + q_map = self.mlp(q_combine, self.config.hidden_size, 2, 'embed_compare') + a_map = self.mlp(a_combine, self.config.hidden_size, 2, 'embed_compare', reuse=True) + return q_map, a_map + + def aggregate(self, q, a): + """ + q: [batch_size, q_length, represent_dim] + a: [batch_size, a_length, represent_dim] + """ + # 输出shape为[batch_size, represent_dim] + q_sum = tf.reduce_sum(q, 1) + a_sum = tf.reduce_sum(a, 1) + q_sum = tf.nn.dropout(q_sum, keep_prob=self.keep_prob) + a_sum = tf.nn.dropout(a_sum, keep_prob=self.keep_prob) + q_a_rep = tf.concat([q_sum, a_sum], axis=-1) + pred = self.mlp(q_a_rep, self.config.output_size, 2, 'embed_aggregate') + pred = tf.layers.dense(pred, 2, activation=None, name='prediction') + return pred + + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.int32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + return q_embed, a_embed + + def context_encoding(self, q, a): + """ + q: [batch_size, q_length, embedding_dim] + a: [batch_size, a_length, embedding_dim] + """ + with tf.variable_scope('context_encoding') as scope: + q = tf.nn.dropout(q, 
keep_prob=self.keep_prob) + a = tf.nn.dropout(a, keep_prob=self.keep_prob) + q_encode = self.rnn_layer(q) + tf.get_variable_scope().reuse_variables() + a_encode = self.rnn_layer(a) + return q_encode, a_encode + + def mlp(self, bottom, size, layer_num, name, reuse=None): + """ + bottom: 上层输入 + size: 神经元大小 + layer_num: 神经网络层数 + name: mlp的名称 + reuse: 是否复用层 + """ + now = bottom + for i in xrange(layer_num): + now = tf.layers.dense(now, 128, + activation=tf.nn.relu, + name=name + '_{}'.format(i), + reuse=reuse) + return now + + def rnn_layer(self, h): + sequence_length = h.get_shape()[1] + # (batch_size, time_step, embed_size) -> (time_step, batch_size, embed_size) + inputs = tf.transpose(h, [1, 0, 2]) + inputs = tf.reshape(inputs, [-1, self.config.embedding_size]) + inputs = tf.split(inputs, sequence_length, 0) + + if self.config.cell_type == 'lstm': + birnn_fw, birnn_bw = self.bi_lstm(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) + else: + birnn_fw, birnn_bw = self.bi_gru(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) + outputs_x1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(birnn_fw, birnn_bw, inputs, dtype=tf.float32) + # (time_step, batch_size, 2*rnn_size) -> (batch_size, time_step, 2*rnn_size) + output_x1 = tf.transpose(outputs_x1, (1, 0, 2)) + return output_x1 + + def bi_lstm(self, rnn_size, layer_size, keep_prob): + # forward rnn + with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): + lstm_fw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] + lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) + + # backward rnn + with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): + lstm_bw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] + lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) + + return lstm_fw_cell_m, lstm_bw_cell_m + + def bi_gru(self, rnn_size, layer_size, keep_prob): + # forward rnn + with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): + gru_fw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] + gru_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_fw_cell_list), output_keep_prob=keep_prob) + + # backward rnn + with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): + gru_bw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] + gru_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_bw_cell_list), output_keep_prob=keep_prob) + + return gru_fw_cell_m, gru_bw_cell_m + + def add_loss_op(self, pred): + """ + 损失节点 + """ + # [batch_size, 2] + y_hat = tf.nn.softmax(pred, dim=-1) + loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(self.y, pred)) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return y_hat, total_loss + + def add_train_op(self, loss): + """ + 训练节点 + """ + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + # train_op = opt.minimize(loss, self.global_step) + train_variables = tf.trainable_variables() + grads_vars = opt.compute_gradients(loss, train_variables) + for i, (grad, var) in enumerate(grads_vars): + grads_vars[i] = (tf.clip_by_norm(grad, self.config.grad_clip), var) + train_op = opt.apply_gradients(grads_vars, 
global_step=self.global_step) + return train_op diff --git a/decomposable_att_model/run.sh b/decomposable_att_model/run.sh new file mode 100755 index 0000000..26df3c0 --- /dev/null +++ b/decomposable_att_model/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python decomp_att.py --train + + +echo "test model" +python decomp_att.py --test diff --git a/decomposable_att_model/utils.py b/decomposable_att_model/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/decomposable_att_model/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in zip(qids, aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. 
* pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part diff --git a/download.sh b/download.sh new file mode 100755 index 0000000..26915c2 --- /dev/null +++ b/download.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +PWD=$(pwd) + +# Download GloVe +GLOVE_DIR=$PWD/data/embedding +mkdir -p $GLOVE_DIR +wget http://nlp.stanford.edu/data/glove.6B.300d.zip -O $GLOVE_DIR/glove.6B.300d.zip +unzip $GLOVE_DIR/glove.6B.300d.zip -d $GLOVE_DIR + +# Download Glove Character Embedding +# wget https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt -O $GLOVE_DIR/glove.840B.300d-char.txt + +# Download fasttext +# FASTTEXT_DIR=~/data/fasttext +# mkdir -p $FASTTEXT_DIR +# wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip -O $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip +# unzip $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip -d $FASTTEXT_DIR \ No newline at end of file diff --git a/code/preprocess_wiki.ipynb b/preprocess_wiki.ipynb old mode 100644 new mode 100755 similarity index 100% rename from code/preprocess_wiki.ipynb rename to preprocess_wiki.ipynb diff --git a/preprocess_wiki.py b/preprocess_wiki.py new file mode 100755 index 0000000..e7aafea --- /dev/null +++ b/preprocess_wiki.py @@ -0,0 +1,165 @@ + +# coding: utf-8 + +# In[1]: + + +import os +import sys +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +import cPickle as pkl + +from collections import Counter +from nltk import sent_tokenize, word_tokenize +from nltk.corpus import stopwords, wordnet +from nltk.stem import WordNetLemmatizer +import jieba +# jieba.enable_parallel(8) +lemma = WordNetLemmatizer() + +raw_data_path = './data/WikiQA/raw' +processed_data_path = './data/WikiQA/processed' + +if not os.path.exists(processed_data_path): + os.mkdir(processed_data_path) + + +# In[8]: + + +# 分词、词干化处理 +def segment(filename, use_lemma=True): + processed_qa = [] + count = 0 + with open(os.path.join(raw_data_path, filename), 'r') as fr: + fr.readline() + for line in fr: + items = line.strip().split('\t') + qid, q, aid, a, label = items[0], items[1], items[4], items[5], items[6] + if use_lemma: + q = ' '.join([lemma.lemmatize(_) for _ in jieba.cut(q)]).lower() + a = ' '.join([lemma.lemmatize(_) for _ in jieba.cut(a)]).lower() + else: + q = ' '.join(jieba.cut(q)).lower() + q = ' '.join(jieba.cut(a)).lower() + processed_qa.append('\t'.join([qid, q, aid, a, label])) + count += 1 + if count % 1000 == 0: + print('Finished {}'.format(count)) + return processed_qa + +# 构建词典 +def build_vocab(corpus, topk=None): + vocab = Counter() + for line in corpus: + qid, q, aid, a, label = line.strip().split('\t') + vocab.update(q.split()) + vocab.update(a.split()) + if topk: + vocab = 
vocab.most_common(topk) + else: + vocab = dict(vocab.most_common()).keys() + vocab = {_ : i+2 for i, _ in enumerate(vocab)} + vocab[''] = 0 + vocab[''] = 1 + reverse_vocab = dict(zip(vocab.values(), vocab.keys())) + return vocab, reverse_vocab + +# 将每个词映射为词典中的id +def transform(corpus, word2id, unk_id=1): + transformed_corpus = [] + for line in corpus: + qid, q, aid, a, label = line.strip().split('\t') + q = [word2id.get(w, unk_id) for w in q.split()] + a = [word2id.get(w, unk_id) for w in a.split()] + transformed_corpus.append([qid, q, aid, a, int(label)]) + return transformed_corpus + +# 得到pointwise形式的数据,即(Q, A, label) +def pointwise_data(corpus, keep_ids=False): + # (q, a, label) + pointwise_corpus = [] + for sample in corpus: + qid, q, aid, a, label = sample + if keep_ids: + pointwise_corpus.append((qid, q, aid, a, label)) + else: + pointwise_corpus.append((q, a, label)) + return pointwise_corpus + +# 得到pairwise形式的数据,即(Q, positive A, negative A) +def pairwise_data(corpus): + # (q, a_pos, a_neg), two answers must from the same q + # once a question contains no positive answers, we discard this sample. + pairwise_corpus = dict() + for sample in corpus: + qid, q, aid, a, label = sample + pairwise_corpus.setdefault(qid, dict()) + pairwise_corpus[qid].setdefault('pos', list()) + pairwise_corpus[qid].setdefault('neg', list()) + pairwise_corpus[qid]['q'] = q + if label == 0: + pairwise_corpus[qid]['neg'].append(a) + else: + pairwise_corpus[qid]['pos'].append(a) + real_pairwise_corpus = [] + for qid in pairwise_corpus: + q = pairwise_corpus[qid]['q'] + for pos in pairwise_corpus[qid]['pos']: + for neg in pairwise_corpus[qid]['neg']: + real_pairwise_corpus.append((q, pos, neg)) + return real_pairwise_corpus + +# 得到listwise形式的数据,即(Q, All answers related to this Q) +def listwise_data(corpus): + # (q, a_list) + listwise_corpus = dict() + for sample in corpus: + qid, q, aid, a, label = sample + listwise_corpus.setdefault(qid, dict()) + listwise_corpus[qid].setdefault('a', list()) + listwise_corpus[qid]['q'] = q + listwise_corpus[qid]['a'].append(a) + real_listwise_corpus = [] + for qid in listwise_corpus: + q = listwise_corpus[qid]['q'] + alist = listwise_corpus[qid]['a'] + real_listwise_corpus.append((q, alist)) + return real_listwise_corpus + + +train_processed_qa = segment('WikiQA-train.tsv') +val_processed_qa = segment('WikiQA-dev.tsv') +test_processed_qa = segment('WikiQA-test.tsv') +word2id, id2word = build_vocab(train_processed_qa) + +transformed_train_corpus = transform(train_processed_qa, word2id) +pointwise_train_corpus = pointwise_data(transformed_train_corpus, keep_ids=True) +pairwise_train_corpus = pairwise_data(transformed_train_corpus) +listwise_train_corpus = listwise_data(transformed_train_corpus) + +transformed_val_corpus = transform(val_processed_qa, word2id) +pointwise_val_corpus = pointwise_data(transformed_val_corpus, keep_ids=True) +pairwise_val_corpus = pointwise_data(transformed_val_corpus, keep_ids=True) +listwise_val_corpus = listwise_data(transformed_val_corpus) + +transformed_test_corpus = transform(test_processed_qa, word2id) +pointwise_test_corpus = pointwise_data(transformed_test_corpus, keep_ids=True) +pairwise_test_corpus = pointwise_data(transformed_test_corpus, keep_ids=True) +listwise_test_corpus = listwise_data(transformed_test_corpus) + + +with open(os.path.join(processed_data_path, 'vocab.pkl'), 'w') as fw: + pkl.dump([word2id, id2word], fw) +with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'w') as fw: + pkl.dump([pointwise_train_corpus, 
pointwise_val_corpus, pointwise_test_corpus], fw) +with open(os.path.join(processed_data_path, 'pairwise_corpus.pkl'), 'w') as fw: + pkl.dump([pairwise_train_corpus, pairwise_val_corpus, pairwise_test_corpus], fw) +with open(os.path.join(processed_data_path, 'listwise_corpus.pkl'), 'w') as fw: + pkl.dump([listwise_train_corpus, listwise_val_corpus, listwise_test_corpus], fw) + +print('done!') + diff --git a/qacnn/._models.py b/qacnn/._models.py new file mode 100755 index 0000000..772354b Binary files /dev/null and b/qacnn/._models.py differ diff --git a/qacnn/README.me b/qacnn/README.me new file mode 100755 index 0000000..95b06ab --- /dev/null +++ b/qacnn/README.me @@ -0,0 +1,23 @@ +# 使用pairwise形式的QACNN网络实现问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. +python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/qacnn/models.py b/qacnn/models.py new file mode 100755 index 0000000..0be58b7 --- /dev/null +++ b/qacnn/models.py @@ -0,0 +1,143 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + +class QACNN(object): + """ + pairwise学习模型 + """ + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, aplus_embed, aminus_embed = self.add_embeddings() + # [batch_size, sequence_size, hidden_size, 1] + self.h_q, self.h_ap, self.h_am = self.add_hl(q_embed, aplus_embed, aminus_embed) + # [batch_size, total_channels] + real_pool_q, real_pool_ap, real_pool_am = self.add_model(q_embed, aplus_embed, aminus_embed) + # [batch_size, 1] + self.q_ap_cosine, self.q_am_cosine = self.calc_cosine(real_pool_q, real_pool_ap, real_pool_am) + # 损失和精确度 + self.total_loss, self.loss, self.accu = self.add_loss_op(self.q_ap_cosine, self.q_am_cosine) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + + # 输入 + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 正向回答 + self.aplus = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='PosAns') + # 负向回答 + self.aminus = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='NegAns') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + # word embeddings + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + aplus_embed = tf.nn.embedding_lookup(embeddings, self.aplus) + aminus_embed = tf.nn.embedding_lookup(embeddings, self.aminus) + q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) + aplus_embed = tf.nn.dropout(aplus_embed, keep_prob=self.keep_prob) + aminus_embed = tf.nn.dropout(aminus_embed, keep_prob=self.keep_prob) + return q_embed, aplus_embed, aminus_embed + + # Hidden Layer + def add_hl(self, q_embed, aplus_embed, aminus_embed): + with tf.variable_scope('HL'): + W = tf.get_variable('weights', shape=[self.config.embedding_size, self.config.hidden_size], initializer=tf.uniform_unit_scaling_initializer()) + b = tf.get_variable('biases', initializer=tf.constant(0.1, shape=[self.config.hidden_size])) + h_q = 
tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(q_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_q_length, self.config.hidden_size]) + h_ap = tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(aplus_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_a_length, self.config.hidden_size]) + h_am = tf.reshape(tf.nn.tanh(tf.matmul(tf.reshape(aminus_embed, [-1, self.config.embedding_size]), W)+b), [-1, self.config.max_a_length, self.config.hidden_size]) + tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(W)) + return h_q, h_ap, h_am + + # CNN层 + def add_model(self, h_q, h_ap, h_am): + pool_q = list() + pool_ap = list() + pool_am = list() + h_q = tf.reshape(h_q, [-1, self.config.max_q_length, self.config.embedding_size, 1]) + h_ap = tf.reshape(h_ap, [-1, self.config.max_a_length, self.config.embedding_size, 1]) + h_am = tf.reshape(h_am, [-1, self.config.max_a_length, self.config.embedding_size, 1]) + for i, filter_size in enumerate(self.config.filter_sizes): + with tf.variable_scope('filter{}'.format(filter_size)): + conv1_W = tf.get_variable('W_q', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .1)) + conv2_W = tf.get_variable('W_a', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .1)) + conv1_b = tf.get_variable('conv_qb', initializer=tf.constant(0.1, shape=[self.config.num_filters])) + conv2_b = tf.get_variable('conv_ab', initializer=tf.constant(0.1, shape=[self.config.num_filters])) + # pooling层的bias,Q和A分开 + pool_qb = tf.get_variable('pool_qb', initializer=tf.constant(0.1, shape=[self.config.num_filters])) + pool_ab = tf.get_variable('pool_ab', initializer=tf.constant(0.1, shape=[self.config.num_filters])) + # 卷积 + out_q = tf.nn.relu((tf.nn.conv2d(h_q, conv1_W, [1,1,1,1], padding='VALID')+conv1_b)) + # 池化 + out_q = tf.nn.max_pool(out_q, [1,self.config.max_q_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') + out_q = tf.nn.tanh(out_q+pool_qb) + pool_q.append(out_q) + + out_ap = tf.nn.relu((tf.nn.conv2d(h_ap, conv2_W, [1,1,1,1], padding='VALID')+conv2_b)) + out_ap = tf.nn.max_pool(out_ap, [1,self.config.max_a_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') + out_ap = tf.nn.tanh(out_ap+pool_ab) + pool_ap.append(out_ap) + + out_am = tf.nn.relu((tf.nn.conv2d(h_am, conv2_W, [1,1,1,1], padding='VALID')+conv2_b)) + out_am = tf.nn.max_pool(out_am, [1,self.config.max_a_length-filter_size+1,1,1], [1,1,1,1], padding='VALID') + out_am = tf.nn.tanh(out_am+pool_ab) + pool_am.append(out_am) + + # 加入正则项 + tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(conv1_W)) + tf.add_to_collection('total_loss', 0.5*self.config.l2_reg_lambda*tf.nn.l2_loss(conv2_W)) + + total_channels = len(self.config.filter_sizes)*self.config.num_filters + real_pool_q = tf.reshape(tf.concat(pool_q, 3), [-1, total_channels]) + real_pool_ap = tf.reshape(tf.concat(pool_ap, 3), [-1, total_channels]) + real_pool_am = tf.reshape(tf.concat(pool_am, 3), [-1, total_channels]) + + return real_pool_q, real_pool_ap, real_pool_am + + # 计算cosine + def calc_cosine(self, real_pool_q, real_pool_ap, real_pool_am): + normalized_q_h_pool = tf.nn.l2_normalize(real_pool_q, dim=1) + normalized_pos_h_pool = tf.nn.l2_normalize(real_pool_ap, dim=1) + normalized_neg_h_pool = tf.nn.l2_normalize(real_pool_am, dim=1) + q_ap_cosine = tf.reduce_sum(tf.multiply(normalized_q_h_pool, normalized_pos_h_pool), 1) + 
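# --- Illustrative sketch (not part of the original patch) ------------------
# calc_cosine() l2-normalises the pooled question/answer vectors and takes
# their inner product, which is exactly the cosine similarity. The two scores
# (q vs. positive answer above, q vs. negative answer below) then feed the
# pairwise hinge loss max(0, m - cos(q, a+) + cos(q, a-)) built in
# add_loss_op(). A NumPy equivalent, with helper names and the margin value
# invented only for this sketch:
import numpy as np

def cosine_rows(x, y, eps=1e-12):
    # row-wise cosine similarity for two batches of vectors, shape [batch, dim]
    x = x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)
    y = y / (np.linalg.norm(y, axis=1, keepdims=True) + eps)
    return np.sum(x * y, axis=1)  # shape [batch]

def pairwise_hinge_loss(cos_pos, cos_neg, margin=0.05):
    # max(0, margin - cos(q, a+) + cos(q, a-)), summed over the batch
    # (add_loss_op() sums the per-sample losses with tf.reduce_sum)
    return np.maximum(0.0, margin - cos_pos + cos_neg).sum()
# ----------------------------------------------------------------------------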
q_am_cosine = tf.reduce_sum(tf.multiply(normalized_q_h_pool, normalized_neg_h_pool), 1) + + return q_ap_cosine, q_am_cosine + + # 损失节点 + def add_loss_op(self, q_ap_cosine, q_am_cosine): + original_loss = self.config.m - q_ap_cosine + q_am_cosine + l = tf.maximum(tf.zeros_like(original_loss), original_loss) + loss = tf.reduce_sum(l) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + accu = tf.reduce_mean(tf.cast(tf.equal(0., l), tf.float32)) + return total_loss, loss, accu + + # 训练节点 + def add_train_op(self, loss): + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + train_op = opt.minimize(loss, self.global_step) + return train_op \ No newline at end of file diff --git a/code/qacnn.py b/qacnn/qacnn.py similarity index 96% rename from code/qacnn.py rename to qacnn/qacnn.py index f4a4bf8..f25550c 100755 --- a/code/qacnn.py +++ b/qacnn/qacnn.py @@ -7,14 +7,13 @@ reload(sys) sys.stdout = stdout -os.environ["CUDA_VISIBLE_DEVICES"] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" import cPickle as pkl - from utils import * - from models import QACNN + class QACNNConfig(object): def __init__(self, vocab_size, embeddings=None): # 输入问题(句子)长度 @@ -49,8 +48,6 @@ def __init__(self, vocab_size, embeddings=None): self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 - - def train(train_corpus, config, val_corpus, eval_train_corpus=None): iterator = Iterator(train_corpus) @@ -73,7 +70,7 @@ def train(train_corpus, config, val_corpus, eval_train_corpus=None): count += 1 if count % 10 == 0: print('[epoch {}, batch {}]Loss:{}, Accuracy:{}'.format(epoch, count, loss, accu)) - saver.save(sess,'models/qacnn/my_model', global_step=epoch) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) if eval_train_corpus is not None: train_res = evaluate(sess, model, eval_train_corpus, config) print('[train] ' + train_res) @@ -82,7 +79,6 @@ def train(train_corpus, config, val_corpus, eval_train_corpus=None): print('[eval] ' + val_res) - def evaluate(sess, model, corpus, config): iterator = Iterator(corpus) @@ -122,7 +118,7 @@ def test(corpus, config): with tf.Session(config=config.cf) as sess: model = QACNN(config) saver = tf.train.Saver() - saver.restore(sess, tf.train.latest_checkpoint('models/qacnn')) + saver.restore(sess, tf.train.latest_checkpoint(model_path)) print('[test] ' + evaluate(sess, model, corpus, config)) @@ -178,6 +174,7 @@ def main(args): raw_data_path = '../data/WikiQA/raw' processed_data_path = '../data/WikiQA/processed' embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: word2id, id2word = pkl.load(fr) diff --git a/qacnn/run.sh b/qacnn/run.sh new file mode 100755 index 0000000..10734f6 --- /dev/null +++ b/qacnn/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python qacnn.py --train + + +echo "test model" +python qacnn.py --test diff --git a/qacnn/utils.py b/qacnn/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/qacnn/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in zip(qids, 
aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. * pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part diff --git a/seq_match_seq/README.me b/seq_match_seq/README.me new file mode 100755 index 0000000..09a0923 --- /dev/null +++ b/seq_match_seq/README.me @@ -0,0 +1,23 @@ +# 复现《A COMPARE-AGGREGATE MODEL FOR MATCHING TEXT SEQUENCES》中的模型完成问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. 
+python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/seq_match_seq/models.py b/seq_match_seq/models.py new file mode 100755 index 0000000..78c5687 --- /dev/null +++ b/seq_match_seq/models.py @@ -0,0 +1,181 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + + +class SeqMatchSeq(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + # 上下文编码 + q_encode, a_encode = self.context_encoding(q_embed, a_embed) + # attention层 + h_a = self.attend(q_encode, a_encode) + # compose层 + t = self.compare(a_encode, h_a) + # aggregate层 + agg_out = self.aggregate(t) + pred = self.soft_out(agg_out) + # 预测概率分布与损失 + self.y_hat, self.total_loss = self.add_loss_op(pred) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.int32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, + name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', + shape=[self.config.vocab_size, self.config.embedding_size], + initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + return q_embed, a_embed + + def context_encoding(self, q, a): + """ + q: [batch_size, q_length, embedding_dim] + a: [batch_size, a_length, embedding_dim] + """ + with tf.variable_scope('context_encoding') as scope: + q_encode = self.proj_layer(q, 'proj_layer', reuse=None) + a_encode = self.proj_layer(a, 'proj_layer', reuse=True) + return q_encode, a_encode + + + def attend(self, q, a): + """ + q: [batch_size, q_length, represent_dim] + a: [batch_size, a_length, represent_dim] + """ + q_proj = self.mlp(q, self.config.mem_dim, 1, None, + 'att_q_proj', reuse=None) + # [batch_size, q_length, a_length] + att_inner_product = tf.matmul(q_proj, tf.transpose(a, (0, 2, 1))) + # [batch_size, a_length, q_length] + q_weights = tf.nn.softmax( + tf.transpose( + att_inner_product, (0, 2, 1)), dim=-1) + output_a = tf.matmul(q_weights, q) + return output_a + + def compare(self, a, h_a): + """ + a: [batch_size, a_length, mem_dim] + a_att: [batch_size, a_length, mem_dim] + """ + if self.config.comp_type == 'mul': + out = a * h_a + else: + raise ValueError('{} method is not implemented!'.format( + self.config.comp_type)) + + return out + + def aggregate(self, t): + """ + t: [batch_size, a_length, mem_dim] + """ + pool_t = [] + for i, filter_size in enumerate(self.config.filter_sizes): + with tf.variable_scope('filter{}'.format(filter_size)): + # 卷积 + out_t = tf.layers.Conv1D(self.config.cov_dim, + filter_size, + strides=1, + padding='valid', + activation=tf.nn.relu, name='conv')(t) + # 池化 + out_t = tf.layers.MaxPooling1D( + self.config.max_a_length - filter_size + 1, + 1, name='max_pool')(out_t) + out_t = tf.reshape(out_t, + (tf.shape(out_t)[0], out_t.get_shape().as_list()[2])) + pool_t.append(out_t) + # [batch_size, n * mem_dim] + out = 
tf.concat(pool_t, axis=-1) + # [batch_size, mem_dim] + out = self.mlp(out, self.config.mem_dim, 1, + tf.nn.tanh, 'pre_out', use_dropout=False, reuse=None) + return out + + def soft_out(self, x): + out = self.mlp(x, 2, 1, None, + 'soft_out', use_dropout=False, reuse=None) + return out + + def mlp(self, bottom, size, layer_num, activation, name, use_dropout=True, reuse=None): + """ + bottom: 上层输入 + size: 神经元大小 + layer_num: 神经网络层数 + name: mlp的名称 + reuse: 是否复用层 + """ + now = bottom + if use_dropout: + now = tf.nn.dropout(now, keep_prob=self.keep_prob) + for i in xrange(layer_num): + now = tf.layers.dense(now, size, + activation=activation, + name=name + '_{}'.format(i), + reuse=reuse) + return now + + def proj_layer(self, seq, name, reuse=None): + out1 = self.mlp(seq, self.config.mem_dim, 1, + tf.nn.sigmoid, name + '_sigmoid', reuse=reuse) + out2 = self.mlp(seq, self.config.mem_dim, 1, + tf.nn.tanh, name + '_tanh', reuse=reuse) + out = out1 * out2 + return out + + def add_loss_op(self, pred): + """ + 损失节点 + """ + # [batch_size, 2] + y_hat = tf.nn.softmax(pred, dim=-1) + loss = tf.reduce_mean( + tf.losses.sparse_softmax_cross_entropy(self.y, pred)) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return y_hat, total_loss + + def add_train_op(self, loss): + """ + 训练节点 + """ + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, + name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + # train_op = opt.minimize(loss, self.global_step) + train_variables = tf.trainable_variables() + grads_vars = opt.compute_gradients(loss, train_variables) + for i, (grad, var) in enumerate(grads_vars): + grads_vars[i] = ( + tf.clip_by_norm(grad, self.config.grad_clip), var) + train_op = opt.apply_gradients( + grads_vars, global_step=self.global_step) + return train_op diff --git a/seq_match_seq/run.sh b/seq_match_seq/run.sh new file mode 100755 index 0000000..061e6ff --- /dev/null +++ b/seq_match_seq/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python seq_match_seq.py --train + + +echo "test model" +python seq_match_seq.py --test diff --git a/seq_match_seq/seq_match_seq.py b/seq_match_seq/seq_match_seq.py new file mode 100755 index 0000000..3d4549d --- /dev/null +++ b/seq_match_seq/seq_match_seq.py @@ -0,0 +1,175 @@ +# -*- encoding:utf8 -*- +import tensorflow as tf +import numpy as np +import os +import sys +from copy import deepcopy +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import cPickle as pkl +from utils import * +from models import SeqMatchSeq + + +class SeqMatchSeqConfig(object): + def __init__(self, vocab_size, embeddings=None): + # 输入问题(句子)长度 + self.max_q_length = 200 + # 输入答案长度 + self.max_a_length = 200 + # 循环数 + self.num_epochs = 100 + # batch大小 + self.batch_size = 128 + # 词表大小 + self.vocab_size = vocab_size + # 词向量大小 + self.embeddings = embeddings + self.embedding_size = 100 + if self.embeddings is not None: + self.embedding_size = embeddings.shape[1] + # keep_prob=1-dropout + self.keep_prob = 0.6 + # 学习率 + self.lr = 0.0003 + self.grad_clip = 1 + + self.reg = 0 + self.mem_dim = 128 + self.cov_dim = 128 + self.filter_sizes = [2, 3, 4, 5] + self.comp_type = 'mul' + + self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 + + +def train(train_corpus, config, val_corpus, eval_train_corpus=None): + iterator = Iterator(train_corpus) + + with 
tf.Session(config=config.cf) as sess: + model = SeqMatchSeq(config) + saver = tf.train.Saver() + sess.run(tf.initialize_all_variables()) + for epoch in xrange(config.num_epochs): + count = 0 + for batch_x in iterator.next(config.batch_size, shuffle=True): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + labels = np.asarray(labels).astype(np.int32) + _, loss = sess.run([model.train_op, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:config.keep_prob}) + count += 1 + if count % 10 == 0: + print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) + if eval_train_corpus is not None: + train_res = evaluate(sess, model, eval_train_corpus, config) + print('[train] ' + train_res) + if val_corpus is not None: + val_res = evaluate(sess, model, val_corpus, config) + print('[eval] ' + val_res) + + +def evaluate(sess, model, corpus, config): + iterator = Iterator(corpus) + + count = 0 + total_qids = [] + total_aids = [] + total_pred = [] + total_labels = [] + total_loss = 0. + for batch_x in iterator.next(config.batch_size, shuffle=False): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + y_hat, loss = sess.run([model.y_hat, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:1.}) + y_hat = np.argmax(y_hat, axis=-1) + total_loss += loss + count += 1 + total_qids.append(batch_qids) + total_aids.append(batch_aids) + total_pred.append(y_hat) + total_labels.append(labels) + # print(batch_qids[0], [id2word[_] for _ in batch_q[0]], + # batch_aids[0], [id2word[_] for _ in batch_ap[0]]) + total_qids = np.concatenate(total_qids, axis=0) + total_aids = np.concatenate(total_aids, axis=0) + total_pred = np.concatenate(total_pred, axis=0) + total_labels = np.concatenate(total_labels, axis=0) + MAP, MRR = eval_map_mrr(total_qids, total_aids, total_pred, total_labels) + # print('Eval loss:{}'.format(total_loss / count)) + return 'MAP:{}, MRR:{}'.format(MAP, MRR) + + +def test(corpus, config): + with tf.Session(config=config.cf) as sess: + model = SeqMatchSeq(config) + saver = tf.train.Saver() + saver.restore(sess, tf.train.latest_checkpoint(model_path)) + print('[test] ' + evaluate(sess, model, corpus, config)) + + +def main(args): + max_q_length = 30 + max_a_length = 100 + + with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'r') as fr: + train_corpus, val_corpus, test_corpus = pkl.load(fr) + + embeddings = build_embedding(embedding_path, word2id) + + train_qids, train_q, train_aids, train_ap, train_labels = zip(*train_corpus) + train_q = padding(train_q, max_q_length) + train_ap = padding(train_ap, max_a_length) + train_corpus = zip(train_qids, train_q, train_aids, train_ap, train_labels) + + + val_qids, val_q, val_aids, val_ap, labels = zip(*val_corpus) + val_q = padding(val_q, max_q_length) + val_ap = padding(val_ap, max_a_length) + val_corpus = zip(val_qids, val_q, val_aids, val_ap, labels) + + + test_qids, test_q, test_aids, test_ap, labels = zip(*test_corpus) + test_q = padding(test_q, max_q_length) + test_ap = padding(test_ap, max_a_length) + test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) + + config = SeqMatchSeqConfig(max(word2id.values()) + 1, embeddings=embeddings) + config.max_q_length = max_q_length + 
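# --- Illustrative sketch (not part of the original patch) ------------------
# padding() in utils.py wraps tf.keras.preprocessing.sequence.pad_sequences
# with padding='post' and truncating='post': every id sequence used above is
# truncated or zero-padded at the END to exactly max_len, so the model always
# receives fixed-size [batch, max_q_length] / [batch, max_a_length] inputs.
# A plain-Python equivalent (pad_to is a name invented for this sketch):
def pad_to(seqs, max_len, pad_id=0):
    out = []
    for s in seqs:
        s = list(s)[:max_len]                          # cut off the tail
        out.append(s + [pad_id] * (max_len - len(s)))  # pad the tail with 0s
    return out

# e.g. pad_to([[3, 7, 9], [5]], 4) -> [[3, 7, 9, 0], [5, 0, 0, 0]]
# ----------------------------------------------------------------------------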
config.max_a_length = max_a_length + if args.train: + train(deepcopy(train_corpus), config, val_corpus, deepcopy(train_corpus)) + elif args.test: + test(test_corpus, config) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--train", help="whether to train", action='store_true') + parser.add_argument("--test", help="whether to test", action='store_true') + args = parser.parse_args() + + raw_data_path = '../data/WikiQA/raw' + processed_data_path = '../data/WikiQA/processed' + embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' + + with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: + word2id, id2word = pkl.load(fr) + main(args) diff --git a/seq_match_seq/utils.py b/seq_match_seq/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/seq_match_seq/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in zip(qids, aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. 
* pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part diff --git a/siamese_cnn/README.me b/siamese_cnn/README.me new file mode 100755 index 0000000..84f28b8 --- /dev/null +++ b/siamese_cnn/README.me @@ -0,0 +1,23 @@ +# 使用CNN网络实现问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. +python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/siamese_cnn/models.py b/siamese_cnn/models.py new file mode 100755 index 0000000..b584a40 --- /dev/null +++ b/siamese_cnn/models.py @@ -0,0 +1,118 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + + +class SiameseCNN(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + with tf.variable_scope('siamese') as scope: + self.q_trans = self.network(q_embed, reuse=False) + scope.reuse_variables() + self.a_trans = self.network(a_embed, reuse=True) + # 损失和精确度 + self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + # 输入 + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + # word embeddings + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) + a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) + return q_embed, a_embed + + def network(self, x, reuse=False): + # (batch_size, conv_size) + conv1 = self.conv_layer(x, reuse=reuse) + # (batch_size, hidden_size) + fc1 = self.fc_layer(conv1, self.config.hidden_size, "fc1") + ac1 = tf.nn.relu(fc1) + # (batch_size, output_size) + fc2 = self.fc_layer(ac1, self.config.output_size, "fc2") + return fc2 + + def fc_layer(self, bottom, n_weight, name): + assert len(bottom.get_shape()) == 2 + n_prev_weight = bottom.get_shape()[1] + initer = 
tf.truncated_normal_initializer(stddev=0.01) + W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) + b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.0, shape=[n_weight], dtype=tf.float32)) + fc = tf.nn.bias_add(tf.matmul(bottom, W), b) + return fc + + def conv_layer(self, h, reuse=False): + pool = list() + max_len = h.get_shape()[1] + h = tf.reshape(h, [-1, max_len, h.get_shape()[2], 1]) + for i, filter_size in enumerate(self.config.filter_sizes): + with tf.variable_scope('filter{}'.format(filter_size)): + conv1_W = tf.get_variable('conv_W', shape=[filter_size, self.config.embedding_size, 1, self.config.num_filters], initializer=tf.truncated_normal_initializer(.0, .01)) + conv1_b = tf.get_variable('conv_b', initializer=tf.constant(0.0, shape=[self.config.num_filters])) + # pooling层的bias,Q和A分开 + pool_b = tf.get_variable('pool_b', initializer=tf.constant(0.0, shape=[self.config.num_filters])) + # 卷积 + out = tf.nn.relu((tf.nn.conv2d(h, conv1_W, [1,1,1,1], padding='VALID')+conv1_b)) + # 池化 + out = tf.nn.max_pool(out, [1,max_len-filter_size+1,1,1], [1,1,1,1], padding='VALID') + out = tf.nn.tanh(out+pool_b) + pool.append(out) + # 加入正则项 + if not reuse: + tf.add_to_collection('total_loss', 0.5 * self.config.l2_reg_lambda * tf.nn.l2_loss(conv1_W)) + + total_channels = len(self.config.filter_sizes) * self.config.num_filters + real_pool = tf.reshape(tf.concat(pool, 3), [self.batch_size, total_channels]) + return real_pool + + # 损失节点 + def add_loss_op(self, o1, o2): + # 此处用cos距离 + norm_o1 = tf.nn.l2_normalize(o1, dim=1) + norm_o2 = tf.nn.l2_normalize(o2, dim=1) + self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) + + loss = self.contrastive_loss(self.q_a_cosine, self.y) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return total_loss + + def contrastive_loss(self, Ew, y): + l_1 = self.config.pos_weight * tf.square(1 - Ew) + l_0 = tf.square(tf.maximum(Ew, 0)) + loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) + return loss + + # 训练节点 + def add_train_op(self, loss): + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + train_op = opt.minimize(loss, self.global_step) + return train_op diff --git a/siamese_cnn/run.sh b/siamese_cnn/run.sh new file mode 100755 index 0000000..535c05c --- /dev/null +++ b/siamese_cnn/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python siamese_cnn.py --train + + +echo "test model" +python siamese_cnn.py --test diff --git a/code/siamese.py b/siamese_cnn/siamese_cnn.py similarity index 65% rename from code/siamese.py rename to siamese_cnn/siamese_cnn.py index 9626531..1c3dfe0 100755 --- a/code/siamese.py +++ b/siamese_cnn/siamese_cnn.py @@ -8,41 +8,11 @@ reload(sys) sys.stdout = stdout -os.environ["CUDA_VISIBLE_DEVICES"] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" import cPickle as pkl - from utils import * - -from models import SiameseNN, SiameseCNN, SiameseRNN - -class NNConfig(object): - def __init__(self, vocab_size, embeddings=None): - # 输入问题(句子)长度 - self.max_q_length = 200 - # 输入答案长度 - self.max_a_length = 200 - # 循环数 - self.num_epochs = 100 - # batch大小 - self.batch_size = 128 - # 词表大小 - self.vocab_size = vocab_size - self.hidden_size = 256 - self.output_size = 128 - self.keep_prob = 0.6 - # 词向量大小 - self.embeddings = embeddings - self.embedding_size = 100 - if self.embeddings is not None: - self.embedding_size = 
embeddings.shape[1] - # 学习率 - self.lr = 0.001 - # contrasive loss 中的 positive loss部分的权重 - self.pos_weight = 0.25 - - self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) - self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 +from models import SiameseCNN class CNNConfig(object): @@ -80,52 +50,12 @@ def __init__(self, vocab_size, embeddings=None): self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 -class RNNConfig(object): - def __init__(self, vocab_size, embeddings=None): - # 输入问题(句子)长度 - self.max_q_length = 200 - # 输入答案长度 - self.max_a_length = 200 - # 循环数 - self.num_epochs = 100 - # batch大小 - self.batch_size = 128 - # 词表大小 - self.vocab_size = vocab_size - # 词向量大小 - self.embeddings = embeddings - self.embedding_size = 100 - if self.embeddings is not None: - self.embedding_size = embeddings.shape[1] - # RNN单元类型和大小与堆叠层数 - self.cell_type = 'GRU' - self.rnn_size = 128 - self.layer_size = 2 - # 隐层大小 - self.hidden_size = 256 - self.output_size = 128 - # 每种filter的数量 - self.num_filters = 128 - self.keep_prob = 0.6 - # 学习率 - self.lr = 0.001 - # contrasive loss 中的 positive loss部分的权重 - self.pos_weight = 0.5 - - self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) - self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 - def train(train_corpus, config, val_corpus, eval_train_corpus=None): iterator = Iterator(train_corpus) with tf.Session(config=config.cf) as sess: - if args.model == 'NN': - model = SiameseNN(config) - elif args.model == 'CNN': - model = SiameseCNN(config) - else: - model = SiameseRNN(config) + model = SiameseCNN(config) saver = tf.train.Saver() sess.run(tf.initialize_all_variables()) for epoch in xrange(config.num_epochs): @@ -142,7 +72,7 @@ def train(train_corpus, config, val_corpus, eval_train_corpus=None): count += 1 if count % 10 == 0: print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) - saver.save(sess,'models/siamese_{}/my_model'.format(args.model.lower()), global_step=epoch) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) if eval_train_corpus is not None: train_res = evaluate(sess, model, eval_train_corpus, config) print('[train] ' + train_res) @@ -188,14 +118,9 @@ def evaluate(sess, model, corpus, config): def test(corpus, config): with tf.Session(config=config.cf) as sess: - if args.model == 'NN': - model = SiameseNN(config) - elif args.model == 'CNN': - model = SiameseCNN(config) - else: - model = SiameseRNN(config) + model = SiameseCNN(config) saver = tf.train.Saver() - saver.restore(sess, tf.train.latest_checkpoint('models/siamese_{}'.format(args.model))) + saver.restore(sess, tf.train.latest_checkpoint(model_path)) print('[test] ' + evaluate(sess, model, corpus, config)) @@ -225,12 +150,7 @@ def main(args): test_ap = padding(test_ap, max_a_length) test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) - if args.model == 'NN': - config = NNConfig(max(word2id.values()) + 1, embeddings=embeddings) - elif args.model == 'CNN': - config = CNNConfig(max(word2id.values()) + 1, embeddings=embeddings) - else: - config = RNNConfig(max(word2id.values()) + 1, embeddings=embeddings) + config = CNNConfig(max(word2id.values()) + 1, embeddings=embeddings) config.max_q_length = max_q_length config.max_a_length = max_a_length if args.train: @@ -244,12 +164,12 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument("--train", help="whether to train", action='store_true') parser.add_argument("--test", help="whether to test", action='store_true') - 
parser.add_argument("--model", help="choose models from nn, cnn, rnn", type=str, default='NN') args = parser.parse_args() raw_data_path = '../data/WikiQA/raw' processed_data_path = '../data/WikiQA/processed' embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: word2id, id2word = pkl.load(fr) diff --git a/siamese_cnn/utils.py b/siamese_cnn/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/siamese_cnn/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in zip(qids, aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. 
* pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part diff --git a/siamese_nn/._README.me b/siamese_nn/._README.me new file mode 100755 index 0000000..1cebd06 Binary files /dev/null and b/siamese_nn/._README.me differ diff --git a/siamese_nn/._run.sh b/siamese_nn/._run.sh new file mode 100755 index 0000000..77fd951 Binary files /dev/null and b/siamese_nn/._run.sh differ diff --git a/siamese_nn/README.me b/siamese_nn/README.me new file mode 100755 index 0000000..d1f5bb8 --- /dev/null +++ b/siamese_nn/README.me @@ -0,0 +1,23 @@ +# 使用NN网络实现问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. +python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/siamese_nn/models.py b/siamese_nn/models.py new file mode 100755 index 0000000..d818667 --- /dev/null +++ b/siamese_nn/models.py @@ -0,0 +1,98 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + +class SiameseNN(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + with tf.variable_scope('siamese') as scope: + self.q_trans = self.network(q_embed) + scope.reuse_variables() + self.a_trans = self.network(a_embed) + # 损失和精确度 + self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + # 输入 + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + # word embeddings + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) + a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) + return q_embed, a_embed + + def network(self, x): + # (batch_size * max_len, embed_size) + max_len = tf.shape(x)[1] + x = tf.reshape(x, (-1, x.get_shape()[-1])) + fc1 = self.fc_layer(x, self.config.hidden_size, "fc1") 
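+ # fc1/fc2 are applied token-wise to the flattened (batch*max_len, embed) matrix;
+ # the reshape + reduce_mean below average the token features into a single
+ # vector per sentence before the final projection fc3. Question and answer both
+ # pass through this network with shared weights (scope.reuse_variables() in __init__).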
+ ac1 = tf.nn.relu(fc1) + fc2 = self.fc_layer(ac1, self.config.hidden_size, "fc2") + ac2 = tf.nn.relu(fc2) + # (batch_size, max_len, embed_size) + ac3 = tf.reshape(ac2, (self.batch_size, max_len, ac2.get_shape()[1])) + # (batch_size, embed_size) + ac3 = tf.reduce_mean(ac3, axis=1) + fc3 = self.fc_layer(ac3, self.config.output_size, "fc3") + return fc3 + + def fc_layer(self, bottom, n_weight, name): + assert len(bottom.get_shape()) == 2 + n_prev_weight = bottom.get_shape()[1] + initer = tf.truncated_normal_initializer(stddev=0.01) + W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) + b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.01, shape=[n_weight], dtype=tf.float32)) + fc = tf.nn.bias_add(tf.matmul(bottom, W), b) + return fc + + # 损失节点 + def add_loss_op(self, o1, o2): + # 此处用cos距离 + norm_o1 = tf.nn.l2_normalize(o1, dim=1) + norm_o2 = tf.nn.l2_normalize(o2, dim=1) + self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) + + loss = self.contrastive_loss(self.q_a_cosine, self.y) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return total_loss + + def contrastive_loss(self, Ew, y): + l_1 = self.config.pos_weight * tf.square(1 - Ew) + l_0 = tf.square(tf.maximum(Ew, 0)) + loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) + return loss + + # 训练节点 + def add_train_op(self, loss): + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + train_op = opt.minimize(loss, self.global_step) + return train_op \ No newline at end of file diff --git a/siamese_nn/run.sh b/siamese_nn/run.sh new file mode 100755 index 0000000..40dc941 --- /dev/null +++ b/siamese_nn/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python siamese_nn.py --train + + +echo "test model" +python siamese_nn.py --test \ No newline at end of file diff --git a/siamese_nn/siamese_nn.py b/siamese_nn/siamese_nn.py new file mode 100755 index 0000000..365a3ab --- /dev/null +++ b/siamese_nn/siamese_nn.py @@ -0,0 +1,170 @@ +# -*- encoding:utf8 -*- +import tensorflow as tf +import numpy as np +import os +import sys +from copy import deepcopy +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import cPickle as pkl +from utils import * +from models import SiameseNN + + +class NNConfig(object): + def __init__(self, vocab_size, embeddings=None): + # 输入问题(句子)长度 + self.max_q_length = 200 + # 输入答案长度 + self.max_a_length = 200 + # 循环数 + self.num_epochs = 100 + # batch大小 + self.batch_size = 128 + # 词表大小 + self.vocab_size = vocab_size + self.hidden_size = 256 + self.output_size = 128 + self.keep_prob = 0.6 + # 词向量大小 + self.embeddings = embeddings + self.embedding_size = 100 + if self.embeddings is not None: + self.embedding_size = embeddings.shape[1] + # 学习率 + self.lr = 0.001 + # contrasive loss 中的 positive loss部分的权重 + self.pos_weight = 0.25 + + self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 + + +def train(train_corpus, config, val_corpus, eval_train_corpus=None): + iterator = Iterator(train_corpus) + if os.path.exists(model_path): + os.mkdir(model_path) + with tf.Session(config=config.cf) as sess: + model = SiameseNN(config) + saver = tf.train.Saver() + sess.run(tf.initialize_all_variables()) + for epoch in xrange(config.num_epochs): + count = 0 + for batch_x in 
iterator.next(config.batch_size, shuffle=True): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + _, loss = sess.run([model.train_op, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:config.keep_prob}) + count += 1 + if count % 10 == 0: + print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) + if eval_train_corpus is not None: + train_res = evaluate(sess, model, eval_train_corpus, config) + print('[train] ' + train_res) + if val_corpus is not None: + val_res = evaluate(sess, model, val_corpus, config) + print('[eval] ' + val_res) + + +def evaluate(sess, model, corpus, config): + iterator = Iterator(corpus) + + count = 0 + total_qids = [] + total_aids = [] + total_pred = [] + total_labels = [] + total_loss = 0. + for batch_x in iterator.next(config.batch_size, shuffle=False): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + q_ap_cosine, loss = sess.run([model.q_a_cosine, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:1.}) + total_loss += loss + count += 1 + total_qids.append(batch_qids) + total_aids.append(batch_aids) + total_pred.append(q_ap_cosine) + total_labels.append(labels) + # print(batch_qids[0], [id2word[_] for _ in batch_q[0]], + # batch_aids[0], [id2word[_] for _ in batch_ap[0]]) + total_qids = np.concatenate(total_qids, axis=0) + total_aids = np.concatenate(total_aids, axis=0) + total_pred = np.concatenate(total_pred, axis=0) + total_labels = np.concatenate(total_labels, axis=0) + MAP, MRR = eval_map_mrr(total_qids, total_aids, total_pred, total_labels) + # print('Eval loss:{}'.format(total_loss / count)) + return 'MAP:{}, MRR:{}'.format(MAP, MRR) + + +def test(corpus, config): + with tf.Session(config=config.cf) as sess: + model = SiameseNN(config) + saver = tf.train.Saver() + saver.restore(sess, tf.train.latest_checkpoint(model_path)) + print('[test] ' + evaluate(sess, model, corpus, config)) + + +def main(args): + max_q_length = 25 + max_a_length = 90 + + with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'r') as fr: + train_corpus, val_corpus, test_corpus = pkl.load(fr) + + embeddings = build_embedding(embedding_path, word2id) + + train_qids, train_q, train_aids, train_ap, train_labels = zip(*train_corpus) + train_q = padding(train_q, max_q_length) + train_ap = padding(train_ap, max_a_length) + train_corpus = zip(train_qids, train_q, train_aids, train_ap, train_labels) + + + val_qids, val_q, val_aids, val_ap, labels = zip(*val_corpus) + val_q = padding(val_q, max_q_length) + val_ap = padding(val_ap, max_a_length) + val_corpus = zip(val_qids, val_q, val_aids, val_ap, labels) + + + test_qids, test_q, test_aids, test_ap, labels = zip(*test_corpus) + test_q = padding(test_q, max_q_length) + test_ap = padding(test_ap, max_a_length) + test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) + + config = NNConfig(max(word2id.values()) + 1, embeddings=embeddings) + config.max_q_length = max_q_length + config.max_a_length = max_a_length + if args.train: + train(deepcopy(train_corpus), config, val_corpus, deepcopy(train_corpus)) + elif args.test: + test(test_corpus, config) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--train", 
help="whether to train", action='store_true') + parser.add_argument("--test", help="whether to test", action='store_true') + args = parser.parse_args() + + model_path = 'models' + raw_data_path = '../data/WikiQA/raw' + processed_data_path = '../data/WikiQA/processed' + embedding_path = '../data/embedding/glove.6B.300d.txt' + + with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: + word2id, id2word = pkl.load(fr) + main(args) diff --git a/siamese_nn/utils.py b/siamese_nn/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/siamese_nn/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in zip(qids, aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. 
* pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part diff --git a/siamese_rnn/README.me b/siamese_rnn/README.me new file mode 100755 index 0000000..0982db0 --- /dev/null +++ b/siamese_rnn/README.me @@ -0,0 +1,23 @@ +# 使用RNN网络实现问答任务 + +## 准备 + +#### 下载词向量文件[glove](../download.sh)。 + +``` +cd .. +bash download.sh +``` + +#### 预处理wiki数据 + +``` +cd .. +python preprocess_wiki.py +``` + +## 运行 + +``` +bash run.sh +``` diff --git a/siamese_rnn/models.py b/siamese_rnn/models.py new file mode 100755 index 0000000..918eb9a --- /dev/null +++ b/siamese_rnn/models.py @@ -0,0 +1,137 @@ +# -*- encoding:utf-8 -*- +import tensorflow as tf +import numpy as np + + +class SiameseRNN(object): + def __init__(self, config): + self.config = config + # 输入 + self.add_placeholders() + # [batch_size, sequence_size, embed_size] + q_embed, a_embed = self.add_embeddings() + with tf.variable_scope('siamese') as scope: + self.q_trans = self.network(q_embed) + tf.get_variable_scope().reuse_variables() + self.a_trans = self.network(a_embed) + # 损失和精确度 + self.total_loss = self.add_loss_op(self.q_trans, self.a_trans) + # 训练节点 + self.train_op = self.add_train_op(self.total_loss) + + # 输入 + def add_placeholders(self): + # 问题 + self.q = tf.placeholder(tf.int32, + shape=[None, self.config.max_q_length], + name='Question') + # 回答 + self.a = tf.placeholder(tf.int32, + shape=[None, self.config.max_a_length], + name='Ans') + self.y = tf.placeholder(tf.float32, shape=[None, ], name='label') + # drop_out + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + self.batch_size = tf.shape(self.q)[0] + + # word embeddings + def add_embeddings(self): + with tf.variable_scope('embedding'): + if self.config.embeddings is not None: + embeddings = tf.Variable(self.config.embeddings, name="embeddings", trainable=False) + else: + embeddings = tf.get_variable('embeddings', shape=[self.config.vocab_size, self.config.embedding_size], initializer=tf.uniform_unit_scaling_initializer()) + q_embed = tf.nn.embedding_lookup(embeddings, self.q) + a_embed = tf.nn.embedding_lookup(embeddings, self.a) + q_embed = tf.nn.dropout(q_embed, keep_prob=self.keep_prob) + a_embed = tf.nn.dropout(a_embed, keep_prob=self.keep_prob) + return q_embed, a_embed + + def network(self, x): + sequence_length = x.get_shape()[1] + # (batch_size, time_step, embed_size) -> (time_step, batch_size, embed_size) + inputs = tf.transpose(x, [1, 0, 2]) + inputs = tf.reshape(inputs, [-1, self.config.embedding_size]) + inputs = tf.split(inputs, sequence_length, 0) + # (batch_size, rnn_output_size) + rnn1 = self.rnn_layer(inputs) + # (batch_size, hidden_size) + fc1 = self.fc_layer(rnn1, self.config.hidden_size, "fc1") + ac1 = tf.nn.relu(fc1) + # 
(batch_size, output_size) + fc2 = self.fc_layer(ac1, self.config.output_size, "fc2") + return fc2 + + def fc_layer(self, bottom, n_weight, name): + assert len(bottom.get_shape()) == 2 + n_prev_weight = bottom.get_shape()[1] + initer = tf.truncated_normal_initializer(stddev=0.01) + W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer) + b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.01, shape=[n_weight], dtype=tf.float32)) + fc = tf.nn.bias_add(tf.matmul(bottom, W), b) + return fc + + def rnn_layer(self, h): + if self.config.cell_type == 'lstm': + birnn_fw, birnn_bw = self.bi_lstm(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) + else: + birnn_fw, birnn_bw = self.bi_gru(self.config.rnn_size, self.config.layer_size, self.config.keep_prob) + outputs_x1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(birnn_fw, birnn_bw, h, dtype=tf.float32) + # (time_step, batch_size, 2*rnn_size) -> (batch_size, 2*rnn_size) + output_x1 = tf.reduce_mean(outputs_x1, 0) + return output_x1 + + def bi_lstm(self, rnn_size, layer_size, keep_prob): + + # forward rnn + with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): + lstm_fw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] + lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) + + # backward rnn + with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): + lstm_bw_cell_list = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in xrange(layer_size)] + lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list), output_keep_prob=keep_prob) + + return lstm_fw_cell_m, lstm_bw_cell_m + + def bi_gru(self, rnn_size, layer_size, keep_prob): + + # forward rnn + with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'): + gru_fw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] + gru_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_fw_cell_list), output_keep_prob=keep_prob) + + # backward rnn + with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'): + gru_bw_cell_list = [tf.contrib.rnn.GRUCell(rnn_size) for _ in xrange(layer_size)] + gru_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(gru_bw_cell_list), output_keep_prob=keep_prob) + + return gru_fw_cell_m, gru_bw_cell_m + + # 损失节点 + def add_loss_op(self, o1, o2): + # 此处用cos距离 + norm_o1 = tf.nn.l2_normalize(o1, dim=1) + norm_o2 = tf.nn.l2_normalize(o2, dim=1) + self.q_a_cosine = tf.reduce_sum(tf.multiply(o1, o2), 1) + + loss = self.contrastive_loss(self.q_a_cosine, self.y) + tf.add_to_collection('total_loss', loss) + total_loss = tf.add_n(tf.get_collection('total_loss')) + return total_loss + + def contrastive_loss(self, Ew, y): + l_1 = self.config.pos_weight * tf.square(1 - Ew) + l_0 = tf.square(tf.maximum(Ew, 0)) + loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0) + return loss + + # 训练节点 + def add_train_op(self, loss): + with tf.name_scope('train_op'): + # 记录训练步骤 + self.global_step = tf.Variable(0, name='global_step', trainable=False) + opt = tf.train.AdamOptimizer(self.config.lr) + train_op = opt.minimize(loss, self.global_step) + return train_op diff --git a/siamese_rnn/run.sh b/siamese_rnn/run.sh new file mode 100755 index 0000000..082d607 --- /dev/null +++ b/siamese_rnn/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + + +echo "train model" +python siamese_rnn.py --train + + +echo "test model" +python siamese_rnn.py --test diff --git 
a/siamese_rnn/siamese_rnn.py b/siamese_rnn/siamese_rnn.py new file mode 100755 index 0000000..eb35259 --- /dev/null +++ b/siamese_rnn/siamese_rnn.py @@ -0,0 +1,176 @@ +# -*- encoding:utf8 -*- +import tensorflow as tf +import numpy as np +import os +import sys +from copy import deepcopy +stdout = sys.stdout +reload(sys) +sys.stdout = stdout + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import cPickle as pkl +from utils import * +from models import SiameseRNN + + +class RNNConfig(object): + def __init__(self, vocab_size, embeddings=None): + # 输入问题(句子)长度 + self.max_q_length = 200 + # 输入答案长度 + self.max_a_length = 200 + # 循环数 + self.num_epochs = 100 + # batch大小 + self.batch_size = 128 + # 词表大小 + self.vocab_size = vocab_size + # 词向量大小 + self.embeddings = embeddings + self.embedding_size = 100 + if self.embeddings is not None: + self.embedding_size = embeddings.shape[1] + # RNN单元类型和大小与堆叠层数 + self.cell_type = 'GRU' + self.rnn_size = 128 + self.layer_size = 2 + # 隐层大小 + self.hidden_size = 256 + self.output_size = 128 + # 每种filter的数量 + self.num_filters = 128 + self.keep_prob = 0.6 + # 学习率 + self.lr = 0.001 + # contrasive loss 中的 positive loss部分的权重 + self.pos_weight = 0.5 + + self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2 + + +def train(train_corpus, config, val_corpus, eval_train_corpus=None): + iterator = Iterator(train_corpus) + + with tf.Session(config=config.cf) as sess: + model = SiameseRNN(config) + saver = tf.train.Saver() + sess.run(tf.initialize_all_variables()) + for epoch in xrange(config.num_epochs): + count = 0 + for batch_x in iterator.next(config.batch_size, shuffle=True): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + _, loss = sess.run([model.train_op, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:config.keep_prob}) + count += 1 + if count % 10 == 0: + print('[epoch {}, batch {}]Loss:{}'.format(epoch, count, loss)) + saver.save(sess,'{}/my_model'.format(model_path), global_step=epoch) + if eval_train_corpus is not None: + train_res = evaluate(sess, model, eval_train_corpus, config) + print('[train] ' + train_res) + if val_corpus is not None: + val_res = evaluate(sess, model, val_corpus, config) + print('[eval] ' + val_res) + + +def evaluate(sess, model, corpus, config): + iterator = Iterator(corpus) + + count = 0 + total_qids = [] + total_aids = [] + total_pred = [] + total_labels = [] + total_loss = 0. 
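+ # Evaluation feeds keep_prob=1.0 (dropout off) and ranks answers by the
+ # q_a_cosine score, which models.py computes as a dot product of the two
+ # sentence vectors (the l2-normalized copies are created but not used).
+ # Per-batch qids/aids/scores/labels are concatenated below and passed to
+ # eval_map_mrr to obtain MAP and MRR per question.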
+ for batch_x in iterator.next(config.batch_size, shuffle=False): + batch_qids, batch_q, batch_aids, batch_ap, labels = zip(*batch_x) + batch_q = np.asarray(batch_q) + batch_ap = np.asarray(batch_ap) + q_ap_cosine, loss = sess.run([model.q_a_cosine, model.total_loss], + feed_dict={model.q:batch_q, + model.a:batch_ap, + model.y:labels, + model.keep_prob:1.}) + total_loss += loss + count += 1 + total_qids.append(batch_qids) + total_aids.append(batch_aids) + total_pred.append(q_ap_cosine) + total_labels.append(labels) + # print(batch_qids[0], [id2word[_] for _ in batch_q[0]], + # batch_aids[0], [id2word[_] for _ in batch_ap[0]]) + total_qids = np.concatenate(total_qids, axis=0) + total_aids = np.concatenate(total_aids, axis=0) + total_pred = np.concatenate(total_pred, axis=0) + total_labels = np.concatenate(total_labels, axis=0) + MAP, MRR = eval_map_mrr(total_qids, total_aids, total_pred, total_labels) + # print('Eval loss:{}'.format(total_loss / count)) + return 'MAP:{}, MRR:{}'.format(MAP, MRR) + + +def test(corpus, config): + with tf.Session(config=config.cf) as sess: + model = SiameseRNN(config) + saver = tf.train.Saver() + saver.restore(sess, tf.train.latest_checkpoint(model_path)) + print('[test] ' + evaluate(sess, model, corpus, config)) + + +def main(args): + max_q_length = 25 + max_a_length = 90 + + with open(os.path.join(processed_data_path, 'pointwise_corpus.pkl'), 'r') as fr: + train_corpus, val_corpus, test_corpus = pkl.load(fr) + + embeddings = build_embedding(embedding_path, word2id) + + train_qids, train_q, train_aids, train_ap, train_labels = zip(*train_corpus) + train_q = padding(train_q, max_q_length) + train_ap = padding(train_ap, max_a_length) + train_corpus = zip(train_qids, train_q, train_aids, train_ap, train_labels) + + + val_qids, val_q, val_aids, val_ap, labels = zip(*val_corpus) + val_q = padding(val_q, max_q_length) + val_ap = padding(val_ap, max_a_length) + val_corpus = zip(val_qids, val_q, val_aids, val_ap, labels) + + + test_qids, test_q, test_aids, test_ap, labels = zip(*test_corpus) + test_q = padding(test_q, max_q_length) + test_ap = padding(test_ap, max_a_length) + test_corpus = zip(test_qids, test_q, test_aids, test_ap, labels) + + config = RNNConfig(max(word2id.values()) + 1, embeddings=embeddings) + config.max_q_length = max_q_length + config.max_a_length = max_a_length + if args.train: + train(deepcopy(train_corpus), config, val_corpus, deepcopy(train_corpus)) + elif args.test: + test(test_corpus, config) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--train", help="whether to train", action='store_true') + parser.add_argument("--test", help="whether to test", action='store_true') + args = parser.parse_args() + + raw_data_path = '../data/WikiQA/raw' + processed_data_path = '../data/WikiQA/processed' + embedding_path = '../data/embedding/glove.6B.300d.txt' + model_path = 'models' + + with open(os.path.join(processed_data_path, 'vocab.pkl'), 'r') as fr: + word2id, id2word = pkl.load(fr) + main(args) diff --git a/siamese_rnn/utils.py b/siamese_rnn/utils.py new file mode 100755 index 0000000..8b28136 --- /dev/null +++ b/siamese_rnn/utils.py @@ -0,0 +1,125 @@ +# -*- encoding:utf-8 -*- +import numpy as np +import tensorflow as tf + + +def padding(data, max_len): + return tf.keras.preprocessing.sequence.pad_sequences(data, max_len, padding='post', truncating='post') + +def eval_map_mrr(qids, aids, preds, labels): + # 衡量map指标和mrr指标 + dic = dict() + pre_dic = dict() + for qid, aid, pred, label in 
zip(qids, aids, preds, labels): + pre_dic.setdefault(qid, []) + pre_dic[qid].append([aid, pred, label]) + for qid in pre_dic: + dic[qid] = sorted(pre_dic[qid], key=lambda k: k[1], reverse=True) + aid2rank = {aid:[label, rank] for (rank, (aid, pred, label)) in enumerate(dic[qid])} + dic[qid] = aid2rank + # correct = 0 + # total = 0 + # for qid in dic: + # cur_correct = 0 + # for aid in dic[qid]: + # if dic[qid][aid][0] == 1: + # cur_correct += 1 + # if cur_correct > 0: + # correct += 1 + # total += 1 + # print(correct * 1. / total) + + MAP = 0.0 + MRR = 0.0 + useful_q_len = 0 + for q_id in dic: + sort_rank = sorted(dic[q_id].items(), key=lambda k: k[1][1], reverse=False) + correct = 0 + total = 0 + AP = 0.0 + mrr_mark = False + for i in range(len(sort_rank)): + if sort_rank[i][1][0] == 1: + correct += 1 + if correct == 0: + continue + useful_q_len += 1 + correct = 0 + for i in range(len(sort_rank)): + # compute MRR + if sort_rank[i][1][0] == 1 and mrr_mark == False: + MRR += 1.0 / float(i + 1) + mrr_mark = True + # compute MAP + total += 1 + if sort_rank[i][1][0] == 1: + correct += 1 + AP += float(correct) / float(total) + + AP /= float(correct) + MAP += AP + + MAP /= useful_q_len + MRR /= useful_q_len + return MAP, MRR + +def build_embedding(in_file, word_dict): + # 构建预训练的embedding矩阵 + num_words = max(word_dict.values()) + 1 + dim = int(in_file.split('.')[-2][:-1]) + embeddings = np.zeros((num_words, dim)) + + if in_file is not None: + pre_trained = 0 + initialized = {} + avg_sigma = 0 + avg_mu = 0 + for line in open(in_file).readlines(): + sp = line.split() + assert len(sp) == dim + 1 + if sp[0] in word_dict: + initialized[sp[0]] = True + pre_trained += 1 + embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]] + mu = embeddings[word_dict[sp[0]]].mean() + #print embeddings[word_dict[sp[0]]] + sigma = np.std(embeddings[word_dict[sp[0]]]) + avg_mu += mu + avg_sigma += sigma + avg_sigma /= 1. * pre_trained + avg_mu /= 1. * pre_trained + for w in word_dict: + if w not in initialized: + embeddings[word_dict[w]] = np.random.normal(avg_mu, avg_sigma, (dim,)) + print('Pre-trained: %d (%.2f%%)' % + (pre_trained, pre_trained * 100.0 / num_words)) + return embeddings.astype(np.float32) + + +class Iterator(object): + """ + 数据迭代器 + """ + def __init__(self, x): + self.x = x + self.sample_num = len(self.x) + + def next_batch(self, batch_size, shuffle=True): + # produce X, Y_out, Y_in, X_len, Y_in_len, Y_out_len + if shuffle: + np.random.shuffle(self.x) + l = np.random.randint(0, self.sample_num - batch_size + 1) + r = l + batch_size + x_part = self.x[l:r] + return x_part + + def next(self, batch_size, shuffle=False): + if shuffle: + np.random.shuffle(self.x) + l = 0 + while l < self.sample_num: + r = min(l + batch_size, self.sample_num) + batch_size = r - l + x_part = self.x[l:r] + l += batch_size + yield x_part
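For reference, the snippet below is a minimal, hypothetical usage sketch of the shared helpers added above (padding(), Iterator, eval_map_mrr()). The ids and scores are invented for illustration, and it assumes it is run next to one of the utils.py copies so the import resolves.

```
import numpy as np
from utils import padding, eval_map_mrr, Iterator

# Toy corpus: 2 questions with 2 candidate answers each, already id-encoded.
corpus = [
    # (qid, question_ids, aid, answer_ids, label)
    (0, [3, 7, 9], 0, [4, 4, 1, 8],    1),
    (0, [3, 7, 9], 1, [2, 5],          0),
    (1, [6, 2],    0, [9, 9, 9],       0),
    (1, [6, 2],    1, [1, 3, 5, 7, 2], 1),
]
qids, qs, aids, ans, labels = zip(*corpus)
qs = padding(qs, 5)    # (4, 5) array, zero-padded/truncated at the tail
ans = padding(ans, 6)  # (4, 6)

it = Iterator(list(zip(qids, qs, aids, ans, labels)))
for batch in it.next(2, shuffle=False):
    print(len(batch))  # prints 2 twice

# Fabricated model scores: higher means "more likely the right answer".
preds = np.array([0.9, 0.1, 0.8, 0.4])
MAP, MRR = eval_map_mrr(qids, aids, preds, labels)
print('MAP:{}, MRR:{}'.format(MAP, MRR))  # MAP:0.75, MRR:0.75
```

eval_map_mrr only depends on the relative order of the scores within each question, so any monotone rescaling of a model's outputs leaves MAP and MRR unchanged.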