# Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling

## 模型介绍

![](https://github.com/applenob/RNN-for-Joint-NLU/raw/master/res/arc.png)

形式化表达整理：

- 输入序列：$x = (x_1, \dots, x_T)$
- 输出序列：$y = (y_1, \dots, y_T)$，长度和$x$相同。
- Encoder：时刻$i$，
  - 隐藏状态：$h_i = [fh_i, bh_i]$，前向状态+后向状态。
- Decoder：时刻$i$，
  - 状态：$s_i$，$s_i = f(s_{i-1}, y_{i-1}, h_i, c_i)$
  - 其中，context向量：$c_i$，$c_i = \sum^{T}_{j=1}\alpha_{i,j}h_j$
  - attention参数：$\alpha_{i,j} = \frac{\exp(e_{i,j})}{\sum^T_{k=1}\exp(e_{i,k})}$
  - $e_{i,k} = g(s_{i-1}, h_k)$
  - $g$是一个小型神经网络。

In [None]:
import tensorflow as tf
import random
import numpy as np
import json
import numpy.ma as ma
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
# from data import *
# from my_metrics import *

* http://www.isca-speech.org/archive/Interspeech_2016/pdfs/1352.PDF
* https://arxiv.org/pdf/1409.0473.pdf

## 数据预处理

In [None]:
# Flatten a 2-D nested list into a 1-D list.
flatten = lambda l: [item for sublist in l for item in sublist]
# Map a sequence of indices back to slot tags / words through a lookup table.
index_seq2slot = lambda s, index2slot: [index2slot[i] for i in s]
index_seq2word = lambda s, index2word: [index2word[i] for i in s]

# Read the raw IOB files inside context managers so the file handles are
# closed as soon as the lines are loaded instead of being leaked.
with open("../input/iob-tagged-frames-dataset2/train2_w_i.iob", "r") as _train_file:
    train_data = _train_file.readlines()
# NOTE(review): the test split is read from the same path as the training
# split, so the evaluation below runs on training data -- confirm intended.
with open("../input/iob-tagged-frames-dataset2/train2_w_i.iob", "r") as _test_file:
    test_data = _test_file.readlines()
def data_pipeline(data, length=50):
    """Parse raw IOB lines into fixed-length (words, slot tags, intent) triples.

    A raw line has the form ``BOS w1 ... wn EOS<TAB>O t1 ... tn intent``:
    the sentence wrapped in BOS/EOS, a tab, then one slot tag per token
    (the first one belongs to BOS) followed by the intent label.

    Args:
        data: iterable of raw text lines (with or without a line terminator).
        length: target length of every returned sequence.

    Returns:
        A list of ``(words, tags, intent)`` tuples. ``words`` ends with
        ``'<EOS>'`` and is right-padded with ``'<PAD>'`` up to ``length``;
        ``tags`` is right-padded with ``'<PAD>'``. Sequences that are already
        ``length`` or longer are truncated and their last element is replaced
        by ``'<EOS>'``. Empty input yields an empty list.

    Raises:
        IndexError: if a non-blank line contains no tab separator.
    """
    samples = []
    for line in data:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat the final letter of the intent label on
        # a last line that has no trailing newline.
        line = line.rstrip("\r\n")
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)

        fields = line.split("\t")
        words = fields[0].split(" ")[1:-1]  # drop the BOS / EOS markers
        labels = fields[1].split(" ")
        intent = labels[-1]                 # last label is the intent
        tags = labels[1:-1]                 # drop the BOS tag and the intent

        # Input side: <EOS> terminator followed by <PAD> up to `length`.
        if len(words) < length:
            words.append('<EOS>')
            words.extend(['<PAD>'] * (length - len(words)))
        else:
            words = words[:length]
            words[-1] = '<EOS>'

        # Tag side: only <PAD> is appended; <EOS> appears on truncation only.
        if len(tags) < length:
            tags.extend(['<PAD>'] * (length - len(tags)))
        else:
            tags = tags[:length]
            tags[-1] = '<EOS>'

        samples.append((words, tags, intent))
    return samples
# Run both raw splits through the same preprocessing (default length of 50).
train_data_ed, test_data_ed = data_pipeline(train_data), data_pipeline(test_data)


In [None]:
# Show the first preprocessed training example: (padded words, padded slot tags, intent).
print(train_data_ed[0])

经过下面的`to_index`转换之后，每行的训练数据是：[加padding后的输入（下标序列），实际长度，加padding的标注（下标序列），intent下标]

In [None]:
def get_info_from_training_data(data):
    """Build the word / slot-tag / intent vocabularies from preprocessed data.

    As a side effect the raw input sequences, tag sequences and intents are
    dumped to 'iob ontology2.json', 'iob ontology3.json' and
    'iob ontology4.json' in the current working directory.

    Args:
        data: sequence of ``(words, tags, intent)`` triples.

    Returns:
        ``(word2index, index2word, tag2index, index2tag, intent2index,
        index2intent)``. The forward maps start with the reserved entries
        (``<PAD>``, ``<UNK>``, ...) and then number the remaining symbols in
        sorted order; the reverse maps are their inverses.
    """
    seq_in, seq_out, intent = zip(*data) if data else ((), (), ())

    # Sort the symbol sets: iterating a bare set of strings gives an order
    # that changes between interpreter runs, which would make the assigned
    # indices non-reproducible.
    vocab = sorted({token for seq in seq_in for token in seq})
    slot_tag = sorted({tag for seq in seq_out for tag in seq})
    intent_tag = sorted(set(intent))

    # Dump the raw columns for later inspection.
    with open('iob ontology2.json', 'w') as outfile:
        json.dump(seq_in, outfile)
    with open('iob ontology3.json', 'w') as outfile:
        json.dump(seq_out, outfile)
    with open('iob ontology4.json', 'w') as outfile:
        json.dump(intent, outfile)

    # word <-> index; reserved symbols keep their fixed ids.
    word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
    for token in vocab:
        word2index.setdefault(token, len(word2index))
    index2word = {v: k for k, v in word2index.items()}

    # slot tag <-> index
    tag2index = {'<PAD>': 0, '<UNK>': 1, "O": 2}
    for tag in slot_tag:
        tag2index.setdefault(tag, len(tag2index))
    index2tag = {v: k for k, v in tag2index.items()}

    # intent <-> index
    intent2index = {'<UNK>': 0}
    for label in intent_tag:
        intent2index.setdefault(label, len(intent2index))
    index2intent = {v: k for k, v in intent2index.items()}

    return word2index, index2word, tag2index, index2tag, intent2index, index2intent

# Build all six lookup tables from the preprocessed training split.
(word2index, index2word,
 slot2index, index2slot,
 intent2index, index2intent) = get_info_from_training_data(train_data_ed)



In [None]:
def to_index(train, word2index, slot2index, intent2index):
    """Convert symbolic examples into index form.

    Args:
        train: iterable of ``(words, tags, intent)`` triples; ``words`` must
            contain the ``"<EOS>"`` marker.
        word2index, slot2index, intent2index: lookup tables, each holding an
            ``"<UNK>"`` entry used for symbols that are not in the table.

    Returns:
        A list of ``[word_ids, true_length, tag_ids, intent_id]`` rows, where
        ``true_length`` is the position of ``"<EOS>"`` in ``words``.
    """
    def lookup(table, key):
        # Unknown symbols fall back to the table's <UNK> id.
        return table[key] if key in table else table["<UNK>"]

    indexed = []
    for words, tags, intent_label in train:
        word_ids = [lookup(word2index, word) for word in words]
        eos_position = words.index("<EOS>")
        tag_ids = [lookup(slot2index, tag) for tag in tags]
        intent_id = lookup(intent2index, intent_label)
        indexed.append([word_ids, eos_position, tag_ids, intent_id])
    return indexed
# Index both splits with the vocabularies built from the training split.
index_train, index_test = (
    to_index(split, word2index, slot2index, intent2index)
    for split in (train_data_ed, test_data_ed)
)


In [None]:
# Show the first indexed training example: [word ids, true length, slot ids, intent id].
print(index_train[0])

In [None]:
# Model / training hyper-parameters.
input_steps = 50     # padded sequence length fed to the graph placeholders
embedding_size = 64  # width of one word-embedding vector
hidden_size = 100    # units of each encoder LSTM direction
n_layers = 2         # NOTE(review): not referenced in the visible source
batch_size = 16      # fixed batch dimension of the placeholders
# Table sizes. The original hard-coded values are kept as lower bounds, but
# each size grows with the vocabulary actually built from the data, so that
# no token / tag / intent index can fall outside its table.
vocab_size = max(10000, len(word2index))
slot_size = max(122, len(slot2index))
intent_size = max(22, len(intent2index))
epoch_num = 5        # number of passes over the training split

## Modeling

模型实现。

### Tensorflow的动态rnn

`tf.nn.rnn creates an unrolled graph for a fixed RNN length. That means, if you call tf.nn.rnn with inputs having 200 time steps you are creating a static graph with 200 RNN steps. First, graph creation is slow. Second, you’re unable to pass in longer sequences (> 200) than you’ve originally specified. tf.nn.dynamic_rnn solves this. It uses a tf.While loop to dynamically construct the graph when it is executed. That means graph creation is faster and you can feed batches of variable size.`

摘自[Whats the difference between tensorflow dynamic_rnn and rnn?](https://stackoverflow.com/questions/39734146/whats-the-difference-between-tensorflow-dynamic-rnn-and-rnn)。也就是说，静态的rnn必须提前将图展开，在执行的时候，图是固定的，并且最大长度有限制。而动态rnn可以在执行的时候，将图循环地复用。


In [None]:
# Graph inputs. The encoder input is declared as [input_steps, batch_size]
# while the slot targets are declared as [batch_size, input_steps].
encoder_inputs = tf.placeholder(tf.int32, [input_steps, batch_size],
                                             name='encoder_inputs')
# Actual length of every input sentence, excluding padding
encoder_inputs_actual_length = tf.placeholder(tf.int32, [batch_size],
                                                   name='encoder_inputs_actual_length')
decoder_targets = tf.placeholder(tf.int32, [batch_size, input_steps],
                                      name='decoder_targets')
intent_targets = tf.placeholder(tf.int32, [batch_size],
                                     name='intent_targets')

### embedding

In [None]:
# Trainable embedding table of shape [vocab_size, embedding_size],
# initialised with uniform random values between -0.1 and 0.1.
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size],
                                                        -0.1, 0.1), dtype=tf.float32, name="embedding")

# Replace every token id of the encoder input by its embedding vector.
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

In [None]:
# Bare expression: the notebook displays the embedded-input tensor as cell output.
encoder_inputs_embedded

## Encoder

In [None]:
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple

In [None]:
# Use a single LSTM cell for each direction (forward and backward)
encoder_f_cell = LSTMCell(hidden_size)
encoder_b_cell = LSTMCell(hidden_size)

In [None]:
# Bidirectional encoder over the embedded input (time-major).
# Shapes of the four results below: T*B*D, T*B*D, B*D, B*D
(encoder_fw_outputs, encoder_bw_outputs), (encoder_fw_final_state, encoder_bw_final_state) = \
    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_f_cell,
                                    cell_bw=encoder_b_cell,
                                    inputs=encoder_inputs_embedded,
                                    sequence_length=encoder_inputs_actual_length,
                                    dtype=tf.float32, time_major=True)
# Concatenate forward and backward outputs along the feature axis.
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

# Concatenate the final cell states (c) of both directions.
encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

# Concatenate the final hidden states (h) of both directions.
encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

# Combined final state. In the visible source it is only mentioned in a
# commented-out `initial_state=` line inside decode().
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

In [None]:
# Print the encoder tensors to inspect their static shapes.
print("encoder_outputs: ", encoder_outputs)
print("encoder_outputs[0]: ", encoder_outputs[0])
print("encoder_final_state_c: ", encoder_final_state_c)

## Decoder

In [None]:
# The decoder runs for as many steps as the (unpadded) input has tokens.
decoder_lengths = encoder_inputs_actual_length

In [None]:
# Output projection parameters.
# NOTE(review): slot_W / slot_b are not referenced anywhere else in the
# visible source (decode() builds its own OutputProjectionWrapper) -- confirm
# whether they are still needed.
slot_W = tf.Variable(tf.random_uniform([hidden_size * 2, slot_size], -1, 1),
                             dtype=tf.float32, name="slot_W")
slot_b = tf.Variable(tf.zeros([slot_size]), dtype=tf.float32, name="slot_b")
# Projection from the concatenated final encoder hidden state to intent logits.
intent_W = tf.Variable(tf.random_uniform([hidden_size * 2, intent_size], -0.1, 0.1),
                               dtype=tf.float32, name="intent_W")
intent_b = tf.Variable(tf.zeros([intent_size]), dtype=tf.float32, name="intent_b")

In [None]:
# Compute the intent: project the final encoder hidden state to logits and
# take the arg-max class per batch element.
intent_logits = tf.add(tf.matmul(encoder_final_state_h, intent_W), intent_b)
intent = tf.argmax(intent_logits, axis=1)

In [None]:
# A batch of id 2, which is the '<SOS>' entry of word2index.
sos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='SOS') * 2
sos_step_embedded = tf.nn.embedding_lookup(embeddings, sos_time_slice)
# All-zero decoder input, as wide as one embedding concatenated with one
# encoder output (hidden_size * 2 + embedding_size).
pad_step_embedded = tf.zeros([batch_size, hidden_size * 2 + embedding_size],
                             dtype=tf.float32)

### 开始Hack

像上面Encoder使用的那样，标准的`tf.nn.dynamic_rnn`需要提前将所有的输入都提前包装到一个tensor里传过去。

当Decoder需要使用上一个时间节点的输出时，这就不可能提前包装好。即标准的动态rnn相当于：$s_i = f(s_{i-1}, x_i)$；但如果这个函数的参数需要扩充，比如我们做的：$s_i = f(s_{i-1}, y_{i-1}, h_i, c_i)$，标准的动态rnn就无法直接支持了。

于是我们需要Hack：使用`tf.contrib.seq2seq.CustomHelper`，传入三个函数：

- `initial_fn()`：第一个时间点的输入。
- `sample_fn()`：如何从logit到确定的某个固定的类别id。
- `next_inputs_fn()`：确定一般的时间点的输入。


In [None]:
def initial_fn():
    """Return the (finished flags, input) pair for the first decoding step.

    The flags compare 0 against ``decoder_lengths``; the input is the <SOS>
    embedding concatenated with the encoder output at time step 0.
    """
    return (
        (0 >= decoder_lengths),  # all False at the initial step
        tf.concat((sos_step_embedded, encoder_outputs[0]), 1),
    )

In [None]:
def sample_fn(time, outputs, state):
    """Pick, for every batch element, the index of the largest logit as an int32 id."""
    return tf.to_int32(tf.argmax(outputs, axis=1))

In [None]:
def next_inputs_fn(time, outputs, state, sample_ids):
    """Build the decoder input for the next time step.

    Returns ``(elements_finished, next_inputs, next_state)``; the state is
    passed through unchanged.
    """
    # Embed the class predicted at the previous step and feed it as part of
    # the next step's input.
    # NOTE(review): sample_ids are arg-max ids over the decoder output, yet
    # they are looked up in `embeddings`, the table also used for the input
    # words -- confirm that sharing this table is intended.
    pred_embedding = tf.nn.embedding_lookup(embeddings, sample_ids)
    # Original note: the input is h_i + o_{i-1} + c_i. Here the previous
    # prediction's embedding is concatenated with the encoder output at
    # `time`; the context c_i presumably comes from the attention wrapper
    # built in decode() -- confirm.
    next_input = tf.concat((pred_embedding, encoder_outputs[time]), 1)
    elements_finished = (time >= decoder_lengths)  # this operation produces boolean tensor of [batch_size]
    all_finished = tf.reduce_all(elements_finished)  # -> boolean scalar
    # Once every sequence in the batch is finished, feed the all-zero input.
    next_inputs = tf.cond(all_finished, lambda: pad_step_embedded, lambda: next_input)
    next_state = state
    return elements_finished, next_inputs, next_state

In [None]:
# Define our own helper from the three callbacks above
my_helper = tf.contrib.seq2seq.CustomHelper(initial_fn, sample_fn, next_inputs_fn)

In [None]:
def decode(helper, scope, reuse=None):
    """Build the attention decoder and run it with the given helper.

    Args:
        helper: seq2seq helper supplying the per-step inputs and samples.
        scope: variable scope name under which the decoder is created.
        reuse: forwarded to ``tf.variable_scope`` and to the output
            projection wrapper.

    Returns:
        The first element returned by ``dynamic_decode`` (time-major, since
        ``output_time_major=True``); the callers below read its
        ``rnn_output`` and ``sample_id`` fields.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # The attention memory is the encoder output with its first two
        # axes swapped (the encoder output itself is time-major).
        memory = tf.transpose(encoder_outputs, [1, 0, 2])
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=hidden_size, memory=memory,
            memory_sequence_length=encoder_inputs_actual_length)
        cell = tf.contrib.rnn.LSTMCell(num_units=hidden_size * 2)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell, attention_mechanism, attention_layer_size=hidden_size)
        # Project every step's output to slot_size logits.
        out_cell = tf.contrib.rnn.OutputProjectionWrapper(
            attn_cell, slot_size, reuse=reuse
        )
        # The decoder starts from a zero state.
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=out_cell, helper=helper,
            initial_state=out_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size))
        # Disabled alternative: initial_state=encoder_final_state)
        final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder, output_time_major=True,
            impute_finished=True, maximum_iterations=input_steps
        )
        return final_outputs

In [None]:
# Build the decoder under the 'decode' variable scope and inspect its outputs.
outputs = decode(my_helper, 'decode')
print("outputs: ", outputs)
print("outputs.rnn_output: ", outputs.rnn_output)
print("outputs.sample_id: ", outputs.sample_id)

注意这里的输出的第一维依然是T，但已经不是之前定义的最大的50，而是当前batch的长度最大值。

In [None]:
# Predicted ids for every decoded step.
decoder_prediction = outputs.sample_id
# Dynamic shape of the decoder output, unpacked into its three dimensions.
decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(outputs.rnn_output))
# Swap the two axes of the targets so they match the time-major decoder
# output, then keep only the steps the decoder actually produced.
decoder_targets_time_majored = tf.transpose(decoder_targets, [1, 0])
decoder_targets_true_length = decoder_targets_time_majored[:decoder_max_steps]
print("decoder_targets_true_length: ", decoder_targets_true_length)

In [None]:
# Define the mask so that padding does not count towards the loss
# (target id 0 is the '<PAD>' entry of the slot-tag table).
mask = tf.to_float(tf.not_equal(decoder_targets_true_length, 0))

In [None]:
# Define the slot-tagging loss.
# NOTE(review): logits and targets are passed time-major here, while
# sequence_loss is documented for batch-major input; with averaging over
# both axes the scalar result should be unaffected -- TODO confirm.
loss_slot = tf.contrib.seq2seq.sequence_loss(
    outputs.rnn_output, decoder_targets_true_length, weights=mask)

In [None]:
# Define the intent-classification loss: softmax cross-entropy against the
# one-hot encoded intent targets, averaged over the batch.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(intent_targets, depth=intent_size, dtype=tf.float32),
    logits=intent_logits)
loss_intent = tf.reduce_mean(cross_entropy)

## train

In [None]:
# Joint objective: slot-tagging loss plus intent-classification loss.
loss = loss_slot + loss_intent
optimizer = tf.train.AdamOptimizer(name="a_optimizer")
grads, vars = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(grads, 5)  # clip gradients
# Apply the *clipped* gradients. Passing the raw `grads` here would make the
# clipping above a no-op.
train_op = optimizer.apply_gradients(zip(gradients, vars))

In [None]:
def step(sess, mode, trarin_batch):
    """Run one batch through the graph.

    Args:
        sess: session used to execute the graph.
        mode: ``'train'`` (runs the optimizer) or ``'test'`` (inference only).
        trarin_batch: sequence of ``[word_ids, true_length, slot_ids,
            intent_id]`` rows (the parameter name is kept as-is for
            backward compatibility).

    Returns:
        The list returned by ``sess.run``: in train mode the results of
        ``[train_op, loss, decoder_prediction, intent]``, in test mode those
        of ``[decoder_prediction, intent]``.

    Exits the process with status 1 (after writing a message to stderr)
    when ``mode`` is not supported.
    """
    if mode not in ['train', 'test']:
        # Local import: `sys` is not imported at the top of this file.
        import sys
        print('mode is not supported', file=sys.stderr)
        sys.exit(1)

    unziped = list(zip(*trarin_batch))
    # Inputs shared by both modes; encoder_inputs is time-major, so the
    # batch-major word ids are transposed before feeding.
    feed_dict = {encoder_inputs: np.transpose(unziped[0], [1, 0]),
                 encoder_inputs_actual_length: unziped[1]}
    if mode == 'train':
        output_feeds = [train_op, loss, decoder_prediction,
                        intent]
        feed_dict[decoder_targets] = unziped[2]
        feed_dict[intent_targets] = unziped[3]
    else:
        output_feeds = [decoder_prediction, intent]

    return sess.run(output_feeds, feed_dict=feed_dict)

In [None]:
# Create the session and initialise every variable of the graph.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
def getBatch(batch_size, train_data):
    """Yield shuffled mini-batches of exactly ``batch_size`` examples.

    ``train_data`` is shuffled IN PLACE first, and every batch is a
    consecutive slice of the shuffled list, so callers can line the batches
    up with ``train_data`` afterwards. A trailing remainder smaller than
    ``batch_size`` is dropped.

    Args:
        batch_size: number of examples per batch; must be positive.
        train_data: mutable list of examples.

    Yields:
        Lists of ``batch_size`` consecutive examples.

    Raises:
        ValueError: if ``batch_size`` is not positive (it would never advance).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    random.shuffle(train_data)
    # The range stops at the last start index that still leaves room for a
    # full batch, so a final batch that is exactly full is kept.
    for start in range(0, len(train_data) - batch_size + 1, batch_size):
        yield train_data[start:start + batch_size]
def accuracy_score(true_data, pred_data, true_length=None):
    """Return the fraction of positions where prediction and truth agree.

    Note: this definition shadows ``sklearn.metrics.accuracy_score`` imported
    at the top of the file.

    Args:
        true_data: reference labels (array-like).
        pred_data: predicted labels, same shape as ``true_data``.
        true_length: optional per-row valid lengths; when given, only the
            first ``true_length[i]`` entries of row ``i`` are compared and
            the denominator is ``sum(true_length)``.

    Returns:
        Matching positions divided by the number of counted positions.
    """
    truth = np.array(true_data)
    guess = np.array(pred_data)
    assert truth.shape == guess.shape

    if true_length is None:
        # Every position counts.
        total = np.prod(truth.shape)
        assert total != 0
        hits = np.sum(truth == guess)
    else:
        # Only the valid prefix of each row counts.
        total = np.sum(true_length)
        assert total != 0
        hits = 0
        for row in range(truth.shape[0]):
            valid = true_length[row]
            hits += np.sum(truth[row, :valid] == guess[row, :valid])

    return hits / float(total)
def get_data_from_sequence_batch(true_batch, pred_batch, padding_token):
    """Flatten a batch of sequences and drop the padded positions.

    Example for the reference side with ``padding_token=0``:
    ``[[3,1,2,0,0,0],[5,2,1,4,0,0]] -> [3,1,2,5,2,1,4]``. The predictions
    are filtered with the same positions, i.e. wherever the REFERENCE holds
    the padding token.

    Args:
        true_batch: 2-D array-like of reference labels.
        pred_batch: array-like of predictions with the same shape.
        padding_token: label value that marks padding in ``true_batch``.

    Returns:
        ``(true, pred)``: two 1-D masked arrays holding only the non-padded
        entries, in row-major order.
    """
    true_ma = ma.masked_equal(true_batch, padding_token)
    # getmaskarray always yields a full boolean array. The `.mask` attribute
    # collapses to a scalar when nothing is masked, and boolean-indexing with
    # that scalar would add an axis instead of selecting all elements.
    pad_mask = ma.getmaskarray(true_ma)
    pred_ma = ma.masked_array(pred_batch, pad_mask)
    keep = ~pad_mask.flatten()
    return true_ma.flatten()[keep], pred_ma.flatten()[keep]

def f1_for_sequence_batch(true_batch, pred_batch, average="micro", padding_token=0):
    """Compute the F1 score over the non-padded positions of a sequence batch.

    Padding is removed with ``get_data_from_sequence_batch``; the score is
    restricted to the labels that occur in the reference data.
    """
    gold, predicted = get_data_from_sequence_batch(true_batch, pred_batch, padding_token)
    return f1_score(gold, predicted, labels=list(set(gold)), average=average)

# Train for epoch_num epochs and evaluate on the test split after each one.
for epoch in range(epoch_num):
    mean_loss = 0.0   # loss accumulated since the last progress print
    train_loss = 0.0  # loss accumulated over the whole epoch
    n_train_batches = 0
    for i, batch in enumerate(getBatch(batch_size, index_train)):
        # Run one training batch
        _, loss_v, decoder_prediction_v, intent_v = step(sess, "train", batch)
        n_train_batches += 1
        mean_loss += loss_v
        train_loss += loss_v
        if i % 30 == 0:
            if i > 0:
                mean_loss = mean_loss / 30.0
            print('Average train loss at epoch %d, step %d: %f' % (epoch, i, mean_loss))
            mean_loss = 0
    # Count the batches explicitly: the loop variable is undefined when the
    # generator yields nothing.
    if n_train_batches:
        train_loss /= n_train_batches
    print("[Epoch {}] Average train loss: {}".format(epoch, train_loss))

    # Evaluate once after every training epoch
    pred_slots = []
    for j, batch in enumerate(getBatch(batch_size, index_test)):
        decoder_prediction_v, intent_v = step(sess, "test", batch)
        # Predictions come back time-major; switch to [batch, time].
        decoder_prediction_v = np.transpose(decoder_prediction_v, [1, 0])
        # Unzip the batch once: rows are [word ids, length, slot ids, intent id].
        _, batch_lengths, batch_slots, batch_intents = zip(*batch)
        if j == 0:
            # Print one random example of the first batch for a sanity check.
            index = random.choice(range(len(batch)))
            print("Input Sentence        : ", index_seq2word(batch[index][0], index2word))
            print("Slot Truth            : ", index_seq2slot(batch[index][2], index2slot))
            print("Slot Prediction       : ", index_seq2slot(decoder_prediction_v[index], index2slot))
            print("Intent Truth          : ", index2intent[batch[index][3]])
            print("Intent Prediction     : ", index2intent[intent_v[index]])
        # The decoder only emits as many steps as the longest sequence of the
        # batch; pad the predictions with 0 back to input_steps columns.
        slot_pred_length = decoder_prediction_v.shape[1]
        pred_padded = np.pad(decoder_prediction_v, ((0, 0), (0, input_steps - slot_pred_length)),
                             mode="constant", constant_values=0)
        pred_slots.append(pred_padded)
        true_slot = np.array(batch_slots)[:, :slot_pred_length]
        true_length = np.array(batch_lengths)
        slot_acc = accuracy_score(true_slot, decoder_prediction_v, true_length)
        intent_acc = accuracy_score(batch_intents, intent_v)
        print("slot accuracy: {}, intent accuracy: {}".format(slot_acc, intent_acc))
    if pred_slots:
        pred_slots_a = np.vstack(pred_slots)
        # getBatch shuffled index_test in place and yielded consecutive
        # slices of it, so its leading rows line up with the stacked
        # predictions.
        true_slots_a = np.array(list(zip(*index_test))[2])[:pred_slots_a.shape[0]]
        print("F1 score for epoch {}: {}".format(epoch, f1_for_sequence_batch(true_slots_a, pred_slots_a)))
    else:
        print("F1 score for epoch {}: n/a (no complete test batch)".format(epoch))
    