![seq2seq_attention_intent_slot模型](img/img1.png)

In [None]:
'''
对话中的意图识别和槽填充联合模型：
这里实现了《Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling》上图中(c)模型
此模型利用seq2seq-attention实现：
1.意图识别是利用encoder中的最后一个time step中的双向隐层 + encoder的attention，最后接一个fc层进行分类
2.槽填充利用序列标注，采用基于attention的常用方法，最后也是一个fc层分类
3.总的loss = 意图识别loss + 槽填充loss
'''

In [1]:
import os
from torchtext import data, datasets
import pandas as pd
import pickle

In [2]:
# The ATIS csv files are expected in an `atis` folder that sits next to the
# notebook's working directory (i.e. inside the parent directory).
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
atis_data = os.path.join(base_dir, 'atis')

In [3]:
# Build the train / validation datasets, their vocabularies and the batch iterators.


def tokenize(s):
    """Split a raw line on whitespace."""
    return s.split()


# Options shared by the utterance (SOURCE) and slot-tag (TARGET) fields.
# include_lengths=True makes every batch attribute a (tensor, lengths) pair, which is
# what the encoder needs for torch's pack_padded_sequence.
_SEQUENCE_FIELD_OPTIONS = dict(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    use_vocab=True,
    init_token='<sos>',
    eos_token='<eos>',
    pad_token='<pad>',
    unk_token='<unk>',
    batch_first=True,
    fix_length=50,
    include_lengths=True,
)

SOURCE = data.Field(**_SEQUENCE_FIELD_OPTIONS)
TARGET = data.Field(**_SEQUENCE_FIELD_OPTIONS)
LABEL = data.Field(sequential=False, use_vocab=True)

# csv columns: index (ignored), intent, source sentence, slot-tag sequence.
# NOTE: the name `train` is rebound later in this notebook by the `def train(...)`
# training function, so re-running this cell after that one makes `train` a dataset
# again and breaks `train_model`.
train, val = data.TabularDataset.splits(
    path=atis_data,
    skip_header=True,
    train='atis.train.csv',
    validation='atis.test.csv',
    format='csv',
    fields=[('index', None), ('intent', LABEL), ('source', SOURCE), ('target', TARGET)],
)

# The vocabularies are built from both splits.
SOURCE.build_vocab(train, val)
TARGET.build_vocab(train, val)
LABEL.build_vocab(train, val)

# Training uses batches of 128; the validation split is served as one single batch.
# sort_within_batch + sort_key order each batch by source length (descending), which
# the encoder relies on when it packs the padded sequences.
train_iter, val_iter = data.Iterator.splits(
    (train, val),
    batch_sizes=(128, len(val)),
    shuffle=True,
    sort_within_batch=True,
    sort_key=lambda example: len(example.source),
)


In [4]:
# Persist the three vocabularies (source words, slot tags, intent labels) as pickle
# files in the current working directory.
source_words_path = os.path.join(os.getcwd(), 'source_words.pkl')
target_words_path = os.path.join(os.getcwd(), 'target_words.pkl')
label_words_path = os.path.join(os.getcwd(), 'label_words.pkl')

for vocab_path, vocab in (
    (source_words_path, SOURCE.vocab),
    (target_words_path, TARGET.vocab),
    (label_words_path, LABEL.vocab),
):
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)


In [5]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import math
from apex import amp
import time


In [14]:
# Build the model.
# Compute device shared by the model and the batches: CUDA when torch reports it as
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

'''
注意：encoder与decoder的层数必须相同（见Seq2Seq中的assert），本notebook实际训练时使用n_layers=1。
若使用多层，计算attention时应取上一步hidden的最后一层，即hidden[-1,:,:]。
'''
# 构建编码器
class Encoder(nn.Module):
    """Bidirectional GRU encoder over the source utterance.

    Args:
        input_dim: size of the source vocabulary.
        emb_dim: embedding dimension.
        hidden_dim: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability (embeddings and between GRU layers).
        pad_index: vocabulary index of the padding token (used as ``padding_idx``
            of the embedding).
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout, pad_index):
        super(Encoder, self).__init__()
        self.pad_index = pad_index
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_index)
        self.gru = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Projects the concatenated forward/backward final states down to hidden_dim.
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, src, src_len):
        """Encode a batch of padded source sequences.

        Args:
            src: token ids, ``[batch_size, seq_len]``.
            src_len: true length of every sequence, ``[batch_size]``; the batch must
                be sorted by decreasing length (``enforce_sorted=True``).

        Returns:
            output: ``[batch_size, seq_len, hidden_dim * 2]`` per-step states, with
                all-zero vectors at padded positions.
            hidden: ``[batch_size, hidden_dim]`` summary used to initialise the decoder.
        """
        # embedded = [batch_size, seq_len, emb_dim]
        embedded = self.dropout(self.embedding(src))

        # pack_padded_sequence expects the lengths on the CPU.
        lengths = torch.as_tensor(src_len).cpu()
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=True)
        output, hidden = self.gru(packed)
        # hidden = [n_layers * 2, batch_size, hidden_dim]

        # padding_value fills the *feature vectors* of padded time steps, so it must be
        # 0.0 rather than the pad token id: attention is unmasked, and non-zero padding
        # vectors would leak into every context vector.
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, padding_value=0.0, total_length=src.size(1))

        # hidden[-2] / hidden[-1] are the final forward / backward states of the top
        # layer; their projection becomes the decoder's initial hidden state.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return output, hidden

# 构建attention权重计算方式
class Attention(nn.Module):
    """Additive ("concat") attention over the encoder states."""

    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        # Input: decoder state (hidden_dim) concatenated with a bidirectional
        # encoder state (hidden_dim * 2).
        self.attn = nn.Linear((hidden_dim * 2) + hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def concat_score(self, hidden, encoder_output):
        """Return unnormalised scores of shape ``[batch_size, seq_len]``."""
        steps = encoder_output.shape[1]
        # Broadcast the decoder state along the time axis: [batch_size, seq_len, hidden_dim]
        query = hidden.unsqueeze(1).expand(-1, steps, -1)
        features = torch.cat((query, encoder_output), dim=2)
        # energy = [batch_size, seq_len, hidden_dim]
        energy = torch.tanh(self.attn(features))
        return self.v(energy).squeeze(2)

    def forward(self, hidden, encoder_output):
        """Compute attention weights.

        Args:
            hidden: decoder state, ``[batch_size, hidden_dim]``.
            encoder_output: ``[batch_size, seq_len, hidden_dim * 2]``.

        Returns:
            Softmax-normalised weights of shape ``[batch_size, 1, seq_len]``.
        """
        scores = self.concat_score(hidden, encoder_output)
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)

# 构建解码器
class Decoder(nn.Module):
    """Single-step attention GRU decoder that predicts one slot tag per call.

    Args:
        output_dim: size of the slot-tag vocabulary.
        emb_dim: embedding dimension of the previous-tag input.
        hidden_dim: GRU hidden size (must equal the encoder's ``hidden_dim``).
        n_layers: number of stacked GRU layers.
        dropout: dropout probability (embeddings and between GRU layers).
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU input = attention context (hidden_dim * 2) + tag embedding (emb_dim).
        self.gru = nn.GRU((hidden_dim * 2) + emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        # Slot-filling classifier over [decoder output ; attention context].
        self.slot_out = nn.Linear(hidden_dim * 2 + hidden_dim, output_dim)
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_output):
        """Run one decoding step.

        Args:
            input: previous tag ids, ``[batch_size]``.
            hidden: previous decoder state. Either ``[batch_size, hidden_dim]``
                (e.g. the encoder summary; it is replicated for every GRU layer) or
                ``[n_layers, batch_size, hidden_dim]`` (a state returned by a
                previous call of a multi-layer decoder).
            encoder_output: ``[batch_size, seq_len, hidden_dim * 2]``.

        Returns:
            prediction: unnormalised tag scores, ``[batch_size, output_dim]``.
            hidden: new state; ``[batch_size, hidden_dim]`` when ``n_layers == 1``,
                otherwise ``[n_layers, batch_size, hidden_dim]``.
            attention_weights: ``[batch_size, seq_len]``.
            context: attention context, ``[batch_size, hidden_dim * 2]``.
        """
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))  # [batch_size, 1, emb_dim]

        # Normalise the state to the [n_layers, batch_size, hidden_dim] layout the GRU
        # expects. Previously only n_layers == 1 worked, because a 2-D state was always
        # unsqueezed to a single layer.
        if hidden.dim() == 2:
            gru_hidden = hidden.unsqueeze(0).repeat(self.n_layers, 1, 1)
        else:
            gru_hidden = hidden.contiguous()

        # Attention is driven by the top layer of the previous state.
        attention_weights = self.attention(gru_hidden[-1], encoder_output)  # [batch_size, 1, seq_len]

        # Context vector = attention-weighted sum of the encoder states:
        # [batch_size, 1, seq_len] x [batch_size, seq_len, hidden_dim * 2]
        context = attention_weights.bmm(encoder_output)  # [batch_size, 1, hidden_dim * 2]

        gru_input = torch.cat([context, embedded], 2)  # [batch_size, 1, hidden_dim * 2 + emb_dim]
        decoder_output, gru_hidden = self.gru(gru_input, gru_hidden)
        # decoder_output = [batch_size, 1, hidden_dim]

        decoder_output_context = torch.cat([decoder_output, context], 2)
        prediction = self.slot_out(decoder_output_context.squeeze(1))

        # Keep the historical 2-D return shape for the single-layer case.
        next_hidden = gru_hidden.squeeze(0) if self.n_layers == 1 else gru_hidden
        return prediction, next_hidden, attention_weights.squeeze(1), context.squeeze(1)

# 利用Encoder与Decoder构建seq2seq模型
class Seq2Seq(nn.Module):
    """Joint intent-detection / slot-filling model built from an encoder and a decoder.

    The encoder summarises the source sentence; the decoder then emits one slot tag
    per step. At every step the decoder receives the previous tag and the previous
    hidden state (GRU: there is no cell state). With "teacher forcing" the previous
    tag is the ground-truth tag, otherwise it is the decoder's own argmax prediction.
    The intent is classified from the encoder summary concatenated with the attention
    context of the first decoding step.

    Args:
        predict_flag: ``True`` to run greedy single-sentence prediction in
            ``forward``, ``False`` for the training / evaluation path.
        encoder: module returning ``(encoder_output, hidden)`` and exposing
            ``hidden_dim`` / ``n_layers``.
        decoder: module performing one decoding step and exposing ``hidden_dim`` /
            ``n_layers`` / ``output_dim``.
        intent_size: number of intent classes.
        sos_index: index of the ``<sos>`` token in the target vocabulary, used as
            the first decoder input in prediction mode (default 2, the value that
            was previously hard-coded).
    """

    def __init__(self, predict_flag, encoder, decoder, intent_size, sos_index=2):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.predict_flag = predict_flag
        self.sos_index = sos_index
        # Intent classifier over [encoder summary ; first attention context].
        self.intent_out = nn.Linear((encoder.hidden_dim * 2) + encoder.hidden_dim, intent_size)
        assert encoder.hidden_dim == decoder.hidden_dim, 'encoder与decoder的隐藏状态维度必须相等！'
        assert encoder.n_layers == decoder.n_layers, 'encoder与decoder的层数必须相等！'

    def forward(self, src, src_lens, trg=None, teacher_forcing_ration=1.0):
        """Run prediction or training, depending on ``predict_flag``.

        Args:
            src: source token ids, ``[batch_size, seq_len]``.
            src_lens: true source lengths, ``[batch_size]``.
            trg: target tag ids, ``[batch_size, trg_len]``; required unless
                ``predict_flag`` is set.
            teacher_forcing_ration: probability of feeding the ground-truth tag as
                the next decoder input (e.g. 0.8 -> about 80% of the steps).

        Returns:
            Prediction mode: ``(output_tokens, intent)`` - a list of predicted tag
            ids and a 0-dim tensor holding the predicted intent index.
            Training mode: ``(slot_outputs, intent_outputs)`` with shapes
            ``[batch_size, trg_len, trg_vocab_size]`` and ``[batch_size, intent_size]``;
            ``slot_outputs[:, 0]`` is left at zero (the ``<sos>`` position).

        Raises:
            ValueError: if ``trg`` is missing in training mode, or if the sequence is
                too short to run a single decoding step.
        """
        if self.predict_flag:
            return self._predict(src, src_lens)
        if trg is None:
            raise ValueError('trg is required when predict_flag is False')
        return self._teacher_forced_decode(src, src_lens, trg, teacher_forcing_ration)

    def _predict(self, src, src_lens):
        """Greedy decoding of a single sentence."""
        assert len(src) == 1, '预测时一次输入一句话'
        src_len = src.size(1)
        if src_len < 2:
            raise ValueError('src must contain at least two positions to decode')

        encoder_output, encoder_hidden = self.encoder(src, src_lens)
        hidden = encoder_hidden
        # First decoder input is <sos>; create it on the same device as the input so
        # prediction also works when the model lives on the GPU.
        input = torch.tensor([self.sos_index], dtype=torch.long, device=src.device)

        output_tokens = []
        first_context = None
        for _step in range(1, src_len):
            output, hidden, _, step_context = self.decoder(input, hidden, encoder_output)
            if first_context is None:
                # context = [batch_size, hidden_dim * 2]; only the first step feeds the intent head
                first_context = step_context
            input = output.argmax(1)
            output_tokens.append(input.squeeze().detach().item())

        concated = torch.cat((encoder_hidden, first_context), 1)
        intent_outputs = self.intent_out(concated).squeeze().argmax()
        return output_tokens, intent_outputs

    def _teacher_forced_decode(self, src, src_lens, trg, teacher_forcing_ration):
        """Decode ``trg_len - 1`` steps, optionally feeding ground-truth tags."""
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        if trg_len < 2:
            raise ValueError('trg must contain at least two positions to decode')
        trg_vocab_size = self.decoder.output_dim

        # Buffer for the per-step tag scores, allocated next to the targets instead of
        # relying on a module-level `device` global.
        slot_outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=trg.device)

        # encoder_output = [batch_size, seq_len, hidden_dim * 2]
        # encoder_hidden = [batch_size, hidden_dim], the decoder's initial state
        encoder_output, encoder_hidden = self.encoder(src, src_lens)
        hidden = encoder_hidden

        # The first decoder input is the <sos> column of the targets.
        input = trg[:, 0]  # [batch_size]

        first_context = None
        for t in range(1, trg_len):
            output, hidden, _, step_context = self.decoder(input, hidden, encoder_output)
            if first_context is None:
                first_context = step_context
            slot_outputs[:, t, :] = output

            # Decide per step whether the next input is the gold tag or the prediction.
            teacher_force = random.random() < teacher_forcing_ration
            predict_max = output.argmax(1)
            input = trg[:, t] if teacher_force else predict_max

        # concated = [batch_size, hidden_dim * 2 + hidden_dim]
        concated = torch.cat((encoder_hidden, first_context), 1)
        intent_outputs = self.intent_out(concated)
        return slot_outputs, intent_outputs


# 构建模型，优化函数，损失函数，学习率衰减函数
def build_model(source, target, label, encoder_embedding_dim, decoder_embedding_dim, hidden_dim, n_layers, encoder_dropout,
                decoder_dropout, lr, gamma, weight_decay):
    """Create the seq2seq model together with its optimizer, scheduler and losses.

    The encoder and decoder may use different embedding sizes and dropout rates, but
    they share ``hidden_dim`` and ``n_layers``.

    Args:
        source: field of the input sentences; its vocabulary size is the encoder
            input dimension.
        target: field of the slot tags; its vocabulary size is the decoder output
            dimension.
        label: field of the intents; its vocabulary size is the number of intent classes.
        encoder_embedding_dim: encoder embedding size.
        decoder_embedding_dim: decoder embedding size.
        hidden_dim: GRU hidden size shared by encoder and decoder.
        n_layers: number of GRU layers shared by encoder and decoder.
        encoder_dropout: encoder dropout probability.
        decoder_dropout: decoder dropout probability.
        lr: learning rate of the SGD optimizer.
        gamma: multiplicative decay factor of the ExponentialLR scheduler.
        weight_decay: accepted for interface compatibility; not used by the SGD
            optimizer created here.

    Returns:
        ``(model, optimizer, scheduler, loss_slot, loss_intent)``.
    """
    input_dim = len(source.vocab)   # number of source words
    output_dim = len(target.vocab)  # number of slot tags
    label_dim = len(label.vocab)    # number of intent classes

    encoder = Encoder(input_dim, encoder_embedding_dim, hidden_dim, n_layers, encoder_dropout, source.vocab.stoi[source.pad_token])
    decoder = Decoder(output_dim, decoder_embedding_dim, hidden_dim, n_layers, decoder_dropout)

    model = Seq2Seq(False, encoder, decoder, label_dim).to(device)

    # init_weights already walks every parameter of the module tree, so one call on
    # the root is enough; model.apply(init_weights) re-initialised each parameter once
    # per ancestor module.
    init_weights(model)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    # Learning-rate decay schedule.
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=gamma)

    # Padding positions are excluded from the slot loss. Look the index up with the
    # *target* field's own pad token (it was previously taken from the source field).
    target_pad_index = target.vocab.stoi[target.pad_token]
    loss_slot = nn.CrossEntropyLoss(ignore_index=target_pad_index)
    loss_intent = nn.CrossEntropyLoss()

    return model, optimizer, scheduler, loss_slot, loss_intent


# 训练
def train(model, iterator, optimizer, loss_slot, loss_intent, clip):
    """Train the model for one epoch and return the mean batch loss.

    For every batch: zero the gradients, run the model with full teacher forcing,
    compute slot loss + intent loss (the first ``<sos>`` column is dropped from both
    the slot scores and the targets), back-propagate through Apex AMP loss scaling,
    clip the gradient norm and update the parameters.

    Args:
        model: AMP-initialised seq2seq model.
        iterator: batch iterator whose batches expose ``source`` and ``target`` as
            ``(tensor, lengths)`` pairs and ``intent`` as a tensor.
        optimizer: AMP-initialised optimizer.
        loss_slot: criterion for the slot tags.
        loss_intent: criterion for the intent label.
        clip: maximum gradient norm.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0.0

    for batch in iterator:
        # include_lengths=True on the fields makes these (tensor, lengths) pairs.
        src, src_lens = batch.source  # src = [batch_size, seq_len]
        trg, _ = batch.target         # trg = [batch_size, seq_len]
        label = batch.intent          # [batch_size]
        src = src.to(device)
        trg = trg.to(device)
        label = label.to(device)

        # Start every step from clean gradients, even on the very first batch.
        optimizer.zero_grad()

        # slot_outputs = [batch_size, trg_len, trg_vocab_size], intent_outputs = [batch_size, intent_size]
        slot_outputs, intent_outputs = model(src, src_lens, trg, teacher_forcing_ration=1.0)

        # Drop position 0 (<sos>) and flatten for CrossEntropyLoss.
        output_dim = slot_outputs.shape[-1]
        slot_outputs = slot_outputs[:, 1:, :].reshape(-1, output_dim)  # [batch_size * (seq_len - 1), output_dim]
        trg = trg[:, 1:].reshape(-1)                                   # [batch_size * (seq_len - 1)]
        loss = loss_slot(slot_outputs, trg) + loss_intent(intent_outputs, label)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # Clip the parameters the optimizer actually updates; per the Apex docs these
        # can differ from model.parameters() at some opt levels.
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), clip)
        optimizer.step()

        epoch_loss += float(loss.item())
    return epoch_loss / len(iterator)

'''
评估
'''
def evaluate(model, iterator, loss_slot, loss_intent):
    """Return the mean (slot + intent) loss over ``iterator`` without updating the model.

    The model is switched to eval mode and decoding runs without teacher forcing,
    i.e. every step is fed the decoder's own previous prediction.
    """
    model.eval()  # disables dropout
    running_total = 0
    with torch.no_grad():  # no gradients are tracked during evaluation
        for batch in iterator:
            tokens, token_lengths = batch.source  # tokens = [batch_size, seq_len]
            gold_slots, _unused_lengths = batch.target  # gold_slots = [batch_size, seq_len]
            gold_intent = batch.intent

            tokens = tokens.to(device)
            gold_slots = gold_slots.to(device)
            gold_intent = gold_intent.to(device)

            # slot_scores = [batch_size, seq_len, output_dim]
            slot_scores, intent_scores = model(tokens, token_lengths, gold_slots, teacher_forcing_ration=0)

            # Skip position 0 (<sos>) and flatten both sides for the criterion.
            n_tags = slot_scores.shape[-1]
            flat_scores = slot_scores[:, 1:, :].reshape(-1, n_tags)  # [batch_size * (seq_len - 1), output_dim]
            flat_gold = gold_slots[:, 1:].reshape(-1)                # [batch_size * (seq_len - 1)]

            slot_loss = loss_slot(flat_scores, flat_gold)
            intent_loss = loss_intent(intent_scores, gold_intent)
            batch_loss = slot_loss + intent_loss
            running_total += float(batch_loss.item())
    return running_total / len(iterator)


def train_model(model, train_iterator, val_iterator, optimizer, scheduler, loss_slot, loss_intent, n_epochs, clip, model_path, writer):
    """Run the full training loop.

    After every epoch the training loss is logged to ``writer``, the validation loss
    is computed, and the model parameters are saved to ``model_path`` whenever the
    validation loss improves. Loss and perplexity of both splits are printed per
    epoch. The writer is flushed and closed when training finishes.

    Args:
        model: model to train.
        train_iterator: iterator over training batches.
        val_iterator: iterator over validation batches.
        optimizer: optimizer passed on to ``train``.
        scheduler: learning-rate scheduler; accepted but not stepped by this loop.
        loss_slot: slot-tag criterion.
        loss_intent: intent criterion.
        n_epochs: number of epochs to run.
        clip: maximum gradient norm passed on to ``train``.
        model_path: file the best ``state_dict`` is saved to.
        writer: object with ``add_scalar`` / ``flush`` / ``close`` (e.g. a
            TensorBoard ``SummaryWriter``).
    """

    def _perplexity(loss):
        # math.exp raises OverflowError for large finite losses (a diverged run);
        # report infinity instead of aborting after the epoch has already completed.
        try:
            return math.exp(loss)
        except OverflowError:
            return float('inf')

    best_valid_loss = float('inf')
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train(model, train_iterator, optimizer, loss_slot, loss_intent, clip)
        writer.add_scalar('loss', train_loss, global_step=epoch + 1)

        valid_loss = evaluate(model, val_iterator, loss_slot, loss_intent)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep only the checkpoint with the lowest validation loss seen so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)

        print('epoch:{},time-mins:{},time-secs:{}'.format(epoch + 1, epoch_mins, epoch_secs))
        print('train loss:{},train perplexity:{}'.format(train_loss, _perplexity(train_loss)))
        print('val loss:{}, val perplexity:{}'.format(valid_loss, _perplexity(valid_loss)))
    writer.flush()
    writer.close()

    #每个epoch所花时间
def epoch_time(start_time, end_time):
    """Split the elapsed time (in seconds) into whole minutes and leftover whole seconds."""
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

#对所有模块和子模块进行权重初始化
def init_weights(model):
    """Re-initialise every parameter of ``model`` (sub-modules included) from U(-0.08, 0.08)."""
    for weight in model.parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)

In [16]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard event files go to ./log under the current working directory.
writer = SummaryWriter(os.getcwd()+'/log', comment='intent_slot')

# ---- hyperparameters ----
encoder_embedding_dim = 128
decoder_embedding_dim = 128
hidden_dim = 256
n_layers = 1
encoder_dropout = 0.1
decoder_dropout = 0.1
lr = 0.01
gamma = 0.1          # ExponentialLR decay factor; note train_model never steps the scheduler
weight_decay = 0.1   # passed to build_model, whose SGD optimizer does not use it
n_epochs = 10
clip = 1.0           # max gradient norm
model_path = os.path.join(os.getcwd(), "model.h5")  # holds a torch state_dict

# ---- model, optimizer, scheduler and the two losses ----
model, optimizer, scheduler, loss_slot, loss_intent = build_model(
    source=SOURCE,
    target=TARGET,
    label=LABEL,
    encoder_embedding_dim=encoder_embedding_dim,
    decoder_embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    encoder_dropout=encoder_dropout,
    decoder_dropout=decoder_dropout,
    lr=lr,
    gamma=gamma,
    weight_decay=weight_decay,
)

# Enable Apex mixed precision (opt level O1) before training starts.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

# ---- train ----
train_model(
    model=model,
    train_iterator=train_iter,
    val_iterator=val_iter,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_slot=loss_slot,
    loss_intent=loss_intent,
    n_epochs=n_epochs,
    clip=clip,
    model_path=model_path,
    writer=writer,
)



Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
epoch:1,time-mins:0,time-secs:40
train loss:5.815643066014999,train perplexity:335.5070823466472
val loss:4.272768020629883, val perplexity:71.71988323225747
epoch:2,time-mins:0,time-secs:40
train loss:3.4767637925270276,train perplexity:32.35484547346487
val loss:3.627048969268799, val perplexity:37.60168898755