此模型原型来自序列模型[Convolutional Sequence to Sequence Learning](https://arxiv.org/pdf/1705.03122.pdf)中的encoder部分。

原模型是用于机器翻译，这里我将稍加修改用来做问答中的slot filling和intent detection联合建模。

我在原模型的基础上做了改进，改进后的模型结构如下图所示：

![model](img/model5.png)

本项目在此基础上进行了改进，改进点如下：

1. 使用多个不同 size 的卷积核分别进行卷积，以获取更多的特征，最后将这些不同 size 的卷积输出进行拼接。
2. 在 embedding 层之后增加了一个多头自注意力（multi-head self-attention）层。
3. 最后将卷积后的特征与 self-attention 后的特征进行拼接。

In [1]:
import os
from torchtext import data, datasets
import pandas as pd
import pickle

In [2]:
# Project root is the parent of the current working directory; the ATIS CSV
# files are read from the 'atis' folder directly beneath it.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
atis_data = os.path.join(base_dir, 'atis')

In [3]:
# Build the train and validation datasets, their vocabularies and the batch
# iterators.


def tokenize(s):
    """Split a sentence into tokens on whitespace."""
    return s.split()


# SOURCE and TARGET are configured identically, so the options are written
# once and shared by both fields.
sequence_field_options = dict(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    use_vocab=True,
    init_token='<sos>',
    eos_token='<eos>',
    pad_token='<pad>',
    unk_token='<unk>',
    batch_first=True,
    fix_length=50,
    # Each batch attribute becomes a (tensor, lengths) pair, which keeps the
    # lengths available for torch's pack_padded_sequence later on.
    include_lengths=True,
)

SOURCE = data.Field(**sequence_field_options)
TARGET = data.Field(**sequence_field_options)
LABEL = data.Field(sequential=False, use_vocab=True)

# CSV column order: a row index (ignored), the intent label, the source
# sentence and the target sequence.
csv_fields = [('index', None), ('intent', LABEL), ('source', SOURCE), ('target', TARGET)]

train, val = data.TabularDataset.splits(
    path=atis_data,
    format='csv',
    skip_header=True,
    train='atis.train.csv',
    validation='atis.test.csv',
    fields=csv_fields)

# Every vocabulary is built from both splits.
for field in (SOURCE, TARGET, LABEL):
    field.build_vocab(train, val)

train_iter, val_iter = data.Iterator.splits(
    (train, val),
    # 64 examples per training batch; the whole validation set is one batch.
    batch_sizes=(64, len(val)),
    shuffle=True,
    # Sort the examples inside each batch by sort_key (source length), which
    # is the ordering pack/pad operations expect.
    sort_within_batch=True,
    sort_key=lambda example: len(example.source))

In [4]:
# Persist the three vocabularies next to the notebook so they can be reloaded
# without rebuilding the datasets.
source_words_path = os.path.join(os.getcwd(), 'source_words.pkl')
target_words_path = os.path.join(os.getcwd(), 'target_words.pkl')
label_words_path = os.path.join(os.getcwd(), 'label_words.pkl')

vocab_dumps = (
    (source_words_path, SOURCE.vocab),  # source words
    (target_words_path, TARGET.vocab),  # target words
    (label_words_path, LABEL.vocab),    # label words
)
for vocab_path, vocab in vocab_dumps:
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)

In [5]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import math
from apex import amp
import time

In [6]:
# Seed every random number generator with the same value and ask cuDNN for
# deterministic behaviour so that repeated runs are comparable.
SEED = 1234

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [7]:
class CNNAttention(nn.Module):
    """Joint intent-detection / slot-filling encoder fusing CNN and self-attention features.

    Token and position embeddings are summed and fed to two parallel paths:

    * one stack of gated (GLU) residual 1-D convolutions per kernel size; the
      outputs of all stacks are concatenated along the channel axis and
      projected back to ``hid_dim``;
    * one multi-head self-attention layer over the embeddings.

    Both feature sets are concatenated, projected to ``hid_dim`` and added back
    to the embeddings. The result feeds an intent head (max-pooled over the
    sequence) and a per-token slot head.

    Args:
        input_dim: size of the source vocabulary.
        intent_out: number of intent classes.
        slot_out: number of slot classes.
        hid_dim: embedding / hidden size.
        n_layers: number of convolution layers in each stack.
        kernel_size: sequence of odd kernel sizes, one convolution stack each.
        dropout: dropout probability.
        src_pad_idx: index of the padding token in the source vocabulary.
        n_heads: number of attention heads.
        max_length: number of positions in the position-embedding table;
            longer inputs cannot be embedded.
    """

    def __init__(self, input_dim, intent_out, slot_out, hid_dim, n_layers, kernel_size, dropout, src_pad_idx, n_heads, max_length=50):
        super(CNNAttention, self).__init__()
        assert len(kernel_size) > 0, 'kernel_size must not be empty!'
        for kernel in kernel_size:
            # Odd kernels allow symmetric padding that preserves the sequence length.
            assert kernel % 2 == 1, 'kernel size must be odd!'

        self.src_pad_idx = src_pad_idx

        # sqrt(0.5) rescales the sum of two branches so the variance does not
        # grow with depth. Kept as a plain float so it works on any device.
        self.scale = 0.5 ** 0.5

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)  # token embedding
        self.pos_embedding = nn.Embedding(max_length, hid_dim)  # position embedding

        # Projects the concatenated conv + attention features (2 * hid_dim) to hid_dim.
        self.hid2hid = nn.Linear(hid_dim * 2, hid_dim)

        # One convolution stack per kernel size, registered as conv_1, conv_2, ...
        # so the parameter names stay the same as in earlier checkpoints.
        self.n_kernels = len(kernel_size)
        for idx, k in enumerate(kernel_size, start=1):
            stack = nn.ModuleList([nn.Conv1d(in_channels=hid_dim,
                                             out_channels=2 * hid_dim,  # doubled because GLU halves the channels
                                             kernel_size=k,
                                             padding=(k - 1) // 2)  # keeps the sequence length unchanged
                                   for _ in range(n_layers)])
            setattr(self, f'conv_{idx}', stack)

        # Projects the concatenated outputs of all stacks back to hid_dim.
        self.convhid2hid = nn.Linear(len(kernel_size) * hid_dim, hid_dim)

        # Multi-head self-attention over the embeddings.
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)

        self.dropout = nn.Dropout(dropout)

        # Intent detection head.
        self.intent_output = nn.Linear(hid_dim, intent_out)

        # Slot filling head.
        self.slot_out = nn.Linear(hid_dim, slot_out)

    def make_src_mask(self, src):
        """Return a boolean mask that is True on every non-padding position.

        Args:
            src: token indices, [batch_size, src_len].

        Returns:
            Mask of shape [batch_size, 1, 1, src_len].
        """
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        return src_mask

    def _run_conv_stack(self, conv_input, convs):
        """Apply one stack of gated residual convolutions.

        Args:
            conv_input: [batch_size, hid_dim, src_len].
            convs: iterable of Conv1d layers mapping hid_dim to 2 * hid_dim.

        Returns:
            Tensor of shape [batch_size, hid_dim, src_len]; the input itself
            when the stack is empty.
        """
        conved_input = conv_input
        for conv in convs:
            conved = conv(self.dropout(conved_input))  # [batch_size, 2*hid_dim, src_len]
            conved = F.glu(conved, dim=1)  # [batch_size, hid_dim, src_len]
            # Residual connection; its result feeds the next layer.
            conved_input = (conved + conved_input) * self.scale
        return conved_input

    def forward(self, src):
        """Encode a batch and return intent and slot logits.

        Args:
            src: token indices, [batch_size, src_len].

        Returns:
            Tuple ``(intent_output, slot_output)`` of shapes
            [batch_size, intent_out] and [batch_size, src_len, slot_out].
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]

        src_mask = self.make_src_mask(src)  # [batch_size, 1, 1, src_len]

        # Position indices, created on the same device as the input.
        pos = torch.arange(src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)  # [batch_size, src_len]

        tok_embedded = self.tok_embedding(src)  # [batch_size, src_len, hid_dim]
        pos_embedded = self.pos_embedding(pos)  # [batch_size, src_len, hid_dim]

        # Element-wise sum of token and position embeddings.
        embedded = self.dropout(tok_embedded + pos_embedded)  # [batch_size, src_len, hid_dim]

        # Conv1d convolves over the last axis, so move the channels to axis 1.
        conv_input = embedded.permute(0, 2, 1)  # [batch_size, hid_dim, src_len]

        # Every stack starts from the same embeddings; outputs are concatenated
        # along the channel axis.
        conv_features = [self._run_conv_stack(conv_input, getattr(self, f'conv_{idx}'))
                         for idx in range(1, self.n_kernels + 1)]
        combine_conv_module = torch.cat(conv_features, dim=1)  # [batch_size, n_kernels*hid_dim, src_len]

        conved = self.convhid2hid(combine_conv_module.permute(0, 2, 1))  # [batch_size, src_len, hid_dim]

        # Self-attention runs on the embeddings (not on the convolution output).
        self_attention, _ = self.self_attention(embedded, embedded, embedded, src_mask)  # [batch_size, src_len, hid_dim]

        # Fuse convolution and attention features.
        combined_conv_attention = torch.cat([conved, self_attention], dim=2)  # [batch_size, src_len, 2*hid_dim]
        conved = self.hid2hid(combined_conv_attention)  # [batch_size, src_len, hid_dim]

        # Residual connection with the embeddings gives the joint encoder output.
        combined = (conved + embedded) * self.scale  # [batch_size, src_len, hid_dim]

        # Intent head: max-pool over the whole sequence. Only the pooled axis is
        # squeezed so that a batch of size 1 keeps its batch dimension.
        # NOTE: padding positions take part in the pooling (it is not masked).
        pooled = F.max_pool1d(combined.permute(0, 2, 1), combined.shape[1]).squeeze(-1)  # [batch_size, hid_dim]
        intent_output = self.intent_output(self.dropout(pooled))  # [batch_size, intent_out]

        # Slot head: one prediction per token.
        slot_output = self.slot_out(self.dropout(combined))  # [batch_size, src_len, slot_out]

        return intent_output, slot_output
 
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: model dimension; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, hid_dim, n_heads, dropout):
        super(MultiHeadAttentionLayer, self).__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # Scaling factor for the attention scores, kept as a plain float so the
        # layer does not depend on any particular device.
        # NOTE(review): this divides by sqrt(hid_dim); the Transformer paper
        # scales by sqrt(head_dim). Left unchanged so that existing checkpoints
        # keep producing the same outputs - confirm which one is intended.
        self.scale = self.hid_dim ** 0.5

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key`` / ``value``.

        Args:
            query: [batch_size, query_len, hid_dim]
            key: [batch_size, key_len, hid_dim]
            value: [batch_size, value_len, hid_dim]
            mask: optional tensor broadcastable to
                [batch_size, n_heads, query_len, key_len]; positions where it
                equals 0 receive (effectively) zero attention weight.

        Returns:
            Tuple ``(x, attention)`` with ``x`` of shape
            [batch_size, query_len, hid_dim] and ``attention`` of shape
            [batch_size, n_heads, query_len, key_len].
        """
        batch_size = query.shape[0]

        Q = self.fc_q(query)  # [batch_size, query_len, hid_dim]
        K = self.fc_k(key)  # [batch_size, key_len, hid_dim]
        V = self.fc_v(value)  # [batch_size, value_len, hid_dim]

        # Split the hidden dimension into heads: [batch_size, n_heads, len, head_dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # [batch_size, n_heads, query_len, head_dim] x [batch_size, n_heads, head_dim, key_len]
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale  # [batch_size, n_heads, query_len, key_len]

        if mask is not None:
            # A very negative score becomes a zero weight after the softmax.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)  # [batch_size, n_heads, query_len, key_len]

        # [batch_size, n_heads, query_len, key_len] x [batch_size, n_heads, value_len, head_dim]
        x = torch.matmul(self.dropout(attention), V)  # [batch_size, n_heads, query_len, head_dim]

        # Merge the heads back into a single hidden dimension.
        x = x.permute(0, 2, 1, 3).contiguous()  # [batch_size, query_len, n_heads, head_dim]
        x = x.view(batch_size, -1, self.hid_dim)  # [batch_size, query_len, hid_dim]

        x = self.fc_o(x)  # [batch_size, query_len, hid_dim]

        return x, attention


In [16]:
# Hyper-parameters, model, optimizer and loss functions for the joint
# intent-detection / slot-filling model.
input_dim = len(SOURCE.vocab)
slot_out = len(TARGET.vocab)  # number of slot classes
intent_out = len(LABEL.vocab)  # number of intent classes

hid_dim = 64
conv_layers = 8
kernel_size = (1, 3, 5)  # convolution kernel sizes
dropout = 0.5
n_heads = 8

src_pad_idx = SOURCE.vocab.stoi[SOURCE.pad_token]
# Slot targets are indexed by the TARGET vocabulary, so the padding index the
# slot loss ignores must be looked up there rather than in SOURCE.
trg_pad_idx = TARGET.vocab.stoi[TARGET.pad_token]

model = CNNAttention(input_dim, intent_out, slot_out, hid_dim, conv_layers, kernel_size, dropout, src_pad_idx, n_heads)

model = model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters())

# Slot loss: padded target positions do not contribute.
loss_slot = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

# Intent loss
loss_intent = nn.CrossEntropyLoss()

In [17]:
def train(model, iterator, optimizer, loss_slot, loss_intent, clip):
    """Run one training epoch and return the mean joint loss per batch.

    Args:
        model: module mapping ``src`` to ``(intent_output, slot_output)``.
        iterator: sized iterable of batches exposing ``source`` and ``target``
            as ``(tensor, lengths)`` pairs and ``intent`` as a tensor.
        optimizer: optimizer over the model parameters.
        loss_slot: criterion for the per-token slot predictions.
        loss_intent: criterion for the intent predictions.
        clip: maximum gradient norm used for gradient clipping.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    # Move batches to wherever the model lives instead of relying on a
    # module-level device variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else None

    for batch in iterator:
        # Fields built with include_lengths=True yield (tensor, lengths) pairs.
        src, _ = batch.source  # [batch_size, seq_len]
        trg, _ = batch.target  # [batch_size, seq_len]
        label = batch.intent  # [batch_size]
        if run_device is not None:
            src = src.to(run_device)
            trg = trg.to(run_device)
            label = label.to(run_device)

        optimizer.zero_grad()

        # [batch_size, intent_dim]; [batch_size, seq_len, slot_output_dim]
        intent_output, slot_output = model(src)

        # 1. Slot loss. Position 0 is skipped on both sides (presumably the
        #    <sos> token added by the fields), then everything is flattened.
        slot_output_dim = slot_output.shape[-1]
        slot_output = slot_output[:, 1:, :].reshape(-1, slot_output_dim)  # [batch_size * (seq_len-1), slot_output_dim]
        trg = trg[:, 1:].reshape(-1)  # [batch_size * (seq_len-1)]
        loss1 = loss_slot(slot_output, trg)

        # 2. Intent loss.
        loss2 = loss_intent(intent_output, label)

        # 3. Joint objective: slot loss + intent loss.
        loss = loss1 + loss2

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)
        

In [18]:
def evaluate(model, iterator, loss_slot, loss_intent):
    """Compute the mean joint loss per batch without updating the model.

    Args:
        model: module mapping ``src`` to ``(intent_output, slot_output)``.
        iterator: sized iterable of batches exposing ``source`` and ``target``
            as ``(tensor, lengths)`` pairs and ``intent`` as a tensor.
        loss_slot: criterion for the per-token slot predictions.
        loss_intent: criterion for the intent predictions.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0

    # Move batches to wherever the model lives instead of relying on a
    # module-level device variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else None

    with torch.no_grad():
        for batch in iterator:
            src, _ = batch.source  # [batch_size, seq_len]
            trg, _ = batch.target  # [batch_size, seq_len]
            label = batch.intent
            if run_device is not None:
                src = src.to(run_device)
                trg = trg.to(run_device)
                label = label.to(run_device)

            # [batch_size, intent_dim]; [batch_size, seq_len, slot_output_dim]
            intent_output, slot_output = model(src)

            # 1. Slot loss. Position 0 is skipped on both sides (presumably the
            #    <sos> token added by the fields), then everything is flattened.
            slot_output_dim = slot_output.shape[-1]
            slot_output = slot_output[:, 1:, :].reshape(-1, slot_output_dim)  # [batch_size * (seq_len-1), slot_output_dim]
            trg = trg[:, 1:].reshape(-1)  # [batch_size * (seq_len-1)]
            loss1 = loss_slot(slot_output, trg)

            # 2. Intent loss.
            loss2 = loss_intent(intent_output, label)

            loss = loss1 + loss2

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [19]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into minutes and seconds.

    Args:
        start_time: starting timestamp, in seconds.
        end_time: ending timestamp, in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, each truncated toward zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    # Whatever is left once the whole minutes are removed.
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Training driver: run n_epochs passes over the data and keep the weights of
# the epoch with the lowest validation loss.
n_epochs = 100  # number of epochs
clip = 0.1  # maximum gradient norm passed to train()

best_valid_loss = float('inf')
model_path = os.path.join(os.getcwd(), "model.h5")

for epoch in range(n_epochs):

    start_time = time.time()
    train_loss = train(model, train_iter, optimizer, loss_slot, loss_intent, clip)
    valid_loss = evaluate(model, val_iter, loss_slot, loss_intent)
    end_time = time.time()

    # Wall-clock time spent on this epoch (training + validation).
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves.
    is_best = valid_loss < best_valid_loss
    if is_best:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 3s
	Train Loss: 4.468 | Train PPL:  87.153
	 Val. Loss: 2.789 |  Val. PPL:  16.264
Epoch: 02 | Time: 0m 4s
	Train Loss: 2.428 | Train PPL:  11.334
	 Val. Loss: 2.031 |  Val. PPL:   7.619
Epoch: 03 | Time: 0m 3s
	Train Loss: 1.953 | Train PPL:   7.053
	 Val. Loss: 1.748 |  Val. PPL:   5.743
Epoch: 04 | Time: 0m 3s
	Train Loss: 1.726 | Train PPL:   5.617
	 Val. Loss: 1.565 |  Val. PPL:   4.781
Epoch: 05 | Time: 0m 3s
	Train Loss: 1.558 | Train PPL:   4.750
	 Val. Loss: 1.461 |  Val. PPL:   4.311
Epoch: 06 | Time: 0m 3s
	Train Loss: 1.443 | Train PPL:   4.235
	 Val. Loss: 1.324 |  Val. PPL:   3.760
Epoch: 07 | Time: 0m 4s
	Train Loss: 1.339 | Train PPL:   3.814
	 Val. Loss: 1.261 |  Val. PPL:   3.527
Epoch: 08 | Time: 0m 3s
	Train Loss: 1.247 | Train PPL:   3.481
	 Val. Loss: 1.243 |  Val. PPL:   3.465
Epoch: 09 | Time: 0m 3s
	Train Loss: 1.226 | Train PPL:   3.407
	 Val. Loss: 1.105 |  Val. PPL:   3.020
Epoch: 10 | Time: 0m 3s
	Train Loss: 1.136 | Train PPL:   3.113
