In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import numpy as np
import os
import json
import random
    
# Resolve the three vocabulary pickles relative to the current working directory.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
working_dir = os.getcwd()
source_words_path = os.path.join(working_dir, 'source_words.pkl')
target_words_path = os.path.join(working_dir, 'target_words.pkl')
label_words_path = os.path.join(working_dir, 'label_words.pkl')

# source-side vocabulary (input words)
with open(source_words_path, mode='rb') as source_file:
    source_words = pickle.load(source_file)

# target-side vocabulary (slot tags)
with open(target_words_path, mode='rb') as target_file:
    target_words = pickle.load(target_file)

# label vocabulary (intent classes)
with open(label_words_path, mode='rb') as label_file:
    label_words = pickle.load(label_file)

In [2]:
# Sizes of the source, target and label vocabularies, one per line
for vocab in (source_words, target_words, label_words):
    print(len(vocab))

# Indices of the special tokens in the source vocabulary
for special_token in ('<pad>', '<eos>', '<sos>', '<unk>'):
    print(source_words[special_token])

945
133
27
1
3
2
0


In [11]:
# Define how the attention weights are computed
class Attention(nn.Module):
    """Additive ("concat") attention over a sequence of encoder outputs.

    The decoder state is tiled along the time axis, concatenated with every
    encoder output, passed through a linear layer + tanh, and reduced to one
    scalar score per time step by a bias-free linear layer.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # The input is the concatenation of the decoder state and one encoder
        # output, hence 2 * hidden_dim input features.
        self.attn = nn.Linear(2 * hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def concat_score(self, hidden, encoder_output):
        """Return the un-normalised scores, shape [batch_size, seq_len].

        hidden:         [batch_size, hidden_size]
        encoder_output: [batch_size, seq_len, hidden_size]
        """
        steps = encoder_output.size(1)
        # One copy of the decoder state per time step: [batch_size, seq_len, hidden_size]
        tiled_state = hidden[:, None, :].expand(-1, steps, -1)
        joined = torch.cat([tiled_state, encoder_output], dim=2)
        projected = self.attn(joined).tanh()  # [batch_size, seq_len, hidden_dim]
        return self.v(projected).squeeze(2)  # [batch_size, seq_len]

    def forward(self, hidden, encoder_output):
        """Return softmax-normalised weights, shape [batch_size, 1, seq_len]."""
        raw_scores = self.concat_score(hidden, encoder_output)
        weights = raw_scores.softmax(dim=1)  # normalise over the time axis
        return weights[:, None, :]
    
# Build the model
class BirnnAttention(nn.Module):
    """Joint slot-filling and intent-classification model.

    A bidirectional GRU encodes the source sentence. A GRUCell decoder then
    walks over the time steps; at each step it consumes the encoder output
    aligned with that step, the embedding of the previously predicted slot,
    and an attention context vector, and emits slot logits. The intent is
    predicted from the time-average of the attention contexts concatenated
    with the encoder outputs.

    Args:
        source_input_dim: size of the source vocabulary.
        source_emb_dim: dimension of the source word embeddings.
        hidden_dim: decoder hidden size; must be even because each encoder
            direction uses hidden_dim // 2 units.
        n_layers: number of encoder GRU layers.
        dropout: dropout between encoder GRU layers (ignored when n_layers == 1).
        pad_index: index of the padding token in the source vocabulary.
        slot_output_size: number of slot tags.
        intent_output_size: number of intent classes.
        slot_embed_dim: dimension of the slot-tag embeddings fed to the decoder.
        predict_flag: if True, forward() greedily decodes a single sentence and
            returns token ids; otherwise it returns logits for a whole batch.
        sos_index: slot-vocabulary index fed to the decoder at the first step
            (defaults to 2, the value that was previously hard-coded).
    """

    def __init__(self, source_input_dim, source_emb_dim, hidden_dim, n_layers, dropout, pad_index, slot_output_size, intent_output_size, slot_embed_dim, predict_flag, sos_index=2):
        super(BirnnAttention, self).__init__()
        if hidden_dim % 2 != 0:
            # An odd value would make the encoder output (2 * (hidden_dim // 2))
            # narrower than the decoder expects and fail later with a shape error.
            raise ValueError('hidden_dim must be even, got {}'.format(hidden_dim))
        self.pad_index = pad_index
        self.sos_index = sos_index
        self.hidden_dim = hidden_dim // 2  # per-direction size of the bidirectional encoder
        self.n_layers = n_layers
        self.slot_output_size = slot_output_size
        # Whether forward() runs in prediction mode
        self.predict_flag = predict_flag

        self.source_embedding = nn.Embedding(source_input_dim, source_emb_dim, padding_idx=pad_index)
        # Bidirectional GRU; the two directions together give hidden_dim features.
        # nn.GRU applies dropout only between stacked layers, so with a single
        # layer it has no effect and merely triggers a warning; pass 0 then.
        gru_dropout = dropout if n_layers > 1 else 0.0
        self.source_gru = nn.GRU(source_emb_dim, self.hidden_dim, n_layers, dropout=gru_dropout, bidirectional=True, batch_first=True)

        # Decoder cell input = aligned encoder output + slot embedding + attention context
        self.gru_cell = nn.GRUCell(slot_embed_dim + (2 * hidden_dim), hidden_dim)
        self.attention = Attention(hidden_dim)
        # Intent head: consumes [attention context ; encoder output]
        self.intent_output = nn.Linear(hidden_dim * 2, intent_output_size)
        # Slot head
        self.slot_output = nn.Linear(hidden_dim, slot_output_size)
        self.slot_embedding = nn.Embedding(slot_output_size, slot_embed_dim)

    def _encode(self, source_input, source_len):
        """Run the encoder; return (outputs [batch, seq_len, hidden], final state [batch, hidden])."""
        # source_embedded: [batch_size, seq_len, source_emb_dim]
        source_embedded = self.source_embedding(source_input)
        # enforce_sorted=True requires the batch to be sorted by decreasing length
        packed = torch.nn.utils.rnn.pack_padded_sequence(source_embedded, source_len, batch_first=True, enforce_sorted=True)
        packed_output, hidden = self.source_gru(packed)
        # NOTE(review): padding_value=self.pad_index fills padded *hidden states*
        # with the vocabulary pad index rather than 0.0. Kept unchanged so that
        # results stay identical for existing checkpoints - confirm whether
        # this is intended.
        source_output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True, padding_value=self.pad_index, total_length=source_input.shape[1])
        # hidden[-2] is the last layer's forward direction, hidden[-1] its backward direction
        source_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return source_output, source_hidden

    def _decode_step(self, aligned, prev_slot, source_hidden, source_output):
        """One decoder step.

        aligned:   [batch, hidden]  encoder output at the current time step
        prev_slot: [batch]          previously predicted slot ids
        Returns (slot logits [batch, slot_output_size], new hidden [batch, hidden],
        attention context [batch, hidden]).
        """
        slot_embedded = self.slot_embedding(prev_slot)  # [batch, slot_embed_dim]
        # Attention weights from the previous decoder state: [batch, 1, seq_len]
        attention_weights = self.attention(source_hidden, source_output)
        # Context vector = attention-weighted sum of the encoder outputs
        context = attention_weights.bmm(source_output).squeeze(1)  # [batch, hidden]
        cell_input = torch.cat([aligned, slot_embedded, context], dim=1)
        source_hidden = self.gru_cell(cell_input, source_hidden)
        slot_prediction = self.slot_output(source_hidden)
        return slot_prediction, source_hidden, context

    def _predict_intent(self, attention_context, source_output):
        """Intent logits from the time-averaged [attention context ; encoder output]."""
        combined = torch.cat([attention_context, source_output], dim=2)
        return self.intent_output(torch.mean(combined, dim=1))

    def forward(self, source_input, source_len):
        '''
        source_input:[batch_size, seq_len]
        source_len:[batch_size]

        Prediction mode returns (list of slot ids, intent id as a 0-dim tensor).
        Otherwise returns (slot logits [batch_size, seq_len, slot_output_size],
        intent logits [batch_size, intent_output_size]); slot position 0 is left
        as zeros because decoding starts at time step 1.
        '''
        if self.predict_flag:
            assert len(source_input) == 1, '预测时一次输入一句话'

        # 1. Encoder
        source_output, source_hidden = self._encode(source_input, source_len)
        # Create every new tensor on the encoder's device instead of relying on a
        # module-level `device` variable that may not exist.
        device = source_output.device
        aligns = source_output.transpose(0, 1)  # [seq_len, batch, hidden]: one encoder output per time step

        # 2. Decoder
        if self.predict_flag:
            seq_len = int(source_len[0])
            # Attention contexts collected for the intent head
            attention_context = torch.zeros(1, seq_len, self.hidden_dim * 2, device=device)
            output_tokens = []
            prev_slot = torch.tensor(self.sos_index, device=device).repeat(1)  # first decoder input: <sos>

            for s in range(seq_len):
                slot_prediction, source_hidden, context = self._decode_step(aligns[s], prev_slot, source_hidden, source_output)
                attention_context[:, s, :] = context
                prev_slot = slot_prediction.argmax(1)  # greedy choice feeds the next step
                output_tokens.append(prev_slot.item())

            intent_outputs = self._predict_intent(attention_context, source_output)
            return output_tokens, intent_outputs.squeeze().argmax()

        batch_size = source_input.shape[0]
        seq_len = source_input.shape[1]
        # Slot logits for every position
        slot_outputs = torch.zeros(batch_size, seq_len, self.slot_output_size, device=device)
        # Attention contexts collected for the intent head
        attention_context = torch.zeros(batch_size, seq_len, self.hidden_dim * 2, device=device)
        # Every sequence starts decoding from <sos>
        prev_slot = torch.tensor(self.sos_index, device=device).repeat(batch_size)

        # The decoder state starts as the encoder's final state; step 0 is skipped
        for t in range(1, seq_len):
            slot_prediction, source_hidden, context = self._decode_step(aligns[t], prev_slot, source_hidden, source_output)
            attention_context[:, t, :] = context
            slot_outputs[:, t, :] = slot_prediction
            # The most likely slot becomes the next decoder input
            prev_slot = slot_prediction.argmax(1)

        intent_outputs = self._predict_intent(attention_context, source_output)
        return slot_outputs, intent_outputs

In [13]:
# Hyper-parameters; load_state_dict below requires the resulting layer shapes
# to match the checkpoint.
source_emb_dim = 64
slot_embed_dim = 64
hidden_dim = 128
n_layers = 1
dropout = 0.5

model_path = os.path.join(os.getcwd(), "model.h5")

input_dim = len(source_words)  # size of the source vocabulary (number of words)
output_dim = len(target_words)  # size of the target vocabulary (number of slot/entity tags)
label_dim = len(label_words)  # size of the label vocabulary (number of intent classes)

# The final positional argument (True) puts the model in prediction mode
model = BirnnAttention(input_dim, source_emb_dim, hidden_dim, n_layers, dropout, source_words['<pad>'], output_dim, label_dim, slot_embed_dim, True)

# The model stays on the CPU here, so map the checkpoint tensors to the CPU as
# well; without map_location a checkpoint saved from a GPU cannot be loaded on a
# machine that has no CUDA device.
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()

sentence = "i would like to find a flight from charlotte to las vegas that makes a stop in st. louis"
sentence2 = "which airlines have first class flights today"  # alternative example, not used below
with torch.no_grad():
    tokenized = sentence.split()  # tokenize the sentence on whitespace
    tokenized.append('<eos>')
    indexed = [source_words[t] for t in tokenized]  # convert to integer sequence
    print(tokenized)
    print(indexed)
    tensor = torch.LongTensor(indexed)  # convert to tensor
    tensor = tensor.unsqueeze(0)  # add the batch dimension: [1, number of tokens]
    # In prediction mode the model returns (list of slot ids, intent id tensor)
    slot_outputs, intent_outputs = model(tensor, [len(tensor[0])])  # prediction
    intent = intent_outputs.detach().item()
    slot_prediction = [target_words.itos[t] for t in slot_outputs]

    print('slot_prediciton:{}'.format(' '.join(slot_prediction)))
    print('intent_prediction:{}'.format(label_words.itos[intent]))

['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis', '<eos>']
[13, 40, 29, 4, 87, 16, 11, 5, 100, 4, 90, 89, 34, 345, 16, 127, 18, 67, 144, 3]
slot_prediciton:o o o o o o o o b-fromloc.city_name o b-toloc.city_name i-toloc.city_name o o o o o b-stoploc.city_name i-stoploc.city_name <eos>
intent_prediction:flight
