In [9]:
import math
import torch
import numpy as np

### 1. 数据预处理

In [10]:
# S: 起始标记
# E: 结束标记
# P: padding，将当前序列补齐至最长序列长度的占位符
sentence = [
    # enc_input dec_input dec_output
    ['ich mochte ein bier P','S i want a beer .', 'i want a beer . E'],
    ['ich mochte ein cola P','S i want a coke .', 'i want a coke . E'],
]

# 词典，padding用0来表示

# 源词典
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}
src_vocab_size = len(src_vocab) # 6
# 目标词典（包含特殊符）
tgt_vocab = {'P':0,'i':1,'want':2,'a':3,'beer':4,'coke':5,'S':6,'E':7,'.':8}
# 反向映射词典，idx --> word
idx2word = {v: k for k, v in tgt_vocab.items()}
'''
{0: 'P',
 1: 'i',
 2: 'want',
 3: 'a',
 4: 'beer',
 5: 'coke',
 6: 'S',
 7: 'E',
 8: '.'}
'''
tgt_vocab_size = len(tgt_vocab) # 9

src_len = 5 # 输入序列enc_input的最长序列长度，其实就是最长的那句话的token数
tgt_len = 6 # 输出序列dec_input/dec_output的最长序列长度


In [11]:
sentence

[['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
 ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']]

In [5]:
src_vocab

{'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}

In [6]:
tgt_vocab

{'P': 0,
 'i': 1,
 'want': 2,
 'a': 3,
 'beer': 4,
 'coke': 5,
 'S': 6,
 'E': 7,
 '.': 8}

In [12]:
# 这个函数把原始输入序列转换成token表示
def make_data(sentence):
    enc_inputs, dec_inputs, dec_outputs = [], [], []
    for i in range(len(sentence)):
        enc_input = [src_vocab[word] for word in sentence[i][0].split()]
        dec_input = [tgt_vocab[word] for word in sentence[i][1].split()]
        dec_output = [tgt_vocab[word] for word in sentence[i][2].split()]
        
        enc_inputs.append(enc_input)
        dec_inputs.append(dec_input)
        dec_outputs.append(dec_output)
    
    # LongTensor是专用于存储整型的，Tensor则可以存浮点、整数、bool等多种类型
    return torch.LongTensor(enc_inputs), torch.LongTensor(dec_inputs), torch.LongTensor(dec_outputs)

enc_inputs, dec_inputs, dec_outputs = make_data(sentence)

print(' enc_inputs: \n', enc_inputs)  # enc_inputs: [2,5]
print(' dec_inputs: \n', dec_inputs)  # dec_inputs: [2,6]
print(' dec_outputs: \n', dec_outputs) # dec_outputs: [2,6]

 enc_inputs: 
 tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])
 dec_inputs: 
 tensor([[6, 1, 2, 3, 4, 8],
        [6, 1, 2, 3, 5, 8]])
 dec_outputs: 
 tensor([[1, 2, 3, 4, 8, 7],
        [1, 2, 3, 5, 8, 7]])


In [13]:
# 使用Dataset加载数据
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super(MyDataSet, self).__init__()
        self.enc_inputs = enc_inputs
        self.dec_inputs = dec_inputs
        self.dec_outputs = dec_outputs
        
    def __len__(self):
        # enc_inputs.shape = [2, 5], 返回的是2
        return self.enc_inputs.shape[0]
    
    def __getitem__(self, idx):
        return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_outputs[idx]

# 使用DataLoader加载数据
loader = torch.utils.data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs), batch_size=2, shuffle=True)

### 2.模型参数

In [16]:
# 一个词的向量长度
d_model = 512

# FFN的隐藏层神经元个数
d_ff = 2048

# 多头注意力后的q, k, v词向量长度，这里是512/8=64
# 原文：queries and kes of dimention d_k,and values of dimension d_v .所以q和k的长度都用d_k来表示
d_k = d_v = 64

# Encoder Layer 和 Decoder Layer 的个数
n_layers = 6

# 多头注意力中head的个数，原文：we employ h = 8 parallel attention layers, or heads
n_heads = 8


- Transformer包含Encoder和Decoder
- Encoder和Decoder各自包含6个Layer
- Encoder Layer中包含 Self Attention 和 FFN 两个Sub Layer
- Decoder Layer中包含 Masked Self Attention、 Cross Attention、 FFN 三个Sub Layer

### 3. Positional Encoding

> 用于为输入的词向量进行位置编码

原文：The positional encodings have the same dimension d_model as the embeddings, so that the two can be summed

In [37]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        # 开始位置编码部分,先生成一个max_len * d_model 的矩阵，即5000 * 512
        # 5000是一个句子中最多的token数，512是一个token用多长的向量来表示，5000*512这个矩阵用于表示一个句子的信息
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) # pos：[max_len,1],即[5000,1]
        div_term = pos / pow(10000.0, torch.arange(0, d_model, 2).float() / d_model) # div_term：[5000,256]
        
        pe[:, 0::2] = torch.sin(div_term)
        pe[:, 1::2] = torch.cos(div_term)
        # 一个句子要做一次pe，一个batch中会有多个句子，所以增加一维用来和输入的一个batch的数据相加时做广播
        pe = pe.unsqueeze(0) # [5000,512] -> [1,5000,512] 
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        '''x: [batch_size, seq_len, d_model]'''
        # 5000是我们预定义的最大的seq_len，就是说我们把最多的情况pe都算好了，用的时候用多少就取多少
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)        

In [17]:
pe = torch.zeros(10, 6)

pos = torch.arange(0, 10, dtype=torch.float).unsqueeze(1)
div_term = pos / pow(10000.0, torch.arange(0, 6, 2).float() / 6) # div_term：[5000,256]
print(div_term)
pe[:, 0::2] = torch.sin(div_term)
# pe[:, 1::2] = torch.cos(div_term)
print(pe)

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 4.6416e-02, 2.1544e-03],
        [2.0000e+00, 9.2832e-02, 4.3089e-03],
        [3.0000e+00, 1.3925e-01, 6.4633e-03],
        [4.0000e+00, 1.8566e-01, 8.6177e-03],
        [5.0000e+00, 2.3208e-01, 1.0772e-02],
        [6.0000e+00, 2.7850e-01, 1.2927e-02],
        [7.0000e+00, 3.2491e-01, 1.5081e-02],
        [8.0000e+00, 3.7133e-01, 1.7235e-02],
        [9.0000e+00, 4.1774e-01, 1.9390e-02]])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0000,  0.0464,  0.0000,  0.0022,  0.0000],
        [ 0.9093,  0.0000,  0.0927,  0.0000,  0.0043,  0.0000],
        [ 0.1411,  0.0000,  0.1388,  0.0000,  0.0065,  0.0000],
        [-0.7568,  0.0000,  0.1846,  0.0000,  0.0086,  0.0000],
        [-0.9589,  0.0000,  0.2300,  0.0000,  0.0108,  0.0000],
        [-0.2794,  0.0000,  0.2749,  0.0000,  0.0129,  0.0000],
        [ 0.6570,  0.0000,  0.3192,  0.0000,  0.0151,  0.0000],
        [ 0.9894,  0.0000, 

In [14]:
pow(10000.0, torch.arange(0, 6, 2).float() / 6)

tensor([  1.0000,  21.5443, 464.1590])

### 4、Pad Mask

In [20]:
# 为enc_input和dec_input做一个mask，把占位符P的token（就是0） mask掉
# 返回一个[batch_size, len_q, len_k]大小的布尔张量，True是需要mask掉的位置
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # seq_k.data.eq(0)返回一个等大的布尔张量，seq_k元素等于0的位置为True,否则为False
    # unsqueeze(1)是为了在最后增加一个维度，变成[batch_size, 1, len_k]
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)
    # 为每一个q提供一份k, 所以要把第二维度扩展q次
    # 另注意expand并非真正加倍了内存，只是重复了引用，对任意引用的修改都会修改原始值
    return pad_attn_mask.expand(batch_size, len_q, len_k)    
    

In [67]:
enc_inputs.data.eq(0).unsqueeze(1).expand(2, 6, 5)


tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]],

        [[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])

In [66]:
enc_inputs.data.eq(0).unsqueeze(1)

tensor([[[False, False, False, False,  True]],

        [[False, False, False, False,  True]]])

In [20]:
a = torch.arange(0, 12).view(3, 4)

In [27]:
a.unsqueeze(1)

tensor([[[ 0,  1,  2,  3]],

        [[ 4,  5,  6,  7]],

        [[ 8,  9, 10, 11]]])

In [28]:
a.unsqueeze(1).expand(3, 2, 4)

tensor([[[ 0,  1,  2,  3],
         [ 0,  1,  2,  3]],

        [[ 4,  5,  6,  7],
         [ 4,  5,  6,  7]],

        [[ 8,  9, 10, 11],
         [ 8,  9, 10, 11]]])

### 5、Subsequence Mask

In [21]:
# 用于获取对后续位置的掩码，防止在预测过程中看到未来时刻的输入
# 原文：to prevent positions from attending to subsequent positions
def get_attn_subsequence_mask(seq):
    """seq: [batch_size, tgt_len]"""
    # batch_size个 tgt_len * tgt_len的mask矩阵
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    # 生成一个上三角矩阵，对角线以下的元素都是0
    # np.triu 是生成一个 upper triangular matrix 上三角矩阵，k是相对于主对角线的偏移量
    # k=1意为不包含主对角线（从主对角线向上偏移1开始）    
    subsequence_mask = np.triu(np.ones(attn_shape), k=1)
    subsequence_mask = torch.from_numpy(subsequence_mask).byte() # 因为只有0、1所以用byte节省内存
    return subsequence_mask

### 6、ScaledDotProductAttention

> 用于计算缩放点积注意力，在MultiHeadAttention中被调用

In [22]:
class ScaledDotProductionAttention(torch.nn.Module):
    def __intit__(self):
        super(ScaledDotProductionAttention, self).__init__()
    
    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v] 全文两处用到注意力，一处是self attention，另一处是co attention，前者不必说，后者的k和v都是encoder的输出，所以k和v的形状总是相同的
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        '''
        # 1) 计算注意力分数Q*K^T/sqrt(d_k)
        scores =  torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores: [batch_size, n_heads, len_q, len_k]
        # 2) mask掉padding的token，进行softmax
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is True.
        attn = torch.nn.Softmax(dim=-1)(scores) # attn: [batch_size, n_heads, len_q, len_k]
        # 3) 对V加权求和
        context = torch.matmul(attn, V) # context: [batch_size, n_heads, len_q, d_v]
        return context

In [37]:
ll = torch.arange(0, 12, dtype=torch.float32).view(3, 4)
ll

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])

In [42]:
torch.nn.Softmax(dim=-1)(ll)

tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]])

In [43]:
a = torch.arange(0, 9).view(3, 3)
b = torch.arange(0, 9).view(3, 3)
print(a, b)

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]) tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])


In [44]:
torch.matmul(a, b)

tensor([[ 15,  18,  21],
        [ 42,  54,  66],
        [ 69,  90, 111]])

### 7、MultiHeadAttention

> 多头注意力的实现，Transformer的核心

In [45]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = torch.nn.Linear(d_model, d_model)
        self.W_K = torch.nn.Linear(d_model, d_model)
        self.W_V = torch.nn.Linear(d_model, d_model)
        self.concat = torch.nn.Linear(d_model, d_model)
        
    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        input_Q: [batch_size, len_q, d_model] len_q是作为query的句子的长度，比如enc_inputs（2,5,512）作为输入，那句子长度5就是len_q
        input_K: [batch_size, len_k, d_model]
        input_K: [batch_size, len_v(len_k), d_model]
        attn_mask: [batch_size, seq_len, seq_len]
        '''
        residual, batch_size = input_Q, input_Q.size(0)
        # 1) 线性变换 [batch_size, len_q, d_model] -> [batch_size, n_heads, seq_len, d_k/d_v]
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # Q: [batch_size, n_heads, len_q, d_k]
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # K: [batch_size, n_heads, len_k, d_k]
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1, 2) # V: [batch_size, n_heads, len_v(=len_k), d_v]
        
        # 2) 计算注意力
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask: [batch_size, n_heads, len_q, len_k]
        context = ScaledDotProductionAttention()(Q, K, V, attn_mask)
        
        # 3) 多头拼接
        context = torch.cat([context[:, i, :, :] for i in range(context.size(1))], dim=-1) # context: [batch_size, len_q, n_heads * d_v]
        output = self.concat(context) # output: [batch_size, len_q, d_model]
        return torch.nn.LayerNorm(d_model)(output + residual) # output: [batch_size, len_q, d_model]
        '''        
        最后的concat部分，网上的大部分实现都采用的是下面这种方式（也是哈佛NLP团队的写法）
        context = context.transpose(1, 2).reshape(batch_size, -1, d_model)
        output = self.linear(context)
        但是我认为这种方式拼回去会使原来的位置乱序，于是并未采用这种写法，两种写法最终的实验结果是相近的
        '''
           

### 8、FeedForward Networks
对应Feed Forward和Add & Norm

In [24]:
class PositionwiseFeedForward(torch.nn.Module):
    def __init__(self):
        super(PositionwiseFeedForward, self).__init__()
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(d_model, d_ff, bias=False),
            torch.nn.ReLU(),
            torch.nn.Linear(d_ff, d_model, bias=False)
        )
        
    def forward(self, inputs):
        '''inputs: [batch_size, seq_len, d_model]'''
        residual = inputs
        output = self.fc(inputs)
        return torch.nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]

### 9、Encoder Layer
包含一个MultiHeadAttention和一个FFN

In [25]:
class EncoderLayer(torch.nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PositionwiseFeedForward()
    
    def forward(self, enc_inputs, enc_self_attn_mask):
        '''
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len] 
        '''
        # Q、K、V都是enc_inputs
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, src_len, d_model]
        return enc_outputs # enc_outputs: [batch_size, src_len, d_model]

### 10、Encoder
包含一个源序列词向量嵌入nn.Embedding、一个位置编码PositionalEncoding和6个Encoder Layer

In [39]:
class Encoder(torch.nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = torch.nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = torch.nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        
    def forward(self, enc_inputs):
        '''enc_inputs: [batch_size, src_len]'''
        enc_outputs = self.src_emb(enc_inputs) # enc_outputs: [batch_size, src_len, d_model]
        enc_outputs = self.pos_emb(enc_outputs) # enc_outputs: [batch_size, src_len, d_model]
        # Encoder中是self attention, 所以传入的Q、K都是enc_inputs
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs) # enc_self_attn_mask: [batch_size, src_len, src_len]
        for layer in self.layers:
            enc_outputs = layer(enc_outputs, enc_self_attn_mask)
        return enc_outputs # enc_outputs: [batch_size, src_len, d_model]

### 11、DecoderLayer
包含两个MultiHeadAttention和一个FFN

In [27]:
class DecoderLayer(torch.nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PositionwiseFeedForward()
        
    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        '''
        dec_inputs: [batch_size, tgt_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
        dec_enc_attn_mask: [batch_size, tgt_len, src_len]
        '''
        dec_outputs = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask) # dec_outputs: [batch_size, tgt_len, d_model]
        #!!! Decoder中的Q是dec_outputs, K和V是enc_outputs
        dec_outputs = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs) # dec_outputs: [batch_size, tgt_len, d_model]
        
        return dec_outputs # dec_outputs: [batch_size, tgt_len, d_model]

### 12、Decoder
包含一个目标序列词向量序列嵌入nn.Embeding、一个位置编码PositionEncoding和6个Decoder Layer

In [84]:
class Decoder(torch.nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = torch.nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = torch.nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    
    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        '''
        dec_inputs: [batch_size, tgt_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]
        '''
        dec_outputs = self.tgt_emb(dec_inputs)
        print(dec_outputs.size())
        dec_outputs = self.pos_emb(dec_outputs)
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs) # dec_self_attn_pad_mask: [batch_size, tgt_len, tgt_len]
        print('dec_self_attn_pad_mask:', dec_self_attn_pad_mask)
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs)
        print('dec_self_attn_subsequence_mask:', dec_self_attn_subsequence_mask)
        # 将两个mask叠加，布尔值可以视为0和1，和大于0的位置是需要被mask掉的，赋为True，和为0的位置是有意义的为False
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + 
                                       dec_self_attn_subsequence_mask), 0)
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs) # dec_enc_attn_mask: [batch_size, tgt_len, src_len]
        print('dec_enc_attn_mask:', dec_enc_attn_mask.size())
        for layer in self.layers:
            dec_outputs = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
        return dec_outputs # dec_outputs: [batch_size, tgt_len, d_model]   
        

### 13、Transformer
包含一个Encoder、一个Decoder、一个nn.Linear

In [29]:
class Transformer(torch.nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.projection = torch.nn.Linear(d_model, tgt_vocab_size, bias=False)
        
    def forward(self, enc_inputs, dec_inputs):
        '''
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        '''
        # 1) Encoder
        enc_outputs = self.encoder(enc_inputs)
        # 2) Decoder
        dec_outputs = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs) # dec_logits: [batch_size, tgt_len, tgt_vocab_size]
        
        return dec_logits.view(-1, dec_logits.size(-1)) # [batch_size * tgt_len, tgt_vocab_size]
        '''最后变形的原因是：nn.CrossEntropyLoss接收的输入的第二个维度必须是类别'''

### 14、训练

In [46]:
model = Transformer()
model.train()
# 损失函数，忽略为0的类别不对其计算loss（因为是padding）
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0/99)

# 训练
for epoch in range(1000):
    for enc_inputs, dec_inputs, dec_outputs in loader:
        '''
        enc_inputs: [batch_size, src_len] [2, 5]
        dec_inputs: [batch_size, tgt_len] [2, 6]
        dec_outputs: [batch_size, tgt_len] [2, 6]
        '''
        outputs = model(enc_inputs, dec_inputs) # outputs: [batch_size * tgt_len, tgt_vocab_size]
        # outputs: [batch_size * tgt_len, tgt_vocab_size], dec_outputs: [batch_size, tgt_len]
        loss = criterion(outputs, dec_outputs.view(-1))
        
        # 更新权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f'Epoch [{epoch+1}/1000], Loss: {loss.item():.4f}')
torch.save(model, 'MyTransformer.pth')

Epoch [1/1000], Loss: 2.1365
Epoch [2/1000], Loss: 2.0420
Epoch [3/1000], Loss: 1.9409
Epoch [4/1000], Loss: 1.8821
Epoch [5/1000], Loss: 1.7687
Epoch [6/1000], Loss: 1.7895
Epoch [7/1000], Loss: 1.5833
Epoch [8/1000], Loss: 1.6049
Epoch [9/1000], Loss: 1.4607
Epoch [10/1000], Loss: 1.5277
Epoch [11/1000], Loss: 1.4173
Epoch [12/1000], Loss: 1.3674
Epoch [13/1000], Loss: 1.3935
Epoch [14/1000], Loss: 1.2539
Epoch [15/1000], Loss: 1.1713
Epoch [16/1000], Loss: 1.1932
Epoch [17/1000], Loss: 1.1360
Epoch [18/1000], Loss: 1.0958
Epoch [19/1000], Loss: 1.0440
Epoch [20/1000], Loss: 1.0631
Epoch [21/1000], Loss: 0.9475
Epoch [22/1000], Loss: 0.9325
Epoch [23/1000], Loss: 0.9326
Epoch [24/1000], Loss: 0.8440
Epoch [25/1000], Loss: 0.8124
Epoch [26/1000], Loss: 0.8152
Epoch [27/1000], Loss: 0.8228
Epoch [28/1000], Loss: 0.8000
Epoch [29/1000], Loss: 0.7401
Epoch [30/1000], Loss: 0.6732
Epoch [31/1000], Loss: 0.7317
Epoch [32/1000], Loss: 0.6956
Epoch [33/1000], Loss: 0.6570
Epoch [34/1000], Lo

### 15、测试

In [90]:
# 原文使用的是大小为4的beam search，这里为简单起见使用更简单的greedy贪心策略生成预测，不考虑候选，每一步选择概率最大的作为输出
# 如果不使用greedy_decoder，那么我们之前实现的model只会进行一次预测得到['i']，并不会自回归，所以我们利用编写好的Encoder-Decoder来手动实现自回归（把上一次Decoder的输出作为下一次的输入，直到预测出终止符）
def greedy_decoder(model, enc_input, start_symbol):
    '''
    enc_input: [batch_size, src_len] [1, seq_len] 对应一句话
    '''
    enc_outputs = model.encoder(enc_input) # enc_outputs: [1, src_len, 512]
    # 生成一个1行0列的，和enc_inputs.data类型相同的空张量，待后续填充
    dec_input = torch.zeros(1, 0).type_as(enc_input.data) # dec_input: [1, 0]
    # start_symbol是S，即开始标记
    next_symbol = start_symbol
    flag = True
    while flag:
        # dec_input.detach() 创建 dec_input 的一个分离副本
        # 生成了一个 只含有next_symbol的（1,1）的张量
        # -1 表示在最后一个维度上进行拼接cat
        # 这行代码的作用是将next_symbol拼接到dec_input中，作为新一轮decoder的输入
        dec_input = torch.cat([dec_input.detach(), torch.tensor([[next_symbol]], dtype=enc_input.dtype)], dim=-1) # dec_input: [1,当前词数]
        print(dec_input.shape)
        dec_outputs = model.decoder(dec_input, enc_input, enc_outputs) # dec_outputs: [1, tgt_len, 512]
        print('dec_outputs', dec_outputs.size())
        projected = model.projection(dec_outputs) # projected: [1, 当前生成的tgt_len, tgt_vocab_size]
        print('projected', projected.size())
        # max返回的是一个元组（最大值，最大值对应的索引），所以用[1]取到最大值对应的索引, 索引就是类别，即预测出的下一个词
        # keepdim为False会导致减少一维
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1] # prob: [1]
        # prob是一个一维的列表，包含目前为止依次生成的词的索引，最后一个是新生成的（即下一个词的类别）
        # 因为注意力是依照前面的词算出来的，所以后生成的不会改变之前生成的
        next_symbol = prob.data[-1]
        if next_symbol == tgt_vocab['.']:
            flag = False        
        print(next_symbol)
    return dec_input    # dec_input: [1, tgt_len]

In [91]:
model = torch.load('MyTransformer.pth')
model.eval()

with torch.no_grad():
    enc_inputs, _, _ = next(iter(loader))
    for i in range(len(enc_inputs)):
        greedy_dec_input = greedy_decoder(model, enc_inputs[i].view(1, -1), tgt_vocab['S'])
        # predict = model(enc_inputs[i].view(1, -1), greedy_dec_input)
        predict  = model(enc_inputs[i].view(1, -1), greedy_dec_input) # predict: [batch_size * tgt_len, tgt_vocab_size]
        predict = predict.data.max(dim=-1, keepdim=False)[1]

        print('greedy_dec_input:', greedy_dec_input)
        print('predict:', predict)
        # print(enc_inputs[i], '->', [idx2word[n.item()] for n in predict])

torch.Size([1, 1])
torch.Size([1, 1, 512])
dec_self_attn_pad_mask: tensor([[[False]]])
dec_self_attn_subsequence_mask: tensor([[[0]]], dtype=torch.uint8)
dec_enc_attn_mask: torch.Size([1, 1, 5])
dec_outputs torch.Size([1, 1, 512])
projected torch.Size([1, 1, 9])
tensor(1)
torch.Size([1, 2])
torch.Size([1, 2, 512])
dec_self_attn_pad_mask: tensor([[[False, False],
         [False, False]]])
dec_self_attn_subsequence_mask: tensor([[[0, 1],
         [0, 0]]], dtype=torch.uint8)
dec_enc_attn_mask: torch.Size([1, 2, 5])
dec_outputs torch.Size([1, 2, 512])
projected torch.Size([1, 2, 9])
tensor(2)
torch.Size([1, 3])
torch.Size([1, 3, 512])
dec_self_attn_pad_mask: tensor([[[False, False, False],
         [False, False, False],
         [False, False, False]]])
dec_self_attn_subsequence_mask: tensor([[[0, 1, 1],
         [0, 0, 1],
         [0, 0, 0]]], dtype=torch.uint8)
dec_enc_attn_mask: torch.Size([1, 3, 5])
dec_outputs torch.Size([1, 3, 512])
projected torch.Size([1, 3, 9])
tensor(3)
torch

In [51]:
for i in range(len(enc_inputs)):
    enc_outputs = model.encoder(enc_inputs[i].view(1, -1))
    print(enc_outputs.shape)

torch.Size([1, 5, 512])
torch.Size([1, 5, 512])


In [70]:
torch.nn.Embedding(10, d_model)(dec_inputs[0].view(1, -1)).shape

torch.Size([1, 6, 512])

In [58]:
enc_inputs

tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])