# 1 - 数据预处理

In [22]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.autograd import Variable

# Special symbols used in the toy parallel corpus:
# S: marks the start of the decoder input
# E: marks the end of the decoder output
# P: padding symbol that fills a sequence shorter than the fixed number of time steps
sentences = [
        # enc_input           dec_input         dec_output
        ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
        ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
]

# The padding symbol must map to index 0: the pad mask compares against 0 and
# the training loss ignores index 0.
src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
src_vocab_size = len(src_vocab)

tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
# Invert the word -> index mapping explicitly. Enumerating the dict keys is only
# correct while the insertion order happens to coincide with the index values.
idx2word = {index: word for word, index in tgt_vocab.items()}
tgt_vocab_size = len(tgt_vocab)

src_len = 5 # enc_input max sequence length
tgt_len = 6 # dec_input(=dec_output) max sequence length
def make_data(sentences):
    """Convert sentence triples into index tensors.

    Each item of ``sentences`` is indexed as ``[enc_input, dec_input, dec_output]``.
    Every string is split on whitespace; encoder tokens are looked up in
    ``src_vocab`` and decoder tokens in ``tgt_vocab``.

    Returns a tuple of three ``torch.LongTensor`` objects (encoder inputs,
    decoder inputs, decoder outputs) with one row per sentence triple.
    """
    def to_indices(text, vocab):
        # Map a whitespace-separated string to its list of vocabulary indices.
        return [vocab[token] for token in text.split()]

    encoded = [
        (
            to_indices(triple[0], src_vocab),
            to_indices(triple[1], tgt_vocab),
            to_indices(triple[2], tgt_vocab),
        )
        for triple in sentences
    ]
    enc_rows = [row[0] for row in encoded]
    dec_in_rows = [row[1] for row in encoded]
    dec_out_rows = [row[2] for row in encoded]
    return torch.LongTensor(enc_rows), torch.LongTensor(dec_in_rows), torch.LongTensor(dec_out_rows)

# Encode the toy corpus into index tensors: encoder inputs, decoder inputs and decoder targets.
enc_inputs, dec_inputs, dec_outputs = make_data(sentences)

class MyDataSet(Data.Dataset):
    """Dataset that serves aligned (enc_input, dec_input, dec_output) samples."""

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        """Store the three parallel collections; they are indexed together."""
        super().__init__()
        self.enc_inputs = enc_inputs
        self.dec_inputs = dec_inputs
        self.dec_outputs = dec_outputs

    def __len__(self):
        """Number of samples, taken from the first dimension of ``enc_inputs``."""
        sample_count = self.enc_inputs.shape[0]
        return sample_count

    def __getitem__(self, idx):
        """Return the ``idx``-th entry of each stored collection as a 3-tuple."""
        enc_item = self.enc_inputs[idx]
        dec_in_item = self.dec_inputs[idx]
        dec_out_item = self.dec_outputs[idx]
        return enc_item, dec_in_item, dec_out_item

# One sample per batch, reshuffled on every pass over the dataset.
loader = Data.DataLoader(
    MyDataSet(enc_inputs, dec_inputs, dec_outputs),
    batch_size=1,
    shuffle=True,
)

In [23]:
# Print every (encoder input, decoder input, decoder target) batch produced by the loader.
# NOTE: the loop targets rebind the module-level enc_inputs / dec_inputs / dec_outputs names,
# so after this cell they hold the last batch rather than the full tensors from make_data.
for enc_inputs, dec_inputs, dec_outputs in loader:
  print(enc_inputs, dec_inputs, dec_outputs)

tensor([[1, 2, 3, 4, 0]]) tensor([[6, 1, 2, 3, 4, 8]]) tensor([[1, 2, 3, 4, 8, 7]])
tensor([[1, 2, 3, 5, 0]]) tensor([[6, 1, 2, 3, 5, 8]]) tensor([[1, 2, 3, 5, 8, 7]])


# 2 - PositionalEncoding

In [24]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence of embeddings.

    Args:
        d_model: size of the last (feature) dimension of the input.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so the cosine part is truncated to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer: saved and moved with the module, but not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape [batch, seq_len, d_model]."""
        # Buffers never require grad, so no Variable wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [25]:
# Feed an all-zero input through the positional encoding so the output shows the encoding itself.
# eval() is never called here, so the dropout layer (default p=0.1) is active:
# in the recorded output some entries are zeroed and the rest are scaled by 1 / 0.9.
pe = PositionalEncoding(4)
a = torch.zeros(1, 2, 4)
print (f'Before PE : {a}\n')
b = pe(a)
print (f'After PE : {b}\n')

Before PE : tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

After PE : tensor([[[0.0000, 1.1111, 0.0000, 1.1111],
         [0.9350, 0.0000, 0.0111, 1.1111]]])



# 3 - Mask

In [26]:
def get_attn_pad_mask(seq_q, seq_k, pad_idx=0):
    """Build the padding mask for attention from ``seq_q`` onto ``seq_k``.

    Args:
        seq_q: index tensor of shape [batch_size, len_q]; only its length is used.
        seq_k: index tensor of shape [batch_size, len_k]; positions equal to
            ``pad_idx`` are treated as padding.
        pad_idx: vocabulary index of the padding token (defaults to 0).

    Returns:
        Boolean tensor of shape [batch_size, len_q, len_k] in which ``True``
        marks key positions that hold padding and must be masked out.
    """
    len_q = seq_q.size(1)
    batch_size, len_k = seq_k.size()
    # True where the key token is padding; the same row is shared by every query position.
    pad_attn_mask = seq_k.eq(pad_idx).unsqueeze(1)  # [batch_size, 1, len_k]
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # [batch_size, len_q, len_k]

In [27]:
# Self-attention case: query and key are the same sequence, whose last token is the pad index 0.
seq_k = torch.tensor([[1, 2, 3, 5, 0]])
print(get_attn_pad_mask(seq_k, seq_k))
# Query and key of different lengths: 6 query positions against the same 5 key positions.
seq_q = torch.tensor([[6, 1, 2, 3, 5, 8]])
print(get_attn_pad_mask(seq_q, seq_k))

tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])
tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])


In [28]:
def get_attn_subsequence_mask(seq):
    '''
    Build the "no peeking ahead" mask for decoder self-attention.

    seq: [batch_size, tgt_len]

    Returns a uint8 tensor of shape [batch_size, tgt_len, tgt_len], created on
    the same device as seq, in which entry [b, i, j] is 1 when j > i
    (position j lies in the future of position i) and 0 otherwise.
    '''
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    # Build the strictly upper-triangular mask directly in torch on seq's device;
    # torch.triu operates on the last two dimensions of a batched tensor.
    all_ones = torch.ones(attn_shape, dtype=torch.uint8, device=seq.device)
    subsequence_mask = torch.triu(all_ones, diagonal=1)
    return subsequence_mask # [batch_size, tgt_len, tgt_len]

In [29]:
# Combine the look-ahead mask and the padding mask for one example sequence.
seq = torch.tensor([[1, 2, 3, 5, 0]])
a=get_attn_subsequence_mask(seq)
print(a)
b=get_attn_pad_mask(seq, seq)
print(b)
c = torch.gt((a + b), 0) # In the decoder, not only "pad" positions but also information from future time steps must be masked.
print(c)

tensor([[[0, 1, 1, 1, 1],
         [0, 0, 1, 1, 1],
         [0, 0, 0, 1, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0]]], dtype=torch.uint8)
tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])
tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])


# 4 - Attention

In [41]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by a residual connection and layer normalization.

    Args:
        heads: number of attention heads; must divide ``d_model``.
        d_model: feature size of the inputs and of the output.
        dropout: dropout probability handed to the attention function.
    """
    def __init__(self, heads, d_model, dropout = 0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
        # Registered once here so that its affine parameters are trained, saved in
        # the state dict and moved together with the module. Creating the layer
        # inside forward() would produce a fresh, untrained layer on every call.
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask):
        """Attend from ``q`` onto ``k``/``v``.

        Args:
            q: [batch_size, len_q, d_model]
            k: [batch_size, len_k, d_model]
            v: [batch_size, len_k, d_model]
            mask: [batch_size, len_q, len_k]; it is handed to ``attention`` after
                being broadcast over the heads.

        Returns:
            Tensor of shape [batch_size, len_q, d_model]:
            ``LayerNorm(out(concat_heads) + q)``.
        """
        residual, bs = q, q.size(0)
        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * h * seq_len * d_k
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # Share the same mask across all heads: [bs, h, len_q, len_k].
        # expand() creates a view instead of copying the mask h times.
        mask = mask.unsqueeze(1).expand(-1, self.h, -1, -1)

        # per-head attention output: [bs, h, len_q, d_k]
        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).reshape(bs, -1, self.d_model)
        output = self.out(concat)
        # residual connection followed by the registered layer normalization
        output = self.layer_norm(output + residual)
        return output

In [42]:
import torch.nn.functional as F

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q: queries, [..., len_q, d_k]
        k: keys, [..., len_k, d_k]
        v: values, [..., len_k, d_v]
        d_k: key dimension used to scale the dot products.
        mask: optional tensor broadcastable to [..., len_q, len_k]; non-zero /
            True entries are excluded from attention. ``None`` disables masking.
        dropout: optional callable applied to the attention weights.

    Returns:
        Tensor [..., len_q, d_v]: the attention-weighted sum of the values.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # A large negative score drives the softmax weight of masked positions
        # to zero. The out-of-place variant leaves the raw scores untouched.
        scores = scores.masked_fill(mask.bool(), -1e9)

    weights = F.softmax(scores, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)

# 5 - 全连接前馈网络

In [32]:
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block with a residual connection and layer normalization.

    Args:
        d_model: feature size of the input and output.
        d_ff: hidden size of the inner linear layer.
    """
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.fc = nn.Sequential(
            nn.Linear(self.d_model, self.d_ff),
            nn.ReLU(),
            nn.Linear(self.d_ff, self.d_model)
        )
        # Registered once here so that its affine parameters are trained, saved in
        # the state dict and moved together with the module. Creating the layer
        # inside forward() would produce a fresh, untrained layer on every call.
        self.layer_norm = nn.LayerNorm(self.d_model)

    def forward(self, inputs):
        '''
        inputs: [batch_size, seq_len, d_model]

        Returns LayerNorm(fc(inputs) + inputs) with the same shape as inputs.
        '''
        residual = inputs
        output = self.fc(inputs)
        return self.layer_norm(output + residual) # [batch_size, seq_len, d_model]

# 6 - Encoder部分数据流动过程

In [33]:
# Walk one batch through the encoder building blocks with tiny dimensions so shapes are easy to follow.
tmp_d_model = 4
tmp_n_heads = 2
tmp_d_ff = 6
torch.manual_seed(1) 

# enc_inputs: [batch_size, seq_len]
enc_inputs, dec_inputs, dec_outputs = next(iter(loader))
print (f'enc_inputs = {enc_inputs}\n')

# Convert the indices to word embeddings; enc_outputs: [batch_size, seq_len, d_model]
enc_outputs = nn.Embedding(src_vocab_size, tmp_d_model)(enc_inputs)

# Add the positional encoding on top of the word embeddings; enc_outputs: [batch_size, seq_len, d_model]
enc_outputs = PositionalEncoding(tmp_d_model)(enc_outputs)
print (f'enc_outputs shape = {enc_outputs.shape}\n')

# Padding mask for encoder self-attention
enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs) 
print (f'enc_self_attn_mask shape - {enc_self_attn_mask.shape}, enc_self_attn_mask = {enc_self_attn_mask}\n')

# Multi-head self-attention (query, key and value are all the encoder representation)
att_test = MultiHeadAttention(tmp_n_heads, tmp_d_model)
out = att_test(enc_outputs, enc_outputs, enc_outputs, enc_self_attn_mask)

# Position-wise feed-forward network
ffn_test = PoswiseFeedForwardNet(tmp_d_model, tmp_d_ff)
# Final output vectors; out: [batch_size, seq_len, d_model]
out = ffn_test(out)
print (f'fnn output shape - {out.shape}\n')

enc_inputs = tensor([[1, 2, 3, 5, 0]])

enc_outputs shape = torch.Size([1, 5, 4])

enc_self_attn_mask shape - torch.Size([1, 5, 5]), enc_self_attn_mask = tensor([[[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True]]])

k shape -torch.Size([1, 2, 5, 2])
-------------
q shape -torch.Size([1, 2, 5, 2])
-------------
v shape -torch.Size([1, 2, 5, 2])
-------------
mask shape -torch.Size([1, 2, 5, 5]), mask = tensor([[[[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True]],

         [[False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],
          [False, False, False, False,  True],

# 7 - Encoder

Encoder 一共有 N=6 个 encoderlayer 堆叠在一起，每个 encoderlayer 由两个子层组成。第一个子层实现了“多头”的 Encoder Self-attention，对应如上的 MultiHeadAttention，第二个子层则是个简单的 Position-wise 的全连接前馈网络，对应如上的 PoswiseFeedForwardNet。

In [34]:
class EncoderLayer(nn.Module):
    """A single encoder block: self-attention sub-layer followed by a feed-forward sub-layer."""

    def __init__(self, heads, d_model, d_ff):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention(heads, d_model)
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        """Run one encoder block.

        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]

        Returns a tensor of shape [batch_size, src_len, d_model].
        """
        # Query, key and value are all the layer input (self-attention).
        attended = self.enc_self_attn(
            enc_inputs,
            enc_inputs,
            enc_inputs,
            enc_self_attn_mask,
        )
        return self.pos_ffn(attended)

最终的 Encoder 就是这 N=6 个 encoderlayer 堆叠在一起

In [35]:
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of ``n_layers`` encoder blocks."""

    def __init__(self, src_vocab_size, heads, d_model, d_ff, n_layers):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        blocks = [EncoderLayer(heads, d_model, d_ff) for _ in range(n_layers)]
        self.layers = nn.ModuleList(blocks)

    def forward(self, enc_inputs):
        """Encode a batch of source index sequences.

        enc_inputs: [batch_size, src_len]

        Returns the encoder representation, [batch_size, src_len, d_model].
        """
        # Embed the tokens, then add position information: [batch_size, src_len, d_model]
        hidden = self.pos_emb(self.src_emb(enc_inputs))
        # The same padding mask is reused by every block: [batch_size, src_len, src_len]
        pad_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        for block in self.layers:
            hidden = block(hidden, pad_mask)
        return hidden

# 8 - Decoder

相比较于 Encoder, Decoder 多经过了一层 Decoder-Encoder Attention 的计算， 并且在计算 Decoder Self-attention 的时候不仅要把 "pad" mask 掉，还要 mask 未来时刻的信息。

In [37]:
class DecoderLayer(nn.Module):
    """A single decoder block: masked self-attention, decoder-encoder attention, feed-forward."""

    def __init__(self, heads, d_model, d_ff):
        super().__init__()
        self.dec_self_attn = MultiHeadAttention(heads, d_model)
        self.dec_enc_attn = MultiHeadAttention(heads, d_model)
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff)

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """Run one decoder block.

        dec_inputs: [batch_size, tgt_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
        dec_enc_attn_mask: [batch_size, tgt_len, src_len]

        Returns a tensor of shape [batch_size, tgt_len, d_model].
        """
        # Self-attention over the decoder positions.
        self_attended = self.dec_self_attn(
            dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask
        )
        # Decoder states query the encoder output (keys and values).
        cross_attended = self.dec_enc_attn(
            self_attended, enc_outputs, enc_outputs, dec_enc_attn_mask
        )
        return self.pos_ffn(cross_attended)

In [38]:
class Decoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of ``n_layers`` decoder blocks."""
    def __init__(self, tgt_vocab_size, heads, d_model, d_ff, n_layers):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer(heads, d_model, d_ff) for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        '''
        dec_inputs: [batch_size, tgt_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]

        Returns the decoder representation, [batch_size, tgt_len, d_model].
        '''
        dec_outputs = self.tgt_emb(dec_inputs) # [batch_size, tgt_len, d_model]
        dec_outputs = self.pos_emb(dec_outputs) # [batch_size, tgt_len, d_model]
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs) # [batch_size, tgt_len, tgt_len]
        # Align the look-ahead mask with the pad mask's device and dtype before
        # combining them, so the combination also works when inputs are not on the CPU.
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs).to(
            device=dec_self_attn_pad_mask.device, dtype=torch.bool
        ) # [batch_size, tgt_len, tgt_len]
        # A position is masked if it is padding OR lies in the future.
        dec_self_attn_mask = dec_self_attn_pad_mask | dec_self_attn_subsequence_mask # [batch_size, tgt_len, tgt_len]

        # Decoder-encoder attention only needs to hide padded source positions.
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs) # [batch_size, tgt_len, src_len]

        for layer in self.layers:
            # dec_outputs: [batch_size, tgt_len, d_model]
            dec_outputs = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
        return dec_outputs

# 8 - Transformer

In [39]:
class Transformer(nn.Module):
    """Encoder-decoder model with a final linear projection onto the target vocabulary."""

    def __init__(self, src_vocab_size, tgt_vocab_size, heads, d_model, d_ff, n_layers):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, heads, d_model, d_ff, n_layers)
        self.decoder = Decoder(tgt_vocab_size, heads, d_model, d_ff, n_layers)
        self.projection = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, enc_inputs, dec_inputs):
        """Compute target-vocabulary logits for every decoder position.

        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]

        Returns logits flattened to [batch_size * tgt_len, tgt_vocab_size].
        """
        # [batch_size, src_len, d_model]
        memory = self.encoder(enc_inputs)
        # [batch_size, tgt_len, d_model]
        decoded = self.decoder(dec_inputs, enc_inputs, memory)
        # [batch_size, tgt_len, tgt_vocab_size]
        logits = self.projection(decoded)
        vocab_dim = logits.size(-1)
        return logits.view(-1, vocab_dim)

# 9 - 训练

In [43]:
# Model hyperparameters
d_model = 256
n_heads = 8
d_ff = 2048
n_layers = 6
# Derive the vocabulary sizes from the vocabularies so they cannot drift apart.
src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)

model = Transformer(src_vocab_size, tgt_vocab_size, n_heads, d_model, d_ff, n_layers)
# Index 0 is the padding token; it does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)

# Make sure dropout is active while training.
model.train()
for epoch in range(30):
    for enc_inputs, dec_inputs, dec_outputs in loader:
        # enc_inputs: [batch_size, src_len]
        # dec_inputs: [batch_size, tgt_len]
        # dec_outputs: [batch_size, tgt_len]
        # outputs: [batch_size * tgt_len, tgt_vocab_size]
        outputs = model(enc_inputs, dec_inputs)
        loss = criterion(outputs, dec_outputs.view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss.item()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0001 loss = 2.585765
Epoch: 0001 loss = 2.383128
Epoch: 0002 loss = 2.237804
Epoch: 0002 loss = 1.937003
Epoch: 0003 loss = 1.893852
Epoch: 0003 loss = 1.589763
Epoch: 0004 loss = 1.499871
Epoch: 0004 loss = 1.345889
Epoch: 0005 loss = 1.184624
Epoch: 0005 loss = 1.183304
Epoch: 0006 loss = 0.806398
Epoch: 0006 loss = 0.864008
Epoch: 0007 loss = 0.704574
Epoch: 0007 loss = 0.486642
Epoch: 0008 loss = 0.502043
Epoch: 0008 loss = 0.264302
Epoch: 0009 loss = 0.258390
Epoch: 0009 loss = 0.302220
Epoch: 0010 loss = 0.180751
Epoch: 0010 loss = 0.235268
Epoch: 0011 loss = 0.178867
Epoch: 0011 loss = 0.122559
Epoch: 0012 loss = 0.147611
Epoch: 0012 loss = 0.138665
Epoch: 0013 loss = 0.115273
Epoch: 0013 loss = 0.098724
Epoch: 0014 loss = 0.078117
Epoch: 0014 loss = 0.066898
Epoch: 0015 loss = 0.072905
Epoch: 0015 loss = 0.085518
Epoch: 0016 loss = 0.136289
Epoch: 0016 loss = 0.055864
Epoch: 0017 loss = 0.083160
Epoch: 0017 loss = 0.043318
Epoch: 0018 loss = 0.056979
Epoch: 0018 loss = 0

# 10 - 预测

In [72]:
def greedy_decoder(model, enc_input, tgt_len = 6, idx2word=idx2word, start_symbol=tgt_vocab["S"]):
    """
    Greedy decoding (beam search with K=1). At inference time the target sequence is
    unknown, so the decoder input is generated word by word: after every step the most
    likely next token is appended and the extended sequence is fed back into the decoder.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding

    The model is switched to evaluation mode and gradient tracking is disabled while
    decoding, so dropout does not randomize the prediction; the previous training mode
    is restored before returning.

    :param model: Transformer model exposing ``encoder``, ``decoder`` and ``projection``
    :param enc_input: The encoder input, indexed as [1, src_len]
    :param tgt_len: Maximum number of decoding steps
    :param idx2word: Mapping from target vocabulary index to word
    :param start_symbol: Index of the start symbol
    :return: The generated words joined by single spaces; generation stops at 'E',
        which is not included in the result
    """
    out = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            enc_outputs = model.encoder(enc_input)
            # Decoder input starts with the start symbol, on the same device as the source.
            ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=enc_input.device)
            for _ in range(tgt_len):
                dec_outputs = model.decoder(ys, enc_input, enc_outputs)
                # Logits for the last generated position only.
                projected = model.projection(dec_outputs).squeeze(0)[-1]
                ind = torch.max(projected, dim=-1)[1].item()

                next_word = idx2word[ind]
                if next_word == 'E':
                    break

                out.append(next_word)
                next_token = torch.full((1, 1), ind, dtype=torch.long, device=ys.device)
                ys = torch.cat([ys, next_token], dim=1)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return ' '.join(out)

# Test: take the source sentence of one batch from the loader and translate it greedily.
enc_inputs, _, _ = next(iter(loader))
# enc_inputs[0].view(1, -1) reshapes the first sample into a single-row batch.
greedy_dec_input = greedy_decoder(model, enc_inputs[0].view(1, -1))
print(greedy_dec_input)

i want a coke .
