#### 怎么用 PyTorch 实现一个完整的 Transformer 模型？
- Tokenize  
- Input Embedding  
- Positional Encoder  
- Transformer Block  
- Encoder  
- Decoder  
- Transformer  

Tokenizer

In [17]:
import copy
import importlib
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
class Tokenize:
    """Clean a sentence and split it into lower-cased token strings.

    Args:
        lang: Name of an importable module that exposes ``load()``; the
            object it returns must provide ``tokenizer(text)`` yielding
            items with a ``.text`` attribute (presumably a spaCy model
            package -- confirm against the caller).
    """

    # Compiled once at class creation instead of on every call.
    _PUNCT_RE = re.compile(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]")
    _SPACES_RE = re.compile(r"[ ]+")
    _BANGS_RE = re.compile(r"\!+")
    _COMMAS_RE = re.compile(r"\,+")
    _QUESTIONS_RE = re.compile(r"\?+")

    def __init__(self, lang):
        self.nlp = importlib.import_module(lang).load()

    def tokenizer(self, sentence):
        """Return the tokens of ``sentence`` as a list of lower-cased strings.

        Listed punctuation is replaced by spaces, runs of spaces, ``!``,
        ``,`` and ``?`` are collapsed to a single character, and
        whitespace-only tokens are dropped.

        Args:
            sentence: Any object; it is converted with ``str()`` first.

        Returns:
            list[str]: Token texts in order of appearance.
        """
        sentence = self._PUNCT_RE.sub(" ", str(sentence))
        sentence = self._SPACES_RE.sub(" ", sentence)
        sentence = self._BANGS_RE.sub("!", sentence)
        sentence = self._COMMAS_RE.sub(",", sentence)
        sentence = self._QUESTIONS_RE.sub("?", sentence)
        sentence = sentence.lower()
        # isspace() also drops tab-only tokens, which the old `!= " "`
        # comparison let through.
        return [
            tok.text
            for tok in self.nlp.tokenizer(sentence)
            if not tok.text.isspace()
        ]
        

Input Embedding

In [10]:
class Embedding(nn.Module):
    """Map integer token ids to learned ``d_model``-dimensional vectors.

    Args:
        vocab_size: Number of rows in the lookup table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        # One trainable row per vocabulary entry.
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        vectors = self.embed(x)
        return vectors

Positional Encoder

In [26]:
import math
class Positional_Encode(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings.

    The table follows the standard Transformer formulation: for each
    even column ``2k`` the value is ``sin(pos / 10000^(2k / d_model))``
    and the following odd column holds the cosine of the same angle.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        sen_len: Longest sequence length for which encodings are
            precomputed.
    """

    def __init__(self, d_model, sen_len=80):
        super().__init__()
        self.d_model = d_model

        position = torch.arange(sen_len, dtype=torch.float).unsqueeze(1)
        # The even column index already equals 2k, so it is divided by
        # d_model directly; multiplying it by 2 again would double the
        # exponent.
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(sen_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than
        # sine columns, hence the slice.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        # A buffer is not trained but follows the module across devices.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Scale ``x`` by ``sqrt(d_model)`` and add the position encodings.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose
                last dimension has size ``d_model``.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If the sequence is longer than the precomputed
                table.
        """
        # Scale the token embeddings up so that the positional encoding
        # is comparatively small next to them.
        x = x * math.sqrt(self.d_model)

        seq_len = x.shape[1]
        max_len = self.pe.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum of {max_len}"
            )
        return x + self.pe[:, :seq_len]
                

<font color=black size=3 face=雅黑>**Transformer Block**</font>

有了输入之后，接下来就要开始构建 Transformer Block 了。Transformer Block 主要由以下 4 个部分构成：

- self-attention layer
- normalization layer
- feed forward layer
- another normalization layer

In [8]:
# Scratch cell: draw a 3x5 tensor of standard-normal samples and display it.
a = torch.randn(3,5)
a

tensor([[-0.5316,  0.2449, -0.5929,  0.0301, -0.8545],
        [ 0.8579, -0.7765,  0.9545, -0.2580,  0.5755],
        [ 1.2618,  0.4861, -1.2943, -0.4913,  0.3012]])

In [113]:
#  self-attention layer
class Multihead_attention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        heads: Number of attention heads; must divide ``d_model``.
        d_model: Feature size of the inputs and of the output.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )

        self.h = heads
        self.d_model = d_model
        self.d_k = d_model // heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v`` for every head.

        Args:
            q, k, v: Tensors of shape ``[bs, heads, len, d_k]``.
            d_k: Per-head feature size used for scaling.
            mask: Optional tensor in which 0 marks positions that must
                not be attended to. A 2-D ``[bs, key_len]`` padding
                mask, a 3-D ``[bs or 1, query_len, key_len]`` mask, or
                anything already broadcastable to the 4-D scores.
            dropout: Optional callable applied to the attention weights.

        Returns:
            Tensor of shape ``[bs, heads, query_len, d_k]``.
        """
        # scores: [bs, heads, query_len, key_len]
        scores = q.matmul(k.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            if mask.dim() == 2:
                # Padding mask: the same keys are hidden for every head
                # and every query, so both axes are left to broadcasting.
                mask = mask.unsqueeze(1).unsqueeze(1)  # [bs, 1, 1, key_len]
            elif mask.dim() == 3:
                # Per-query mask (e.g. causal): only the head axis is
                # missing.
                mask = mask.unsqueeze(1)  # [bs, 1, query_len, key_len]
            # A large negative score becomes a zero weight after softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        weight = F.softmax(scores, dim=-1)
        if dropout is not None:
            weight = dropout(weight)
        return weight.matmul(v)

    def forward(self, q, k, v, mask=None):
        """Project the inputs, attend per head and merge the heads.

        Args:
            q, k, v: Tensors of shape ``[bs, len, d_model]``.
            mask: Optional mask, see :meth:`attention`.

        Returns:
            Tensor of shape ``[bs, query_len, d_model]``.
        """
        bs = q.shape[0]

        # Split the feature axis into heads: [bs, heads, len, d_k].
        # Each input goes through its own projection.
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        attentioned = self.attention(q, k, v, self.d_k, mask, self.dropout)

        # Merge the heads back into a single feature axis.
        concat = (
            attentioned.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        )
        return self.out(concat)
        

In [121]:
#  Layer Norm
class NormLayer(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Layer normalisation pulls the data distribution toward the
    non-saturated region of the activation function and is invariant to
    rescaling of the weights/data. It helps against vanishing/exploding
    gradients, speeds up training and acts as a regulariser.

    Args:
        d_model: Size of the normalised (last) dimension.
        eps: Small constant added to the standard deviation.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # Two learnable parameters: a per-feature gain and a per-feature
        # offset.
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return ``x`` normalised along its last dimension."""
        centred = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centred / spread + self.bias

In [125]:
# Feed Forward Network
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden layer size.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )

    def forward(self, x, mask=None):
        """Apply the network to the last dimension of ``x``.

        Args:
            x: Tensor whose last dimension has size ``d_model``.
            mask: Ignored. Kept (now optional) so both ``ffn(x)`` and
                ``ffn(x, mask)`` call styles work.

        Returns:
            Tensor with the same shape as ``x``.
        """
        return self.ffn(x)

Encoder

In [None]:
class Encoder_block(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the inputs and outputs.
        heads: Number of attention heads.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm1 = NormLayer(d_model)
        self.norm2 = NormLayer(d_model)

        # The block's dropout rate is forwarded so the sub-layers do not
        # silently fall back to their own defaults.
        self.attention = Multihead_attention(heads, d_model, dropout)
        self.ffn = FeedForward(d_model, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor; used as query, key and value of the
                self-attention.
            mask: Optional attention mask passed to the attention layer.

        Returns:
            Tensor produced by the second normalisation layer.
        """
        z = self.dropout1(self.attention(x, x, x, mask))
        x = self.norm1(z + x)
        z = self.dropout2(self.ffn(x))
        return self.norm2(z + x)
    
class Encoder(nn.Module):
    """Embedding, positional encoding and a stack of ``N`` encoder blocks.

    Args:
        vocab_size: Size of the source vocabulary.
        d_model: Feature size used throughout the model.
        N: Number of stacked encoder blocks.
        heads: Number of attention heads per block.
        dropout: Dropout probability handed to every block.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoder = Positional_Encode(d_model)

        self.layers = self.get_clones(
            Encoder_block(d_model, heads, dropout), N
        )
        self.norm = NormLayer(d_model)

    @staticmethod
    def get_clones(module, N):
        """Return a ``ModuleList`` holding ``N`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

    def forward(self, src, mask):
        """Encode a batch of source token ids.

        Args:
            src: Token ids fed to the embedding layer.
            mask: Attention mask forwarded to every block.

        Returns:
            Output of the final normalisation layer.
        """
        x = self.embedding(src)
        x = self.pos_encoder(x)
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
        

Decoder

In [None]:
class Decoder_block(nn.Module):
    """One decoder layer: self-attention, encoder attention, feed-forward.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the inputs and outputs.
        heads: Number of attention heads.
        dropout: Dropout probability used throughout the block.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm1 = NormLayer(d_model)
        self.norm2 = NormLayer(d_model)
        self.norm3 = NormLayer(d_model)

        # The block's dropout rate is forwarded so the sub-layers do not
        # silently fall back to their own defaults.
        self.attention1 = Multihead_attention(heads, d_model, dropout)
        self.attention2 = Multihead_attention(heads, d_model, dropout)
        self.ffn = FeedForward(d_model, dropout=dropout)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run the block on the target-side activations ``x``.

        Args:
            x: Target-side input; query, key and value of the first
                attention layer.
            e_outputs: Encoder output; key and value of the second
                attention layer.
            src_mask: Mask applied in the encoder-decoder attention.
            trg_mask: Mask applied in the target self-attention.

        Returns:
            Tensor produced by the third normalisation layer.
        """
        z = self.dropout1(self.attention1(x, x, x, trg_mask))
        x = self.norm1(z + x)
        z = self.dropout2(self.attention2(x, e_outputs, e_outputs, src_mask))
        x = self.norm2(z + x)
        # Third sub-layer: the feed-forward network with its own residual
        # connection.
        z = self.dropout3(self.ffn(x))
        return self.norm3(z + x)
        
class Decoder(nn.Module):
    """Embedding, positional encoding and a stack of ``N`` decoder blocks.

    Args:
        vocab_size: Size of the target vocabulary.
        d_model: Feature size used throughout the model.
        N: Number of stacked decoder blocks.
        heads: Number of attention heads per block.
        dropout: Dropout probability handed to every block.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoder = Positional_Encode(d_model)

        self.layers = self.get_clones(
            Decoder_block(d_model, heads, dropout), N
        )
        self.norm = NormLayer(d_model)

    @staticmethod
    def get_clones(module, N):
        """Return a ``ModuleList`` holding ``N`` deep copies of ``module``."""
        return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

    def forward(self, trg, e_outpout, src_mask, trg_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            trg: Target token ids fed to the embedding layer.
            e_outpout: Encoder output attended to by every block.
            src_mask: Mask for the encoder-decoder attention.
            trg_mask: Mask for the target self-attention.

        Returns:
            Output of the final normalisation layer.
        """
        x = self.embedding(trg)
        x = self.pos_encoder(x)
        for layer in self.layers:
            x = layer(x, e_outpout, src_mask, trg_mask)
        return self.norm(x)
        

Transformer

In [1]:
class Transformer(nn.Module):
    """Encoder-decoder model with a linear projection to the target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary.
        trg_vocab: Size of the target vocabulary.
        d_model: Feature size used throughout the model.
        N: Number of blocks in the encoder and in the decoder.
        heads: Number of attention heads per block.
        dropout: Dropout probability passed to the encoder and decoder.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return the projected decoder output for a source/target batch.

        Args:
            src: Source token ids.
            trg: Target token ids.
            src_mask: Mask for attention over the source sequence.
            trg_mask: Mask for the target self-attention.

        Returns:
            Output of the final linear layer, whose last dimension has
            size ``trg_vocab``. No softmax is applied here.
        """
        e_output = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_output, src_mask, trg_mask)
        return self.out(d_output)

NameError: name 'nn' is not defined

In [21]:
def subsequent_mask(size):
    """Mask out subsequent positions.

    Args:
        size: Sequence length.

    Returns:
        Boolean tensor of shape ``(1, size, size)`` that is ``True`` on
        and below the main diagonal and ``False`` above it.
    """
    # Ones strictly above the main diagonal flag the later positions.
    later = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # Invert: True wherever the flag is not set.
    allowed = torch.from_numpy(later) == 0
    return allowed

In [22]:
# Display the mask produced for a sequence of length 5.
subsequent_mask(5)

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])

In [None]:
def make_model(src_vocab, trg_vocab, d_model=512, N=6, heads=8, dropout=0.1):
    """Build a ``Transformer`` from the given hyper-parameters.

    Args:
        src_vocab: Size of the source vocabulary.
        trg_vocab: Size of the target vocabulary.
        d_model: Feature size used throughout the model.
        N: Number of blocks in the encoder and in the decoder.
        heads: Number of attention heads per block.
        dropout: Dropout probability.

    Returns:
        A newly constructed ``Transformer`` with default-initialised
        weights.
    """
    return Transformer(src_vocab, trg_vocab, d_model, N, heads, dropout)