In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import activations
from tensorflow.keras.models import Model, load_model
import numpy as np

print(tf.__version__)

2.0.0


### Transformer模型
Transformer的整体结构是由点乘自注意力、全连接层堆叠而成的编码器和解码器。
![Transformer](./image/transformer_001.png)

In [5]:
# Padding mask
def padding_mask(seq):
    """Return a boolean tensor that is True wherever `seq` differs from 0.

    Presumably 0 is the padding token id -- confirm against the tokenizer.
    """
    return tf.math.not_equal(seq, 0)

# Decoder look-ahead mask
def look_ahead_mask(size):
    """Build a (size, size) boolean lower-triangular mask.

    Entry [i, j] is True when j <= i, i.e. position i may look at itself and
    at earlier positions only.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return tf.cast(lower_triangle, dtype=tf.bool)

#### 位置编码信息
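Transformer模型不同于RNN模型：RNN按时间步依次处理序列，天然带有位置信息；而Transformer需要额外输入每个时刻的位置信息。这里通过sin和cos函数交替生成位置编码：偶数维 $PE_{(pos,\,2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right)$，奇数维 $PE_{(pos,\,2i+1)} = \cos\left(pos / 10000^{2i/d_{model}}\right)$。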

In [8]:
# Sinusoidal positional encoding
def positional_embedding(maxlen, model_size):
    """Build the fixed sinusoidal position table.

    Even feature index j holds sin(pos / 10000 ** (j / model_size)); the odd
    index j + 1 holds the cosine of the same angle.

    Args:
        maxlen: number of positions (rows) to generate.
        model_size: number of features (columns) per position.

    Returns:
        A float32 tf.constant of shape (maxlen, model_size).
    """
    positions = np.arange(maxlen, dtype=np.float64)[:, np.newaxis]
    dims = np.arange(model_size)
    # Each sin/cos pair shares the exponent of its even index: j for even j,
    # j - 1 for odd j.
    exponents = (dims - dims % 2) / model_size
    angles = positions / np.power(10000.0, exponents)

    PE = np.zeros((maxlen, model_size))
    PE[:, 0::2] = np.sin(angles[:, 0::2])
    PE[:, 1::2] = np.cos(angles[:, 1::2])
    return tf.constant(PE, dtype=tf.float32)

####  Attention
注意力函数可以看成是将一个查询向量query和一组键key-值value向量对映射成一个输出向量。输出向量为这些值向量的加权求和，其中每个值向量的权重由查询向量和该值对应的键向量计算得出。

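我们称其为“缩放点积注意力”（Scaled Dot-Product Attention）：输入包括d_k维的查询向量和键向量以及d_v维的值向量；先计算查询向量与所有键向量的点积并除以 $\sqrt{d_k}$，再使用softmax函数获得这些值对应的权重，即 $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$。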

![Transformer](./image/transformer_002.png)

#### Multi Head Attention
多头注意力使模型能够联合关注不同位置上、来自不同表征子空间的信息；只使用单个注意力头时，加权平均会抑制这种能力。
![Transformer](./image/transformer_003.png)

In [9]:
import tensorflow as tf
from tensorflow import keras

class MultiHeadAttention(keras.Model):
    """Multi-head scaled dot-product attention with one Dense projection per head.

    NOTE: a later cell of this notebook defines another class with the same
    name; whichever cell ran last is the definition other layers pick up.

    Args:
        model_size: width of the final output projection; every head works on
            model_size // num_heads features.
        num_heads: number of attention heads.
        causal: when True, a query position may only attend to key positions
            at or before it. Defaults to False.
    """

    def __init__(self, model_size, num_heads, causal=False):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_size = model_size // num_heads
        self.causal = causal
        self.WQ = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WK = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WV = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WO = keras.layers.Dense(model_size)

    def call(self, query, key, value, mask):
        """Attend from `query` over `key` / `value`.

        Args:
            query: tensor of shape (batch, query_len, model_size).
            key: tensor of shape (batch, key_len, model_size).
            value: tensor of shape (batch, value_len, model_size).
            mask: None, or a boolean / numeric tensor that broadcasts against
                the (batch, query_len, key_len) scores; True (or 1) marks
                positions that may be attended to.

        Returns:
            Tensor of shape (batch, query_len, model_size).
        """
        # sqrt(d_k) does not depend on the head, so compute it once.
        scale = tf.math.sqrt(tf.cast(self.head_size, tf.float32))

        additive_mask = None
        if mask is not None:
            # Boolean masks do not support `1 - mask`; cast to float before
            # turning the mask into a large negative bias.
            additive_mask = (1.0 - tf.cast(mask, tf.float32)) * -1e9

        context_heads = []
        for i in range(self.num_heads):
            q = self.WQ[i](query)
            k = self.WK[i](key)
            v = self.WV[i](value)

            # Scaled dot-product scores, shape (batch, query_len, key_len)
            score = tf.matmul(q, k, transpose_b=True) / scale
            if additive_mask is not None:
                score += additive_mask
            if self.causal:
                # Lower-triangular ones: key positions after the query get -1e9
                allowed = tf.linalg.band_part(tf.ones(tf.shape(score)[-2:]), -1, 0)
                score += (1.0 - allowed) * -1e9

            alpha = tf.nn.softmax(score, axis=-1)
            context_heads.append(tf.matmul(alpha, v))

        # Concatenate the heads on the feature axis and mix them with WO
        concat_attention = tf.concat(context_heads, axis=2)
        return self.WO(concat_attention)

这里也可以使用tensorflow中keras封装好的Attention层

In [7]:
class MultiHeadAttention(keras.Model):
    """Multi-head attention built on `keras.layers.Attention`.

    Each head has its own Dense projections for query, key and value. All
    heads share a single `keras.layers.Attention` layer created with
    `use_scale=True` (Keras then learns one scalar score multiplier instead
    of dividing by a fixed sqrt(d_k)).

    Args:
        model_size: width of the final output projection; every head works on
            model_size // num_heads features.
        num_heads: number of attention heads.
        causal: forwarded to `keras.layers.Attention`; when True, position i
            cannot attend to positions j > i.
    """

    def __init__(self, model_size, num_heads, causal=False):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_size = model_size // num_heads
        self.WQ = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WK = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WV = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.Attention = keras.layers.Attention(use_scale=True, causal=causal)
        self.WO = keras.layers.Dense(model_size)

    def call(self, query, key, value, mask):
        """Attend from `query` over `key` / `value`.

        Args:
            query: tensor of shape (batch, query_len, model_size).
            key: tensor of shape (batch, key_len, model_size).
            value: tensor of shape (batch, value_len, model_size).
            mask: None, or a boolean tensor used as both the query mask and
                the value mask of the Keras Attention layer.

        Returns:
            Tensor of shape (batch, query_len, model_size).
        """
        # NOTE(review): the same tensor is passed as query mask and value
        # mask, which only lines up when query and value have equal length --
        # confirm for encoder-decoder attention.
        attention_mask = None if mask is None else [mask, mask]

        context_heads = []
        for i in range(self.num_heads):
            q = self.WQ[i](query)
            k = self.WK[i](key)
            v = self.WV[i](value)
            # keras.layers.Attention takes its inputs as [query, value, key]
            context = self.Attention([q, v, k], mask=attention_mask)
            context_heads.append(context)

        # Concatenate the heads on the feature axis and mix them with WO
        concat_attention = tf.concat(context_heads, axis=2)
        return self.WO(concat_attention)

#### Point wise feed forward network

In [5]:
# Point-wise feed-forward network
class FeedForwardNetwork(keras.Model):
    """Two stacked Dense layers: a ReLU expansion to `dff_size`, then a linear
    projection to `model_size`."""

    def __init__(self, dff_size, model_size):
        super().__init__()
        self.dense1 = keras.layers.Dense(dff_size, activation="relu")
        self.dense2 = keras.layers.Dense(model_size)

    def call(self, x):
        hidden = self.dense1(x)
        return self.dense2(hidden)

In [10]:
# Encoder layer
class EncoderLayer(keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation."""

    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super().__init__()

        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)

        # One LayerNormalization per sub-layer
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)

        # One Dropout per sub-layer, both using `rate`
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        # Self-attention sub-layer: query, key and value are all `x`
        attended = self.attention(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(x + attended)

        # Feed-forward sub-layer with its own residual connection
        transformed = self.ffn(normed_attention)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(normed_attention + transformed)

In [11]:
# Stacked encoder
class Encoder(keras.Model):
    """Token embedding plus positional encoding, followed by `num_layers`
    EncoderLayer blocks.

    Args:
        num_layers: number of EncoderLayer blocks.
        model_size: embedding width, also the width of every block's output.
        num_heads: attention heads per block.
        dff_size: hidden width of each block's feed-forward network.
        vocab_size: input dimension of the embedding table.
        maxlen: number of positions in the positional-encoding table.
        rate: dropout rate for the embedding dropout and every block.
    """

    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate=0.1):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers

        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)

        self.encoder_layers = [EncoderLayer(model_size, num_heads, dff_size, rate) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, x, training, padding_mask):
        """Encode a batch of token ids.

        Args:
            x: token ids; axis 1 is the sequence axis and may hold at most
                `maxlen` positions.
            training: forwarded to the dropout layers and encoder blocks.
            padding_mask: forwarded unchanged to every encoder block.

        Returns:
            The output of the last encoder block.
        """
        seq_len = tf.shape(x)[1]
        # Input embedding
        x = self.embedding(x)
        # Use only the first seq_len rows of the (maxlen, model_size) table so
        # that sequences shorter than maxlen still broadcast correctly.
        x += self.pos_embedding[:seq_len, :]
        x = self.dropout(x, training=training)

        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, training, padding_mask)
        return x

In [12]:
# Decoder layer
class DecoderLayer(keras.layers.Layer):
    """One decoder block: causal self-attention, encoder-decoder attention and
    a feed-forward network, each followed by dropout, a residual connection
    and layer normalisation.

    Args:
        model_size: feature width of the block's input and output.
        num_heads: attention heads in each attention sub-layer.
        dff_size: hidden width of the feed-forward network.
        rate: dropout rate applied to each sub-layer output.
    """

    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mask_attention = MultiHeadAttention(model_size, num_heads, causal=True)
        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)

        # `rate` used to be accepted but ignored; apply it to every sub-layer
        # in the same way EncoderLayer does.
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, padding_mask):
        """Run one decoder block.

        Args:
            x: decoder-side input features.
            enc_output: encoder output, used as key and value of the second
                attention sub-layer.
            training: forwarded to the dropout layers.
            padding_mask: mask passed to both attention sub-layers.

        Returns:
            Tensor with the same feature width as `x`.
        """
        # Causal self-attention over the decoder input
        attn_decoder = self.mask_attention(x, x, x, padding_mask)
        attn_decoder = self.dropout1(attn_decoder, training=training)
        out1 = self.layernorm1(x + attn_decoder)

        # Encoder-decoder attention: queries from the decoder, keys and values
        # from the encoder.
        # NOTE(review): this reuses the decoder-side padding_mask for encoder
        # positions -- confirm whether an encoder padding mask is intended.
        attn_encoder_decoder = self.attention(out1, enc_output, enc_output, padding_mask)
        attn_encoder_decoder = self.dropout2(attn_encoder_decoder, training=training)
        out2 = self.layernorm2(out1 + attn_encoder_decoder)

        # Feed-forward sub-layer
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

In [9]:
# Stacked decoder
class Decoder(keras.Model):
    """Token embedding plus positional encoding, followed by `num_layers`
    DecoderLayer blocks.

    Args:
        num_layers: number of DecoderLayer blocks.
        model_size: embedding width, also the width of every block's output.
        num_heads: attention heads per attention sub-layer.
        dff_size: hidden width of each block's feed-forward network.
        vocab_size: input dimension of the embedding table.
        maxlen: number of positions in the positional-encoding table.
        rate: dropout rate for the embedding dropout; also passed to every
            DecoderLayer.
    """

    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rate=0.1):
        super(Decoder, self).__init__()

        self.model_size = model_size
        self.num_layers = num_layers

        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)

        self.decoder_layers = [DecoderLayer(model_size, num_heads, dff_size, rate) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, enc_output, x, training, padding_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            enc_output: encoder output forwarded to every decoder block.
            x: target token ids; axis 1 is the sequence axis and may hold at
                most `maxlen` positions.
            training: forwarded to the dropout layer and decoder blocks.
            padding_mask: forwarded unchanged to every decoder block.

        Returns:
            The output of the last decoder block.
        """
        seq_len = tf.shape(x)[1]
        # Input embedding
        x = self.embedding(x)
        # Use only the first seq_len rows of the (maxlen, model_size) table so
        # that sequences shorter than maxlen still broadcast correctly.
        x += self.pos_embedding[:seq_len, :]
        x = self.dropout(x, training=training)

        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, enc_output, training, padding_mask)

        return x

In [10]:
# Encoder and Decoder combined into a Transformer by subclassing keras.Model
class Transformer(keras.Model):
    """Encoder-decoder Transformer with a final Dense projection to the vocabulary.

    Args:
        num_layers: number of blocks in both the encoder and the decoder.
        model_size: feature width used throughout the model.
        num_heads: attention heads per attention sub-layer.
        dff_size: hidden width of the feed-forward networks.
        vocab_size: vocabulary size of both embeddings and of the output layer.
        maxlen: number of positions in the positional-encoding tables.
        rete: dropout rate passed to the encoder and the decoder. The name is
            a misspelling of "rate", kept so existing keyword callers still work.
    """

    def __init__(self, num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rete=0.1):
        super(Transformer, self).__init__()

        # Forward the dropout rate; it used to be dropped, leaving both
        # sub-models on their own default of 0.1.
        self.encoder = Encoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rete)
        self.decoder = Decoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen, rete)
        self.final_dense = keras.layers.Dense(vocab_size, name="final_output")

    def call(self, inputs, targets, training, enc_padding_mask, dec_padding_mask):
        """Run encoder, decoder and the output projection.

        Args:
            inputs: source token ids for the encoder.
            targets: target token ids for the decoder.
            training: forwarded to the encoder and the decoder.
            enc_padding_mask: mask forwarded to the encoder.
            dec_padding_mask: mask forwarded to the decoder.

        Returns:
            The `final_output` Dense layer applied to the decoder output; no
            softmax is applied here.
        """
        enc_output = self.encoder(inputs, training, enc_padding_mask)
        dec_output = self.decoder(enc_output, targets, training, dec_padding_mask)

        final_output = self.final_dense(dec_output)

        return final_output

In [27]:
# Encoder and Decoder combined into a Transformer with the functional API
def transformer(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen):
    """Build a functional Keras model wrapping an Encoder, a Decoder and a
    final Dense projection to the vocabulary.

    Args:
        num_layers: number of blocks in both the encoder and the decoder.
        model_size: feature width used throughout the model.
        num_heads: attention heads per attention sub-layer.
        dff_size: hidden width of the feed-forward networks.
        vocab_size: vocabulary size of both embeddings and of the output layer.
        maxlen: fixed sequence length of both model inputs.

    Returns:
        A keras Model with inputs [enc_input, dec_input] and the output of the
        `final_output` Dense layer.
    """
    enc_inputs = keras.Input(shape=(maxlen,), name="enc_input")
    dec_inputs = keras.Input(shape=(maxlen,), name="dec_input")

    encoder = Encoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen)
    decoder = Decoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen)
    # The module is imported as lowercase `keras`; `Keras` raised a NameError.
    final_dense = keras.layers.Dense(vocab_size, name="final_output")

    # NOTE(review): training is hard-coded to True and no padding mask is
    # wired in, so dropout layers receive training=True on every call --
    # confirm this is intended outside of this structural demo.
    enc_output = encoder(enc_inputs, True, None)
    dec_output = decoder(enc_output, dec_inputs, True, None)

    final_output = final_dense(dec_output)

    model = keras.models.Model(inputs=[enc_inputs, dec_inputs], outputs=final_output)

    return model

In [28]:
# Reset Keras global state before building a new model
K.clear_session()

# Small hyper-parameters, enough to inspect the model structure
num_layers = 2
model_size = 128
num_heads = 4
dff_size = 256
vocab_size = 1000
maxlen = 10

model = transformer(
    num_layers, model_size, num_heads, dff_size, vocab_size, maxlen
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc_input (InputLayer)          [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder (Encoder)               (None, 10, 128)      392962      enc_input[0][0]                  
__________________________________________________________________________________________________
dec_input (InputLayer)          [(None, 10)]         0                                            
__________________________________________________________________________________________________
decoder (Decoder)               (None, 10, 128)      525572      encoder[0][0]                    
______________________________________________________________________________________________

In [31]:
# Look up the nested encoder sub-model by its Keras layer name
encoder = model.get_layer("encoder")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
encoder.summary()
print(encoder.input)
print(encoder.output)

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 128)           128000    
_________________________________________________________________
encoder_layer (EncoderLayer) multiple                  132481    
_________________________________________________________________
encoder_layer_1 (EncoderLaye multiple                  132481    
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
Total params: 392,962
Trainable params: 392,962
Non-trainable params: 0
_________________________________________________________________
None
Tensor("enc_input:0", shape=(None, 10), dtype=float32)
Tensor("encoder/Identity:0", shape=(None, 10, 128), dtype=float32)


In [19]:
# Reset Keras global state before building a new model
K.clear_session()

maxlen = 10
num_layers = 2
model_size = 128
num_heads = 4
dff_size = 256
vocab_size = 1000

# `Input` is never imported on its own in this notebook; use keras.Input.
enc_inputs = keras.Input(shape=(maxlen,), name="enc_input")
dec_inputs = keras.Input(shape=(maxlen,), name="dec_input")

# Bound to a distinct name so the `transformer()` factory function defined in
# an earlier cell is not overwritten and that cell can still be re-run.
transformer_model = Transformer(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen)

outputs = transformer_model(enc_inputs, dec_inputs, training=True, enc_padding_mask=None, dec_padding_mask=None)

model = Model(inputs=[enc_inputs, dec_inputs], outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc_input (InputLayer)          [(None, 10)]         0                                            
__________________________________________________________________________________________________
dec_input (InputLayer)          [(None, 10)]         0                                            
__________________________________________________________________________________________________
transformer (Transformer)       (None, 10, 1000)     1047534     enc_input[0][0]                  
Total params: 1,047,534
Trainable params: 1,047,534
Non-trainable params: 0
__________________________________________________________________________________________________


In [20]:
# Fetch the nested Transformer sub-model by its Keras layer name and list its layers
sub_model_transformer = model.get_layer("transformer")
sub_model_transformer.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            (None, 10, 128)           392962    
_________________________________________________________________
decoder (Decoder)            (None, 10, 128)           525572    
_________________________________________________________________
final_output (Dense)         (None, 10, 1000)          129000    
Total params: 1,047,534
Trainable params: 1,047,534
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Fetch the encoder from inside the Transformer sub-model and list its layers
encoder = sub_model_transformer.get_layer("encoder")
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 128)           128000    
_________________________________________________________________
encoder_layer (EncoderLayer) multiple                  132481    
_________________________________________________________________
encoder_layer_1 (EncoderLaye multiple                  132481    
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
Total params: 392,962
Trainable params: 392,962
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Fetch the decoder from inside the Transformer sub-model and list its layers
decoder = sub_model_transformer.get_layer("decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 128)           128000    
_________________________________________________________________
decoder_layer (DecoderLayer) multiple                  198786    
_________________________________________________________________
decoder_layer_1 (DecoderLaye multiple                  198786    
_________________________________________________________________
dropout_5 (Dropout)          multiple                  0         
Total params: 525,572
Trainable params: 525,572
Non-trainable params: 0
_________________________________________________________________
