In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn,tf,keras:
    print(module.__name__,module.__version__)
    

2.0.0
sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.0
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
## 实现步骤：
## 1.载入数据
## 2.数据预处理----》dataset
## 3.tools
##  3.1 generates position embedding
##  3.2 create mask. (a.padding ,b.decoder)
##  3.3 scaled_dot_product_attention
## 4. builds  model
##    4.1 MultiheadAttenton
##    4.2 EncoderLayer
##    4.3 DecoderLayer
##    4.4 EncoderModel
##    4.5 DecoderModel
##    4.6 Transformer
## 5.optimizer & loss
## 6.train step  --> train
## 7.Evaluate and Visualize

In [3]:
## 1.载入数据
## 处理符切词及空格等
import re

def preprocess_sentence(w,index):
    if index==1:
        return preprocess_sentence_cn(w)
    w = w.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w) # 标点符号前加空格
    w = re.sub(r'[" "]+', " ", w) # 空格去重

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z0-9?.!,]+", " ", w)

    w = w.strip()# 去掉前后空格

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
#     w = '<start> ' + w + ' <end>'
    return w

import jieba
def preprocess_sentence_cn(w):
    w = w.strip()

    # 使用 jieba分词
    word_iter = jieba.cut(w)

    word_content = ''
    for word in word_iter:
        word = word.strip(' ')
        if word != '':
            word_content += word + ' '


    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = word_content.strip(' ')
    return w

# en_sentence=u"May I borrow this book?"
# cn_sentence=u"我可以借这本书吗？"
# print(preprocess_sentence(en_sentence,0))
# print(preprocess_sentence(cn_sentence,1))

In [4]:
## 1.载入数据
data_path='data\\news-commentary-v14.en-zh.tsv'

def create_dataset(path,num_examples):
    lines = open(path,encoding='UTF-8').read().strip().split('\n')
    word_pairs = [ [ preprocess_sentence(w,index) for index,w in enumerate(line.split('\t')) ] for line in lines[:num_examples]]
    print('len word_pairs:',len(word_pairs))
    return zip(*word_pairs)

en,cn = create_dataset(data_path,20000)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\jandy\AppData\Local\Temp\jieba.cache
Loading model cost 0.751 seconds.
Prefix dict has been built successfully.


len word_pairs: 20000


In [5]:
## 1.载入数据
from sklearn.model_selection import train_test_split

# 测试集与验证集的拆分
input_train_cn,input_val_cn,target_train_en,target_val_en = train_test_split(cn,en,test_size=0.2)
len(input_train_cn),len(target_train_en),len(input_val_cn),len(target_val_en)

(16000, 16000, 4000, 4000)

In [6]:
## 2.数据预处理----》dataset
## 把词转根据词频转成ID
import tensorflow_datasets as tfds

# cn_test=np.array(input_train_cn[:3])
# en_test=np.array(target_train_en[:3])
# print(input_train_cn[:3])
# print(target_train_en[:3])

en_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    en, target_vocab_size= 2**13)
cn_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    cn, target_vocab_size= 2**13)

print("vocab_size:",en_tokenizer.vocab_size,cn_tokenizer.vocab_size)



vocab_size: 8279 8125


In [7]:
## 2.数据预处理----》dataset
sample_string = "Transformer is awesome."

tokenized_string = en_tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

origin_string = en_tokenizer.decode(tokenized_string)
print('The original string is {}'.format(origin_string))

assert origin_string == sample_string

for token in tokenized_string:
    print('{} --> "{}"'.format(token, en_tokenizer.decode([token])))
    
sample_string_cn = "随着经济危机不断加深和蔓延，"

tokenized_string = cn_tokenizer.encode(sample_string_cn)
print('Tokenized string is {}'.format(tokenized_string))

origin_string = cn_tokenizer.decode(tokenized_string)
print('The original string is {}'.format(origin_string))

assert origin_string == sample_string_cn

for token in tokenized_string:
    print('{} --> "{}"'.format(token, cn_tokenizer.decode([token])))

Tokenized string is [8107, 1940, 8138, 428, 11, 2456, 185, 1715, 8069]
The original string is Transformer is awesome.
8107 --> "T"
1940 --> "ran"
8138 --> "s"
428 --> "former "
11 --> "is "
2456 --> "aw"
185 --> "es"
1715 --> "ome"
8069 --> "."
Tokenized string is [1469, 988, 303, 767, 73, 1289, 393, 730, 766, 6246, 8108, 8057, 8009]
The original string is 随着经济危机不断加深和蔓延，
1469 --> "随"
988 --> "着"
303 --> "经济"
767 --> "危机"
73 --> "不"
1289 --> "断"
393 --> "加"
730 --> "深"
766 --> "和"
6246 --> "蔓延"
8108 --> "�"
8057 --> "�"
8009 --> "�"


In [8]:
## 2.数据预处理----》dataset
buffer_size = len(input_train_cn)
batch_size = 64
max_length = 180

## 转码后，加上，前后缀
def encode_to_subword(cn_sentence, en_sentence):
    cn_sequence = [cn_tokenizer.vocab_size] \
    + cn_tokenizer.encode(cn_sentence.numpy()) \
    + [cn_tokenizer.vocab_size + 1]
    en_sequence = [en_tokenizer.vocab_size] \
    + en_tokenizer.encode(en_sentence.numpy()) \
    + [en_tokenizer.vocab_size + 1]
    return cn_sequence, en_sequence

def filter_by_max_length(cn, en):
    return tf.logical_and(tf.size(cn) <= max_length,
                          tf.size(en) <= max_length)

def tf_encode_to_subword(cn_sentence, en_sentence):
    return tf.py_function(encode_to_subword,
                          [cn_sentence, en_sentence],
                          [tf.int64, tf.int64])


train_examples = tf.data.Dataset.from_tensor_slices((input_train_cn, target_train_en))
train_dataset = train_examples.map(tf_encode_to_subword)
train_dataset = train_dataset.filter(filter_by_max_length)
train_dataset = train_dataset.shuffle(
    buffer_size).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))


val_examples = tf.data.Dataset.from_tensor_slices((input_val_cn, target_val_en))
valid_dataset = val_examples.map(tf_encode_to_subword)
valid_dataset = valid_dataset.filter(
    filter_by_max_length).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))

for cn_batch, en_batch in train_dataset.take(3):
    print(cn_batch.shape, en_batch.shape)




(64, 65) (64, 54)
(64, 65) (64, 67)
(64, 69) (64, 65)


In [9]:
## 3.tools
##  3.1 generates position embedding
##  3.2 create mask. (a.padding ,b.decoder)
##  3.3 scaled_dot_product_attention

"""
1、生成位置编码 generates position embedding
"""
# PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

# pos.shape: [sentence_length, 1]
# i.shape  : [1, d_model]
# result.shape: [sentence_length, d_model]
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000,
                               (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def get_position_embedding(sentence_length, d_model):
    angle_rads = get_angles(np.arange(sentence_length)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # sines.shape: [sentence_length, d_model / 2]
    # cosines.shape: [sentence_length, d_model / 2]
    sines = np.sin(angle_rads[:, 0::2])   ## 所有的偶数
    cosines = np.cos(angle_rads[:, 1::2])  ## 所有的奇数

    # position_embedding.shape: [sentence_length, d_model]
    position_embedding = np.concatenate([sines, cosines], axis=-1)
    # position_embedding.shape: [1, sentence_length, d_model]
    position_embedding = position_embedding[np.newaxis, ...]

    return tf.cast(position_embedding, dtype=tf.float32)

## for test
# position_embedding = get_position_embedding(50, 512)  ## 测试
# print(position_embedding.shape)


def plot_position_embedding(position_embedding):
    plt.pcolormesh(position_embedding[0], cmap='RdBu')
    plt.xlabel('Depth')
    plt.xlim((0, 512))
    plt.ylabel('Position')
    plt.colorbar()
    plt.show()


# plot_position_embedding(position_embedding)

"""
2、create mask.  
   a.padding
   b.decoder
"""
# batch_data.shape: [batch_size, seq_len]
def create_padding_mask(batch_data):
    padding_mask = tf.cast(tf.math.equal(batch_data, 0), tf.float32)
    # [batch_size, 1, 1, seq_len]
    return padding_mask[:, tf.newaxis, tf.newaxis, :]

# x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
# print(create_padding_mask(x)) #for test
# tf.Tensor(
# [[[[0. 0. 1. 1. 0.]]]
#  [[[0. 0. 0. 1. 1.]]]
#  [[[1. 1. 1. 0. 0.]]]], shape=(3, 1, 1, 5), dtype=float32)

# attention_weights.shape: [3,3]
# [[1, 0, 0],
#  [4, 5, 0],
#  [7, 8, 9]]
##   让上三角都为0
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask # (seq_len, seq_len)

# print(create_look_ahead_mask(5)) # for test
# tf.Tensor(
# [[0. 1. 1. 1. 1.]
#  [0. 0. 1. 1. 1.]
#  [0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 1.]
#  [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32)



"""
3 scaled_dot_product_attention  缩放点积注意力机制实现
"""


def scaled_dot_product_attention(q, k, v, mask):
    """
    Args:
    - q: shape == (..., seq_len_q, depth)
    - k: shape == (..., seq_len_k, depth)
    - v: shape == (..., seq_len_v, depth_v)
    - seq_len_k == seq_len_v
    - mask: shape == (..., seq_len_q, seq_len_k)
    Returns:
    - output: weighted sum
    - attention_weights: weights of attention
    """

    # matmul_qk.shape: (..., seq_len_q, seq_len_k)
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        # 使得在softmax后值趋近于0
        scaled_attention_logits += (mask * -1e9)

    # attention_weights.shape: (..., seq_len_q, seq_len_k)
    attention_weights = tf.nn.softmax(
        scaled_attention_logits, axis=-1)

    # output.shape: (..., seq_len_q, depth_v)
    output = tf.matmul(attention_weights, v)

    return output, attention_weights

## 为了方便调试用
def print_scaled_dot_product_attention(q, k, v):
    temp_out, temp_att = scaled_dot_product_attention(q, k, v, None)
    print("Attention weights are:")
    print(temp_att)
    print("Output is:")
    print(temp_out)


### for test ,3、缩放点积注意力机制


"""
encoder ayer and decoder layer will use.
"""

def feed_forward_network(d_model, dff):
    # dff: dim of feed forward network.
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

In [10]:
## 4. builds  model
##    4.1 MultiheadAttenton
##    4.2 EncoderLayer
##    4.3 DecoderLayer
##    4.4 EncoderModel
##    4.5 DecoderModel
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    理论上:
    x -> Wq0 -> q0
    x -> Wk0 -> k0
    x -> Wv0 -> v0

    实战中:
    q -> Wq0 -> q0
    k -> Wk0 -> k0
    v -> Wv0 -> v0

    实战中技巧：
    q -> Wq -> Q -> split -> q0, q1, q2...
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert self.d_model % self.num_heads == 0

        self.depth = self.d_model // self.num_heads

        self.WQ = tf.keras.layers.Dense(self.d_model)
        self.WK = tf.keras.layers.Dense(self.d_model)
        self.WV = tf.keras.layers.Dense(self.d_model)

        self.dense = tf.keras.layers.Dense(self.d_model)

    def split_heads(self, x, batch_size):
        # x.shape: (batch_size, seq_len, d_model)
        # d_model = num_heads * depth
        # x -> (batch_size, num_heads, seq_len, depth)

        x = tf.reshape(x,
                       (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        batch_size = tf.shape(q)[0]

        q = self.WQ(q)  # q.shape: (batch_size, seq_len_q, d_model)
        k = self.WK(k)  # k.shape: (batch_size, seq_len_k, d_model)
        v = self.WV(v)  # v.shape: (batch_size, seq_len_v, d_model)

        # q.shape: (batch_size, num_heads, seq_len_q, depth)
        q = self.split_heads(q, batch_size)
        # k.shape: (batch_size, num_heads, seq_len_k, depth)
        k = self.split_heads(k, batch_size)
        # v.shape: (batch_size, num_heads, seq_len_v, depth)
        v = self.split_heads(v, batch_size)

        # scaled_attention_outputs.shape: (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape: (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention_outputs, attention_weights = \
            scaled_dot_product_attention(q, k, v, mask)

        # scaled_attention_outputs.shape: (batch_size, seq_len_q, num_heads, depth)
        scaled_attention_outputs = tf.transpose(
            scaled_attention_outputs, perm=[0, 2, 1, 3])
        # concat_attention.shape: (batch_size, seq_len_q, d_model)
        concat_attention = tf.reshape(scaled_attention_outputs,
                                      (batch_size, -1, self.d_model))

        # output.shape : (batch_size, seq_len_q, d_model)
        output = self.dense(concat_attention)

        return output, attention_weights

In [11]:
##    4.2 EncoderLayer
##    4.3 DecoderLayer
class EncoderLayer(keras.layers.Layer):
    """
    x -> self attention -> add & normalize & dropout
      -> feed_forward -> add & normalize & dropout
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = feed_forward_network(d_model, dff)

        self.layer_norm1 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm2 = keras.layers.LayerNormalization(
            epsilon=1e-6)

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, training, encoder_padding_mask):
        # x.shape          : (batch_size, seq_len, dim=d_model)
        # attn_output.shape: (batch_size, seq_len, d_model)
        # out1.shape       : (batch_size, seq_len, d_model)
        attn_output, _ = self.mha(x, x, x, encoder_padding_mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layer_norm1(x + attn_output)

        # ffn_output.shape: (batch_size, seq_len, d_model)
        # out2.shape      : (batch_size, seq_len, d_model)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layer_norm2(out1 + ffn_output)

        return out2


# sample_encoder_layer = EncoderLayer(512, 8, 2048)
# sample_input = tf.random.uniform((64, 50, 512))
# sample_output = sample_encoder_layer(sample_input, False, None)
# print(sample_output.shape)
#
# print('-------decoder---------')

class DecoderLayer(keras.layers.Layer):
    """
    x -> self attention -> add & normalize & dropout -> out1
    out1 , encoding_outputs -> attention -> add & normalize & dropout -> out2
    out2 -> ffn -> add & normalize & dropout -> out3
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = feed_forward_network(d_model, dff)

        self.layer_norm1 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm2 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm3 = keras.layers.LayerNormalization(
            epsilon=1e-6)

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)

    def call(self, x, encoding_outputs, training,
             decoder_mask, encoder_decoder_padding_mask):
        # decoder_mask: 由look_ahead_mask和decoder_padding_mask合并而来

        # x.shape: (batch_size, target_seq_len, d_model)
        # encoding_outputs.shape: (batch_size, input_seq_len, d_model)

        # attn1, out1.shape : (batch_size, target_seq_len, d_model)
        attn1, attn_weights1 = self.mha1(x, x, x, decoder_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layer_norm1(attn1 + x)

        # attn2, out2.shape : (batch_size, target_seq_len, d_model)
        attn2, attn_weights2 = self.mha2(
            out1, encoding_outputs, encoding_outputs,
            encoder_decoder_padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layer_norm2(attn2 + out1)

        # ffn_output, out3.shape: (batch_size, target_seq_len, d_model)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layer_norm3(ffn_output + out2)

        return out3, attn_weights1, attn_weights2


# sample_decoder_layer = DecoderLayer(512, 8, 2048)
# sample_decoder_input = tf.random.uniform((64, 60, 512))
# sample_decoder_output, sample_decoder_attn_weights1, sample_decoder_attn_weights2 = sample_decoder_layer(
#     sample_decoder_input, sample_output, False, None, None)

# print(sample_decoder_output.shape)
# print(sample_decoder_attn_weights1.shape)
# print(sample_decoder_attn_weights2.shape)

In [12]:
##    4.4 EncoderModel
##    4.5 DecoderModel
class EncoderModel(keras.layers.Layer):
    def __init__(self, num_layers, input_vocab_size, max_length,
                 d_model, num_heads, dff, rate=0.1):
        super(EncoderModel, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.max_length = max_length

        self.embedding = keras.layers.Embedding(input_vocab_size,
                                                self.d_model)
        # position_embedding.shape: (1, max_length, d_model)
        self.position_embedding = get_position_embedding(max_length,
                                                         self.d_model)

        self.dropout = keras.layers.Dropout(rate)
        self.encoder_layers = [
            EncoderLayer(d_model, num_heads, dff, rate)
            for _ in range(self.num_layers)]

    def call(self, x, training, encoder_padding_mask):
        # x.shape: (batch_size, input_seq_len)
        input_seq_len = tf.shape(x)[1]
        tf.debugging.assert_less_equal(
            input_seq_len, self.max_length,
            "input_seq_len should be less or equal to self.max_length")

        # x.shape: (batch_size, input_seq_len, d_model)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.position_embedding[:, :input_seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.encoder_layers[i](x, training,
                                       encoder_padding_mask)

        # x.shape: (batch_size, input_seq_len, d_model)
        return x


# sample_encoder_model = EncoderModel(2, 8500, max_length,
#                                     512, 8, 2048)
# sample_encoder_model_input = tf.random.uniform((64, 37))
# sample_encoder_model_output = sample_encoder_model(
#     sample_encoder_model_input, False, encoder_padding_mask=None)
# print(sample_encoder_model_output.shape)
#
# print('-----decoderModel---')

class DecoderModel(keras.layers.Layer):
    def __init__(self, num_layers, target_vocab_size, max_length,
                 d_model, num_heads, dff, rate=0.1):
        super(DecoderModel, self).__init__()
        self.num_layers = num_layers
        self.max_length = max_length
        self.d_model = d_model

        self.embedding = keras.layers.Embedding(target_vocab_size,
                                                d_model)
        self.position_embedding = get_position_embedding(max_length,
                                                         d_model)

        self.dropout = keras.layers.Dropout(rate)
        self.decoder_layers = [
            DecoderLayer(d_model, num_heads, dff, rate)
            for _ in range(self.num_layers)]

    def call(self, x, encoding_outputs, training,
             decoder_mask, encoder_decoder_padding_mask):
        # x.shape: (batch_size, output_seq_len)
        output_seq_len = tf.shape(x)[1]
        tf.debugging.assert_less_equal(
            output_seq_len, self.max_length,
            "output_seq_len should be less or equal to self.max_length")

        attention_weights = {}

        # x.shape: (batch_size, output_seq_len, d_model)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.position_embedding[:, :output_seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, attn1, attn2 = self.decoder_layers[i](
                x, encoding_outputs, training,
                decoder_mask, encoder_decoder_padding_mask)
            attention_weights[
                'decoder_layer{}_att1'.format(i + 1)] = attn1
            attention_weights[
                'decoder_layer{}_att2'.format(i + 1)] = attn2
        # x.shape: (batch_size, output_seq_len, d_model)
        return x, attention_weights


# sample_decoder_model = DecoderModel(2, 8000, max_length,
#                                     512, 8, 2048)

# sample_decoder_model_input = tf.random.uniform((64, 35))
# sample_decoder_model_output, sample_decoder_model_att \
#     = sample_decoder_model(
#     sample_decoder_model_input,
#     sample_encoder_model_output,
#     training=False, decoder_mask=None,
#     encoder_decoder_padding_mask=None)
# s
# print(sample_decoder_model_output.shape)
# for key in sample_decoder_model_att:
#     print(sample_decoder_model_att[key].shape)

In [13]:
##    4.6 Transformer
class Transformer(keras.Model):
    def __init__(self, num_layers, input_vocab_size, target_vocab_size,
                 max_length, d_model, num_heads, dff, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder_model = EncoderModel(
            num_layers, input_vocab_size, max_length,
            d_model, num_heads, dff, rate)

        self.decoder_model = DecoderModel(
            num_layers, target_vocab_size, max_length,
            d_model, num_heads, dff, rate)

        self.final_layer = keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, encoder_padding_mask,
             decoder_mask, encoder_decoder_padding_mask):
        # encoding_outputs.shape: (batch_size, input_seq_len, d_model)
        encoding_outputs = self.encoder_model(
            inp, training, encoder_padding_mask)

        # decoding_outputs.shape: (batch_size, output_seq_len, d_model)
        decoding_outputs, attention_weights = self.decoder_model(
            tar, encoding_outputs, training,
            decoder_mask, encoder_decoder_padding_mask)

        # predictions.shape: (batch_size, output_seq_len, target_vocab_size)
        predictions = self.final_layer(decoding_outputs)

        return predictions, attention_weights

In [14]:
## 5.optimizer & loss
## 6.train step  --> train
## 7.Evaluate and Visualize

In [15]:
def create_masks(inp, tar):
    """
    Encoder:
      - encoder_padding_mask (self attention of EncoderLayer)
    Decoder:
      - look_ahead_mask (self attention of DecoderLayer)   ### 不能看到之后的值
      - encoder_decoder_padding_mask (encoder-decoder attention of DecoderLayer)   ### decooder 不能看encoder的mask
      - decoder_padding_mask (self attention of DecoderLayer)
    """
    encoder_padding_mask = create_padding_mask(inp)
    encoder_decoder_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    decoder_padding_mask = create_padding_mask(tar)
    decoder_mask = tf.maximum(decoder_padding_mask,
                              look_ahead_mask)   ### 合并操作

    return encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask

In [16]:
## 初始化模型前先定义一些超参数
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = cn_tokenizer.vocab_size + 2
target_vocab_size = en_tokenizer.vocab_size + 2

dropout_rate = 0.1

#### 模型构建##
transformer = Transformer(num_layers,
                          input_vocab_size,
                          target_vocab_size,
                          max_length,
                          d_model, num_heads, dff, dropout_rate)

In [17]:
"""
自定义学习率
"""
# lrate = (d_model ** -0.5) * min(step_num ** (-0.5),
#                                 step_num * warm_up_steps **(-1.5))

class CustomizedSchedule(
    keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomizedSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** (-1.5))

        arg3 = tf.math.rsqrt(self.d_model)

        return arg3 * tf.math.minimum(arg1, arg2)

# d_model = 128
# learning_rate = CustomizedSchedule(d_model)
# temp_learning_rate_schedule = CustomizedSchedule(d_model)
#
# plt.plot(
#     learning_rate(
#         tf.range(40000, dtype=tf.float32)))
# plt.ylabel("Leraning rate")
# plt.xlabel("Train step")
# plt.show()

"""
自定义损失函数
"""

loss_object = keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [18]:
## 5.optimizer & loss
learning_rate = CustomizedSchedule(d_model)
optimizer = keras.optimizers.Adam(learning_rate,
                                  beta_1 = 0.9,
                                  beta_2 = 0.98,
                                  epsilon = 1e-9)

train_loss = keras.metrics.Mean(name='train_loss')
train_accuracy = keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [19]:
## 6.train step  --> train
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask \
        = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     encoder_padding_mask,
                                     decoder_mask,
                                     encoder_decoder_padding_mask)
        loss = loss_function(tar_real, predictions)  ## 计算损失函数

    gradients = tape.gradient(loss, transformer.trainable_variables) ## 计算梯度
    optimizer.apply_gradients(
        zip(gradients, transformer.trainable_variables))   ### 梯度应用到变量去
    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [20]:
epochs=3 
for epoch in range(epochs):
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch,(inp,tar)) in enumerate(train_dataset):
        train_step(inp,tar)
        if batch % 30 ==0:
#             print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(train_accuracy.result()))
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(),
                train_accuracy.result()))
    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1,train_loss.result(),train_accuracy.result()))
    print('Time take for 1 epoch: {} secconds \n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.3185 Accuracy 0.0000












































































































Epoch 1 Batch 30 Loss 3.5648 Accuracy 0.0001
























































































































Epoch 1 Batch 60 Loss 3.6824 Accuracy 0.0008








































































































Epoch 1 Batch 90 Loss 3.6439 Accuracy 0.0059




















































































































Epoch 1 Batch 120 Loss 3.6394 Accuracy 0.0091








































































































Epoch 1 Batch 150 Loss 3.6132 Accuracy 0.0110








































































































Epoch 1 Batch 180 Loss 3.6041 Accuracy 0.0122












































































































Epoch 1 Batch 210 Loss 3.5910 Accuracy 0.0131
































































































Epoch 1 Batch 240 Loss 3.5599 Accuracy 0.0140
























Epoch 1 Loss 3.5549 Accuracy 0.0143
Time take for 1 epoch: 893.9250090122223 secconds 

Epoch 2 Batch 0 Loss 3.1262 Accuracy 0.0250








































































Epoch 2 Batch 30 Loss 3.2402 Accuracy 0.0272












































































Epoch 2 Batch 60 Loss 3.1880 Accuracy 0.0307
























































































Epoch 2 Batch 90 Loss 3.1516 Accuracy 0.0324
















































































Epoch 2 Batch 120 Loss 3.1249 Accuracy 0.0345








































































Epoch 2 Batch 150 Loss 3.0812 Accuracy 0.0360




















































Epoch 2 Batch 180 Loss 3.0539 Accuracy 0.0376




































































Epoch 2 Batch 210 Loss 3.0139 Accuracy 0.0388








































































Epoch 2 Batch 240 Loss 2.9953 Accuracy 0.0406












Epoch 2 Loss 2.9867 Accuracy 0.0411
Time take for 1 epoch: 793.5122029781342 secconds 





Epoch 3 Batch 0 Loss 2.7451 Accuracy 0.0502
































































Epoch 3 Batch 30 Loss 2.8856 Accuracy 0.0569
























































Epoch 3 Batch 60 Loss 2.8006 Accuracy 0.0563




































































Epoch 3 Batch 90 Loss 2.7510 Accuracy 0.0568
























Epoch 3 Batch 120 Loss 2.7234 Accuracy 0.0583












































Epoch 3 Batch 150 Loss 2.6947 Accuracy 0.0593




















Epoch 3 Batch 180 Loss 2.6838 Accuracy 0.0608












































Epoch 3 Batch 210 Loss 2.6716 Accuracy 0.0621




































































Epoch 3 Batch 240 Loss 2.6481 Accuracy 0.0631








Epoch 3 Loss 2.6449 Accuracy 0.0636
Time take for 1 epoch: 2830.304355621338 secconds 



In [None]:
"""
eg: A B C D -> E F G H.
Train: A B C D, E F G -> F G H
Eval:  A B C D -> E
       A B C D, E -> F
       A B C D, E F -> G
       A B C D, E F G -> H
"""


def evaluate(inp_sentence,x_tokenizer,y_tokenizer,max_length,transformer):
    input_id_sentence = [x_tokenizer.vocab_size] \
                        + x_tokenizer.encode(inp_sentence) + [x_tokenizer.vocab_size + 1]   # 文本转换为ID
    # encoder_input.shape: (1, input_sentence_length)
    encoder_input = tf.expand_dims(input_id_sentence, 0)

    # decoder_input.shape: (1, 1)
    decoder_input = tf.expand_dims([y_tokenizer.vocab_size], 0)

    for i in range(max_length):
        encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask \
            = create_masks(encoder_input, decoder_input)
        # predictions.shape: (batch_size, output_target_len, target_vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            decoder_input,
            False,
            encoder_padding_mask,
            decoder_mask,
            encoder_decoder_padding_mask)
        # predictions.shape: (batch_size, target_vocab_size)
        predictions = predictions[:, -1, :]

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1),
                               tf.int32)

        if tf.equal(predicted_id, y_tokenizer.vocab_size + 1):
            return tf.squeeze(decoder_input, axis=0), attention_weights

        decoder_input = tf.concat([decoder_input, [predicted_id]],
                                  axis=-1)
    return tf.squeeze(decoder_input, axis=0), attention_weights


def plot_encoder_decoder_attention(attention, input_sentence,
                                   result, layer_name,x_tokenizer,y_tokenizer):
    fig = plt.figure(figsize=(16, 8))

    input_id_sentence = x_tokenizer.encode(input_sentence)

    # attention.shape before squeeze: (1, num_heads, tar_len, input_len)
    # attention.shape after squeeze: (num_heads, tar_len, input_len)
    attention = tf.squeeze(attention[layer_name], axis=0)

    for head in range(attention.shape[0]):
        ax = fig.add_subplot(2, 4, head + 1)

        ax.matshow(attention[head][:-1, :])

        fontdict = {'fontsize': 10}

        ax.set_xticks(range(len(input_id_sentence) + 2))
        ax.set_yticks(range(len(result)))

        ax.set_ylim(len(result) - 1.5, -0.5)

        ax.set_xticklabels(
            ['<start>'] + [x_tokenizer.decode([i]) for i in input_id_sentence] + ['<end>'],
            fontdict=fontdict, rotation=90)
        ax.set_yticklabels(
            [y_tokenizer.decode([i]) for i in result if i < y_tokenizer.vocab_size],
            fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head + 1))
    plt.tight_layout()
    plt.show()

In [None]:
def translate(input_sentence, layer_name=''):
    result, attention_weights = evaluate(input_sentence,cn_tokenizer,en_tokenizer,max_length,transformer)

    predicted_sentence = en_tokenizer.decode(
        [i for i in result if i < en_tokenizer.vocab_size])

    print("Input: {}".format(input_sentence))
    print("Predicted translation: {}".format(predicted_sentence))

    if layer_name:
        plot_encoder_decoder_attention(attention_weights, input_sentence,
                                       result, layer_name,pt_tokenizer,en_tokenizer)

In [None]:
translate('我是谁？')