<a href="https://colab.research.google.com/github/kyle-gao/TF_Transformer/blob/master/TF_Transformer_FR_EN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copyright 2020 Yi Lin(Kyle) Gao


##### Copyright 2019 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [15]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time

In [36]:
num_layers = 4
d_model = 128
dense_dim = 256
num_heads = 8


max_len_en = 50
min_len_en = 10  # The transcript has many short lines indicating the date or speaker, which we should filter out.
max_len_fr = 50
min_len_fr = 10

eng_path = "Data/en.txt"
fr_path = "Data/fr.txt"
EPOCHS = 20
en_ds = tf.data.TextLineDataset(eng_path)
fr_ds = tf.data.TextLineDataset(fr_path)
ds = tf.data.Dataset.zip((fr_ds, en_ds))

tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file("Data/tokenizer_en")
tokenizer_fr = tfds.features.text.SubwordTextEncoder.load_from_file("Data/tokenizer_fr")
en_vocab_size = tokenizer_en.vocab_size + 2
fr_vocab_size = tokenizer_fr.vocab_size + 2

In [17]:
def encode(fr, eng):
    # Adds start token (tokenizer.vocab_size) and end token (tokenizer.vocab_size + 1) to (question,answer)
    question = [tokenizer_fr.vocab_size] + tokenizer_fr.encode(fr.numpy()) + [tokenizer_fr.vocab_size + 1]
    answer = [tokenizer_en.vocab_size] + tokenizer_en.encode(eng.numpy()) + [tokenizer_en.vocab_size + 1]

    return question, answer


def tf_interleave_encode(question, answer):
    # We have to wrap encode in a tf.py_function() and return a Dataset so we can use Dataset.interleave()
    question, answer = tf.py_function(encode, [question, answer], [tf.int64, tf.int64])
    question.set_shape([None])
    answer.set_shape([None])

    return tf.data.Dataset.from_tensors((question, answer))


def filter_max_length(x, y, max_length_question=max_len_fr, max_length_answer=max_len_en):
    return tf.logical_and(tf.size(x) <= max_length_question,
                          tf.size(y) <= max_length_answer)


def filter_min_length(x, y, min_len_question=min_len_fr, min_len_answer=min_len_en):
    return tf.logical_and(tf.size(x) >= min_len_question,
                          tf.size(y) >= min_len_answer)


def preprocess(dataset, batch_size, pad_len_question=max_len_fr, pad_length_answer=max_len_en):
    dataset = dataset.cache()
    # dataset = dataset.map(tf_encode)
    dataset = dataset.interleave(tf_interleave_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.filter(filter_max_length)
    dataset = dataset.filter(filter_min_length)
    dataset = dataset.shuffle(10000)

    dataset = dataset.padded_batch(batch_size, drop_remainder=True,
                                   padded_shapes=([pad_len_question], [pad_length_answer]))
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

In [35]:
train_dataset = preprocess(ds, 64)

In [19]:
def positional_encoding(pos, d_model):
    """
    :param pos: int max position
    :param d_model: dimension of the model
    :return: (1,pos,d_model) array of sinusoidal positional encoding
    """
    pos_enc = np.zeros((1, pos, d_model))
    for p in range(pos):
        for i in range(d_model // 2):
            angles = p / np.power(10000, (2 * i) / np.float32(d_model))
            pos_enc[:, p, 2 * i] = np.sin(angles)
            pos_enc[:, p, 2 * i + 1] = np.cos(angles)
        if d_model % 2 == 1:
            # if d_model is odd loop doesn't hit last even index
            angles = p / np.power(10000, (2 * d_model) / np.float32(d_model))
            pos_enc[:, p, d_model - 1] = np.sin(angles)
    return tf.cast(pos_enc, tf.float32)


def padding_mask(seq):
    # Returns (batch, seq_len, 1, 1) tensor with 0's where the sequence is padded, 1 where it is not

    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, :,  tf.newaxis]  # (batch, 1, seq_len, 1) m l j h  <- j gets masked


def forward_mask(seq):
    """
    Calculates a combined forward mask and padding mask for a batch of sequences
    :param seq: (batch,seq_len) a batch of sequences
    :return:  a combined look_ahead_mask (lower triangular 1s)
    and padding mask (batch, seq_len, seq_len, 1)
    """
    seq_len = tf.shape(seq)[1]

    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    look_ahead_mask = look_ahead_mask[tf.newaxis, :, :, tf.newaxis]  # (batch, seq_len, seq_len, 1)

    padded_mask = padding_mask(seq)

    # return padded_mask * look_ahead_mask  # (batch, seq_len, seq_len, 1)
    return tf.maximum(padded_mask, look_ahead_mask)


In [20]:

class MultiHeadAttention(tf.keras.layers.Layer):

    """Implemented with tf.einsum(), is faster than using tf.transpose() with tf.matmul()"""

    def __init__(self, d_model, num_heads):
        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads,depth)

        Arguments:
        x -- A tokenized sequence (batch_size, seq_len, d_model)

        Returns:
        A tokenized sequence with dimensions (batch_size, seq_len, num_heads, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

        return x

    def call(self, q, k, v, mask=None):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)  # (batch_size,len_q, dim_q)
        k = self.wk(k)  # (batch_size,len_v, dim_q)
        v = self.wv(v)  # (batch_size,len_v, dim_v)

        q = self.split_heads(q, batch_size)  # (batch_size, len_q, num_heads, depth_q) (m,l,h,d)
        k = self.split_heads(k, batch_size)  # (batch_size, len_v, num_heads, depth_q) (m,j,h,d)
        v = self.split_heads(v, batch_size)  # (batch_size, len_v, num_heads, depth_v) (m,j,h,e)

        qk = tf.einsum("mlhd,mjhd->mljh", q, k)  # (batch_size, len_q, len_v, num_heads) (m,l,j,h)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        qk = qk / tf.math.sqrt(dk)

        if mask is not None:
            qk = qk - mask*1e9 # We are using a multiplicative mask

        qk = tf.nn.softmax(qk, axis=-2)  # (batch_size,len_q,len_v, num_heads) (m,l,j,h)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        qk = qk / tf.math.sqrt(dk)

        output = tf.einsum("mljh, mjhe -> mlhe", qk, v)  # (batch_size,len_q, heads, depth_v)
        output = tf.reshape(output, (batch_size, -1, self.num_heads * self.depth))  # (batch_size,len_q, d_model)

        return self.dense(output)

class EncoderLayer(tf.keras.layers.Layer):
    """The EncoderLayer consists of one MultiHeadAttention layer connected to a FeedForward layer,
    each of these 2 layers have a residual connection."""

    def __init__(self, num_heads, d_model, dense_dim, dropout=0.1):
        super().__init__()

        self.attention = MultiHeadAttention(d_model, num_heads)
        self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim, activation='relu'),
                                          tf.keras.layers.Dense(d_model)])

        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, x, training, mask):
        out_attention = self.attention(x, x, x, mask)  # (batch_size,seq_len,d_model)
        out_attention = self.dropout1(out_attention, training=training)
        out1 = self.norm1(x + out_attention)  # residual connection (batch_size,seq_len,d_model)

        out_dense = self.dense(out1)  # (batch_size,seq_len,d_model)
        out2 = self.norm2(out1 + out_dense)  # residual conenction (batch_size,seq_len,d_model)
        return out2


class Encoder(tf.keras.layers.Layer):
    """The Encoder consists of EncoderLayer"""

    def __init__(self, num_layers, num_heads, d_model, dense_dim,
                 vocab_size, max_encoding_position, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoding = positional_encoding(max_encoding_position, d_model)
        self.encoding_layers = [EncoderLayer(num_heads, d_model, dense_dim, dropout) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x, training, mask=None):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)  # (batch_size,input_len,d_model)
        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.positional_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.encoding_layers[i](x, training, mask)  # (batch_size, input_seq_len, d_model)

        return x
class DecoderLayer(tf.keras.layers.Layer):
    """A decoder layers consists of two MultiHeadAttention, one for the Decoder input, one from Encoder output"""
    def __init__(self, num_heads, d_model, dense_dim, dropout=0.1):
        super().__init__()

        self.attention1 = MultiHeadAttention(d_model, num_heads)
        self.attention2 = MultiHeadAttention(d_model, num_heads)

        self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim, activation='relu'),
                                          tf.keras.layers.Dense(d_model)])

        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()
        self.norm3 = tf.keras.layers.LayerNormalization()

        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)
        self.dropout3 = tf.keras.layers.Dropout(dropout)

    def call(self, encoder_out, x, training, forward_mask, padding_mask):

        out_attention1 = self.attention1(x, x, x,
                                         forward_mask)  # (batch_size, seq_len_answer, d_model) -> The return seq_len is the same as that of the first argument of the call.
        out_attention1 = self.dropout1(out_attention1, training=training)
        out1 = self.norm1(x + out_attention1)  # residual connection (batch_size, seq_len_answer, d_model)

        out_attention2 = self.attention2(out1, encoder_out, encoder_out,
                                         padding_mask)  # (batch_size, seq_len_answer, d_model)
        out_attention2 = self.dropout2(out_attention2, training=training)
        out2 = self.norm2(out1 + out_attention2)

        out_dense = self.dense(out2)
        out_dense = self.dropout3(out_dense + out2)

        return out_dense


class Decoder(tf.keras.layers.Layer):
    """The Decoder consists of multiple DecoderLayer"""
    def __init__(self, num_layers, num_heads, d_model, dense_dim,
                 vocab_size, max_encoding_position, dropout=0.1):
        super().__init__()

        self.num_heads = num_heads
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoding = positional_encoding(max_encoding_position, d_model)
        self.decoder_layers = [DecoderLayer(num_heads, d_model, dense_dim, dropout) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, encoder_out, x, training, forward_mask=None, padding_mask=None):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)  # (batch_size,input_len,d_model)
        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.positional_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.decoder_layers[i](encoder_out, x, training, forward_mask,
                                       padding_mask)  # (batch_size, input_seq_len, d_model)
        return x

class Transformer(tf.keras.Model):

    def __init__(self, num_layers, num_heads, d_model, dense_dim, in_vocab_size, tar_vocab_size,
                 input_max_position, target_max_position, rate=0.1):
        super().__init__()

        self.encoder = Encoder(num_layers, num_heads, d_model, dense_dim,
                               in_vocab_size, max_encoding_position=input_max_position, dropout=0.1)

        self.decoder = Decoder(num_layers, num_heads, d_model, dense_dim,
                               tar_vocab_size, max_encoding_position=target_max_position, dropout=0.1)

        self.dense = tf.keras.layers.Dense(tar_vocab_size)

    def call(self, input, target, training=False, enc_mask=None, dec_forward_mask=None, dec_padding_mask=None):
        out_encoder = self.encoder(input, training=training, mask=enc_mask)

        out_decoder = self.decoder(out_encoder, target, training=training, forward_mask=dec_forward_mask,
                                   padding_mask=dec_padding_mask)

        out = self.dense(out_decoder)

        return out

In [26]:
transformer = Transformer(num_layers=num_layers, num_heads=num_heads, d_model=d_model, dense_dim=dense_dim,
                          in_vocab_size=fr_vocab_size, tar_vocab_size=en_vocab_size,
                          input_max_position=max_len_fr, target_max_position=max_len_en, rate=0.1)

In [27]:
def evaluate(question):

    start_token = [tokenizer_fr.vocab_size]
    end_token = [tokenizer_fr.vocab_size + 1]
    question = start_token + tokenizer_fr.encode(question) + end_token
    question = tf.expand_dims(question, 0)
    answer_in = [tokenizer_en.vocab_size]
    answer_in = tf.expand_dims(answer_in, 0)

    for i in range(max_len_fr):
        enc_padding_mask = padding_mask(question)
        dec_padding_mask = padding_mask(question)
        dec_forward_mask = forward_mask(answer_in)

        predictions = transformer(question, answer_in, training=False, enc_mask=enc_padding_mask,
                                  dec_forward_mask=dec_forward_mask, dec_padding_mask=dec_padding_mask)
        prediction = predictions[:, -1, :]  # select the last word to add to the outputs

        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

        if predicted_id == tokenizer_en.vocab_size + 1:
            return tf.squeeze(answer_in, axis=0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        answer_in = tf.concat([answer_in, predicted_id], axis=-1)

    return tf.squeeze(answer_in, axis=0)


def translate(sentence):
    result = np.array(evaluate(sentence))

    predicted_sentence = tokenizer_en.decode([i for i in result
                                              if tokenizer_en.vocab_size > i > 0])
    print('Input: {}'.format(sentence))
    print('Predicted answer: {}'.format(predicted_sentence))


In [28]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, d_model, warmup_steps=4000):
            super(CustomSchedule, self).__init__()

            self.d_model = d_model
            self.d_model = tf.cast(self.d_model, tf.float32)

            self.warmup_steps = warmup_steps

        def __call__(self, step):
            arg1 = tf.math.rsqrt(step)
            arg2 = step * (self.warmup_steps ** -1.5)

            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model, warmup_steps=4000)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

def masked_loss_fn(answer, prediction):
        mask = tf.math.logical_not(tf.math.equal(answer, 0))  # 0 at zeroes, 1 at non-zeroes since seq is padded
        # mask = tf.math.equal(answer, 0)
        mask = tf.cast(mask, tf.int32)
        loss_value = loss_fn(answer, prediction,
                             sample_weight=mask)  # set the zeros to zero weight, other values have weight of 1.

        return loss_value

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

In [37]:
    signature = [tf.TensorSpec(shape=(None, max_len_fr), dtype=tf.int64),
                 tf.TensorSpec(shape=(None, max_len_en),
                               dtype=tf.int64), ]  # a bit faster if we specify the signature

    @tf.function(input_signature=signature)
    def train_step(question, answer):
        answer_in = answer[:, :-1]
        answer_tar = answer[:, 1:]

        enc_padding_mask = padding_mask(question)
        dec_padding_mask = padding_mask(question)
        dec_forward_mask = forward_mask(answer_in)

        with tf.GradientTape() as tape:
            predictions = transformer(question, answer_in, training=True, enc_mask=enc_padding_mask,
                                      dec_forward_mask=dec_forward_mask, dec_padding_mask=dec_padding_mask)
            loss = masked_loss_fn(answer_tar, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(answer_tar, predictions)

    for epoch in range(EPOCHS):
        start = time.time()

        train_loss.reset_states()
        train_accuracy.reset_states()

        for (batch, (question, answer)) in enumerate(train_dataset):
            train_step(question, answer)

        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                            train_loss.result(),
                                                            train_accuracy.result()))

        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
        translate("son honneur le president informe le senat que des senateurs attendent a la porte pour etre "
                        "presentes")

Epoch 1 Loss 1.2962 Accuracy 0.2337
Time taken for 1 epoch: 192.4174439907074 secs

Input: son honneur le president informe le senat que des senateurs attendent a la porte pour etre presentes
Predicted answer: the honour the speaker informed the senate that senators are going to be introduced 
Epoch 2 Loss 1.2025 Accuracy 0.2457
Time taken for 1 epoch: 187.0682246685028 secs

Input: son honneur le president informe le senat que des senateurs attendent a la porte pour etre presentes
Predicted answer: the speaker informed the senate that senators are waiting for the senate to be introduced 
Epoch 3 Loss 1.1376 Accuracy 0.2543
Time taken for 1 epoch: 186.88165593147278 secs

Input: son honneur le president informe le senat que des senateurs attendent a la porte pour etre presentes
Predicted answer: his honour informed the senate that senators are waiting for the senate 
Epoch 4 Loss 1.0909 Accuracy 0.2605
Time taken for 1 epoch: 186.60118699073792 secs

Input: son honneur le president inf

In [39]:
transformer.save_weights("transformer")

In [41]:
transformer2 = Transformer(num_layers=num_layers, num_heads=num_heads, d_model=d_model, dense_dim=dense_dim,
                          in_vocab_size=fr_vocab_size, tar_vocab_size=en_vocab_size,
                          input_max_position=max_len_fr, target_max_position=max_len_en, rate=0.1)

In [43]:
transformer.load_weights("transformer")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f02e0e67438>

We compare the translation with the data (preprocessed)

In [52]:
translate("son excellence le gouverneur general etant arrive au senat et ayant pris place sur le trone")


Input: son excellence le gouverneur general etant arrive au senat et ayant pris place sur le trone
Predicted answer: his excellency the governor general came to the senate and having placed on the throne  


In [51]:
print("French: Son Excellence le Gouverneur général étant arrivé au Sénat et ayant pris place sur le trône")
print("English: His Excellency the Governor General having come and being seated upon the Throne")

French: Son Excellence le Gouverneur général étant arrivé au Sénat et ayant pris place sur le trône
English: His Excellency the Governor General having come and being seated upon the Throne
