In [41]:
import os
import numpy as np
import tensorflow as tf
import math
from pathlib import Path
import matplotlib.pyplot as plt

## TPU/GPU Strategy connection

In [None]:
try: 
    # Accelerator selection: exactly one strategy below should be active.

    # TPU option (disabled). To use it, uncomment the two statements below:
    # the first locates and connects to the TPUs on the network, the second builds
    # the TPUStrategy that holds the distributed training code for the TPU cores.
    # tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    # strategy = tf.distribute.TPUStrategy(tpu)
    
    # GPU option (active): MirroredStrategy restricted to the first GPU.
    # Extend the device list (e.g. with "/gpu:1") for multi GPU training.
    strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0"]) #, "/gpu:1"])

except ValueError: # If TPU or GPU is not available
    strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU

In [3]:
print(f'Number of accelerators: {strategy.num_replicas_in_sync}')

Number of accelerators: 1


## Global Parameters

In [4]:
# Global batch size: 32 samples for each replica the strategy keeps in sync
BATCH_SIZE = strategy.num_replicas_in_sync * 32

# Shorthand used by the tf.data pipeline below
AUTOTUNE = tf.data.AUTOTUNE

## Initial Data Processing: EN-ES translation task

In [5]:
# Fetch the English-Spanish sentence pairs archive (with "datasets" as the Keras
# cache directory) and extract it
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file(
    "spa-eng.zip",
    origin=url,
    extract=True,
    cache_dir="datasets",
)

In [6]:
# The corpus contains accented Spanish characters: decode it explicitly as UTF-8 so
# the result does not depend on the platform's default text encoding
text = (Path(path).with_name("spa-eng")/"spa.txt").read_text(encoding="utf-8")

In [7]:
# Remove the inverted Spanish opening marks for exclamations and questions
text = text.translate({ord('¡'): None, ord('¿'): None})

In [8]:
# Every line of the corpus holds one translation pair, its fields separated by a tab
pairs = []
for line in text.splitlines():
    pairs.append(line.split('\t'))

In [9]:
pairs[:10]

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', 'Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', 'Quién?'],
 ['Fire!', 'Fuego!'],
 ['Fire!', 'Incendio!']]

In [10]:
# Inplace shuffling of the whole dataset. A fixed seed keeps the train/validation
# split and the adapted vocabularies reproducible after "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(pairs)

In [11]:
# Transpose the list of pairs into two parallel tuples: all first fields (English)
# and all second fields (Spanish)
sentences_en, sentences_es = zip(*pairs)

In [12]:
# Vocabulary size: upper bound on the number of tokens kept by each tokenizer
vocab_size = 1000

# Max sentence length counted in tokens, so whole words here
max_length = 50

# Embedding dimension
embed_size = 128

## Text tokenizer adaptation

## Note: these two layers are for simplicity defined here at the level of dataset preparation
## and used inside the transformer class below

In [13]:
# One tokenizer per language: vocabulary capped at vocab_size tokens, every
# sentence padded / truncated to max_length token ids
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)

In [14]:
# Build both vocabularies from the full corpus. The Spanish sentences are wrapped in
# the startofseq / endofseq markers so that both markers end up in the vocabulary.
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f'startofseq {s} endofseq' for s in sentences_es])

In [15]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [16]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [17]:
sentences_en[99999:100_000]

('I heard every word.',)

## Initial raw data split and target tokenization

## We leave inputs in natural form to be able to input plain sentences at inference time

In [18]:
len(sentences_en)

118964

In [19]:
# The first TRAIN_CNT sentence pairs are used for training, all remaining ones
# for validation
TRAIN_CNT = 100000
VALID_CNT = len(sentences_en) - TRAIN_CNT

In [20]:
# Encoder inputs: raw (untokenized) English sentences
X_train = sentences_en[:TRAIN_CNT]
X_valid = sentences_en[TRAIN_CNT:]

# Decoder inputs for teacher forcing: raw Spanish sentences prefixed with the start
# marker, i.e. shifted by one token relative to the targets below
X_train_dec = [f'startofseq {s}' for s in sentences_es[:TRAIN_CNT]]
X_valid_dec = [f'startofseq {s}' for s in sentences_es[TRAIN_CNT:]]

# Targets: tokenized Spanish sentences followed by the end marker, because the
# last predicted token must indicate sentence end
Y_train = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[:TRAIN_CNT]]).numpy()
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[TRAIN_CNT:]]).numpy()

In [21]:
X_train_dec[-2:]

['startofseq Deja que llame a Tom.', 'startofseq Lo oí todo.']

In [22]:
Y_train

array([[  7,  14,   1, ...,   0,   0,   0],
       [ 42, 175,  22, ...,   0,   0,   0],
       [ 35,   7,  68, ...,   0,   0,   0],
       ...,
       [171,   5,   8, ...,   0,   0,   0],
       [440,   5,   1, ...,   0,   0,   0],
       [ 17, 738,  55, ...,   0,   0,   0]])

In [23]:
Y_valid

array([[ 35,   1,   6, ...,   0,   0,   0],
       [ 46,   1,   3, ...,   0,   0,   0],
       [ 26, 448,   6, ...,   0,   0,   0],
       ...,
       [ 46,   5,  17, ...,   0,   0,   0],
       [  8,   7,  48, ...,   0,   0,   0],
       [ 20, 395,  78, ...,   0,   0,   0]])

In [24]:
len(Y_valid[0])

50

## TFRecords Dataset

In [25]:
# Directory where tfrecords will be stored
tfrecords_dir = 'tfrecords'

# Defining how many samples will be stored in a single TFRecords file
samples_per_tfrecord = 4096

# Number of TFRecords files for the training and validation sets.
# Ceiling division on the SAMPLE counts: remaining samples get one extra (smaller)
# file, no empty file is added when a split is an exact multiple of
# samples_per_tfrecord, and a split smaller than one file still gets a file.
tfrecords_cnt_trn = (TRAIN_CNT + samples_per_tfrecord - 1) // samples_per_tfrecord
tfrecords_cnt_val = (VALID_CNT + samples_per_tfrecord - 1) // samples_per_tfrecord

# Create both split directories. exist_ok keeps the cell re-runnable and also covers
# the case where tfrecords_dir already exists but one of its sub-directories does not.
os.makedirs(f'{tfrecords_dir}/train', exist_ok=True)
os.makedirs(f'{tfrecords_dir}/valid', exist_ok=True)

In [26]:
# String input sentences
def bytes_feature(value):
  """Wrap a single bytes value in a ``tf.train.Feature`` holding a ``BytesList``.

  Args:
    value: a ``bytes`` object, or an eager tensor whose ``.numpy()`` yields one.

  Returns:
    A ``tf.train.Feature`` whose ``bytes_list`` contains exactly ``value``.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.

  # Must run for every input, not only for eager tensors: plain ``bytes`` skip the
  # branch above and the function would otherwise return None for them.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def serialize_array(array):
  """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
  return tf.io.serialize_tensor(array)

In [27]:
def create_example(txt_en, txt_es, label):
    """Pack one translation sample into a ``tf.train.Example``.

    Args:
        txt_en: English sentence (str); stored UTF-8 encoded as "encoder_input".
        txt_es: Spanish sentence (str); stored UTF-8 encoded as "decoder_input".
        label: target array; stored under "label" after serialize_array().

    Returns:
        The ``tf.train.Example`` holding the three bytes features.
    """
    raw_fields = {
        "encoder_input": bytes(txt_en, 'utf-8'),
        "decoder_input": bytes(txt_es, 'utf-8'),
        "label": serialize_array(label),
    }
    # Every field is stored as a bytes feature
    feature = {key: bytes_feature(raw) for key, raw in raw_fields.items()}

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [28]:
# Decodes example stored in a TFR and returns it as a readable sample
def parse_tfrecord_fn(example):
    """Parse one serialized example into a dict of tensors.

    Args:
        example: a single serialized ``tf.train.Example``.

    Returns:
        Dict with the scalar string entries "encoder_input" and "decoder_input",
        and "label" restored from its byte string into an int64 tensor.
    """
    feature_spec = {
        name: tf.io.FixedLenFeature([], dtype=tf.string)
        for name in ("encoder_input", "decoder_input", "label")
    }

    parsed = tf.io.parse_single_example(example, feature_spec)
    # The label was stored as a serialized tensor: restore the array from the byte string
    parsed['label'] = tf.io.parse_tensor(parsed['label'], out_type=tf.int64)

    return parsed

In [29]:
%%time

def write_tfrecords(split, encoder_texts, decoder_texts, labels, n_files):
    """Write one dataset split as a series of TFRecords files.

    File ``k`` receives the samples
    ``[k * samples_per_tfrecord, (k + 1) * samples_per_tfrecord)`` and is stored as
    ``<tfrecords_dir>/<split>/tfrecord_<k as 6 digits>.tfrec``.

    Args:
        split: name of the split sub-directory ("train" or "valid").
        encoder_texts: encoder input sentences (str).
        decoder_texts: decoder input sentences (str), aligned with encoder_texts.
        labels: target arrays, aligned with encoder_texts.
        n_files: number of TFRecords files to write.
    """
    # All three sequences are sliced with the same indices, so they must be aligned
    assert len(encoder_texts) == len(decoder_texts) == len(labels), \
        "Encoder/Decoder datasets don't match"

    for tfrec_id in range(n_files):
        begin = tfrec_id * samples_per_tfrecord
        end = begin + samples_per_tfrecord
        shard = zip(encoder_texts[begin:end], decoder_texts[begin:end], labels[begin:end])

        with tf.io.TFRecordWriter(
            tfrecords_dir + "/%s/tfrecord_%.6i.tfrec" % (split, tfrec_id)
        ) as writer:
            for txt_en, txt_es, label in shard:
                example = create_example(txt_en, txt_es, label)
                writer.write(example.SerializeToString())


write_tfrecords("train", X_train, X_train_dec, Y_train, tfrecords_cnt_trn)
write_tfrecords("valid", X_valid, X_valid_dec, Y_valid, tfrecords_cnt_val)

CPU times: user 16 s, sys: 1.01 s, total: 17 s
Wall time: 13.6 s


## Inspect parsed dataset samples

In [30]:
# Read the first validation shard; the path is built from tfrecords_dir so it follows
# the directory the files are written to
raw_dataset = tf.data.TFRecordDataset(f"{tfrecords_dir}/valid/tfrecord_000000.tfrec")

In [31]:
# Decode every serialized record back into a dict of tensors
parsed_dataset = raw_dataset.map(parse_tfrecord_fn)

In [32]:
parsed_dataset

<_MapDataset element_spec={'decoder_input': TensorSpec(shape=(), dtype=tf.string, name=None), 'encoder_input': TensorSpec(shape=(), dtype=tf.string, name=None), 'label': TensorSpec(shape=<unknown>, dtype=tf.int64, name=None)}>

In [33]:
# Show the first parsed sample only
for instance in parsed_dataset.take(1):
  print()
  print(instance) # print parsed example messages with restored arrays


{'decoder_input': <tf.Tensor: shape=(), dtype=string, numpy=b'startofseq Yo asistir\xc3\xa9 a la conferencia.'>, 'encoder_input': <tf.Tensor: shape=(), dtype=string, numpy=b'I will go to the meeting.'>, 'label': <tf.Tensor: shape=(50,), dtype=int64, numpy=
array([35,  1,  6,  9,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])>}


In [34]:
def prepare_sample(features):
    """Turn a parsed example into an ``(inputs, target)`` pair.

    Args:
        features: dict with "encoder_input", "decoder_input" and "label" entries,
            as returned by parse_tfrecord_fn.

    Returns:
        ``((encoder_input, decoder_input), label)``.
    """
    label = features['label']
    # The parsed label has an unknown static shape, so it is restored here. Labels are
    # produced by text_vec_layer_es, whose output length is max_length: use
    # that value instead of a hard-coded 50 so both stay in sync.
    label.set_shape([max_length])
    return (features['encoder_input'], features['decoder_input']), label

In [35]:
def get_dataset(filenames, batch_size):
    """Build the input pipeline over a list of TFRecords files.

    Records are read and decoded in parallel, shuffled with a buffer of
    ``10 * batch_size`` samples, repeated indefinitely, batched and prefetched.

    Args:
        filenames: TFRecords files to read.
        batch_size: number of samples per batch.

    Returns:
        The prefetched dataset of ``((encoder_input, decoder_input), label)`` batches.
    """
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)

    samples = records.map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    samples = samples.map(prepare_sample, num_parallel_calls=AUTOTUNE)

    shuffled = samples.shuffle(10 * batch_size)
    batches = shuffled.repeat().batch(batch_size)

    return batches.prefetch(AUTOTUNE)

In [36]:
# Collect the .tfrec files of each split from the tfrecords directory
train_filenames = tf.io.gfile.glob(f'{tfrecords_dir}/train/*.tfrec')
valid_filenames = tf.io.gfile.glob(f'{tfrecords_dir}/valid/*.tfrec')

In [37]:
ds = get_dataset(train_filenames, 1)

In [38]:
ds

<_PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None, 50), dtype=tf.int64, name=None))>

In [39]:
next(iter(ds))

((<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Have you ever plucked a chicken?'], dtype=object)>,
  <tf.Tensor: shape=(1,), dtype=string, numpy=
  array([b'startofseq Has desplomado a una gallina alguna vez?'],
        dtype=object)>),
 <tf.Tensor: shape=(1, 50), dtype=int64, numpy=
 array([[129,   1,   6,  18,   1, 194,  70,   3,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>)

In [42]:
# Number of batches needed to see every sample once; rounding up counts the final
# partial batch as a step
steps_per_epoch = math.ceil(TRAIN_CNT / BATCH_SIZE)
validation_steps = math.ceil(VALID_CNT / BATCH_SIZE)

## Datasets creation

In [43]:
# Training and validation pipelines, both using the global batch size
train_ds = get_dataset(train_filenames, BATCH_SIZE)
val_ds = get_dataset(valid_filenames, BATCH_SIZE)

## Fixed positional encoding

In [44]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sinusoidal position encodings to its inputs.

    For position ``p`` and even embedding index ``i`` the table stores
    ``sin(p / 10000 ** (i / embed_size))`` at index ``i`` and the cosine of the same
    angle at the following odd index ``i + 1``.

    Args:
        max_length: number of positions in the precomputed table.
        embed_size: embedding dimension; must be even.
        dtype: dtype of the layer and of the stored table.
    """

    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"

        # Broadcasting a column of positions against a row of even indices yields
        # one angle per (position, sin/cos pair)
        positions = np.arange(max_length)[:, np.newaxis]
        even_dims = 2 * np.arange(embed_size // 2)[np.newaxis, :]
        angles = positions / 10_000 ** (even_dims / embed_size)

        table = np.empty((1, max_length, embed_size))
        table[0, :, 0::2] = np.sin(angles)
        # Odd embedding indices reuse the angle of the preceding even index
        table[0, :, 1::2] = np.cos(angles)

        self.pos_encodings = tf.constant(table.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        """Add the encodings of the first ``tf.shape(inputs)[1]`` positions to ``inputs``."""
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :seq_len]

## The Transformer model

In [46]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer operating directly on raw sentence strings.

    The model tokenizes its string inputs itself with the module-level
    ``text_vec_layer_en`` / ``text_vec_layer_es`` layers, which must already be
    adapted when the model is called.

    Every encoder / decoder block owns its attention, dense and normalization
    layers. Re-using one instance for all ``N`` iterations (and a single
    LayerNormalization for every residual connection) ties the weights of all blocks
    together, so increasing ``N`` would only re-apply the same block.

    Args:
        vocab_size: number of tokens (embedding rows and output classes).
        embed_size: text embedding dimension.
        max_length: max sentence length, i.e. size of the positional encoding table.
        N: number of encoder blocks and of decoder blocks.
        num_heads: number of attention heads.
        dropout_rate: rate used inside attention and after each feed-forward part.
        n_units: width of the hidden dense layer of each feed-forward part.
    """

    def __init__(self, 
                 vocab_size=1000, # Number of tokens
                 embed_size=128, # Text embedding dimension
                 max_length=50, # Max sentence length
                 N=2, # Number of Transformer layers
                 num_heads=8, # Attention heads number
                 dropout_rate=0.1,
                 n_units=128, # Width of dense layers
                **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.max_length = max_length 
        self.N = N
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.n_units = n_units

        layers = tf.keras.layers
        blocks = range(self.N)

        self.encoder_embedding_layer = layers.Embedding(self.vocab_size, self.embed_size, mask_zero=True)
        self.decoder_embedding_layer = layers.Embedding(self.vocab_size, self.embed_size, mask_zero=True)
        self.pos_embed_layer = PositionalEncoding(self.max_length, self.embed_size)

        # Layers without trainable weights can safely be shared by all blocks
        self.add_layer = layers.Add()
        self.dropout = layers.Dropout(self.dropout_rate)

        # Encoder: one independent set of sub-layers per block
        self.attn_layers_enc = [self._new_attention() for _ in blocks]
        self.norm_layers_enc_attn = [layers.LayerNormalization() for _ in blocks]
        self.dense_layers_enc_0 = [layers.Dense(self.n_units, activation="relu") for _ in blocks]
        self.dense_layers_enc_1 = [layers.Dense(self.embed_size) for _ in blocks]
        self.norm_layers_enc_ffn = [layers.LayerNormalization() for _ in blocks]

        # Decoder: masked self-attention (0), cross-attention (1) and feed-forward part
        self.attn_layers_dec_0 = [self._new_attention() for _ in blocks]
        self.norm_layers_dec_0 = [layers.LayerNormalization() for _ in blocks]
        self.attn_layers_dec_1 = [self._new_attention() for _ in blocks]
        self.norm_layers_dec_1 = [layers.LayerNormalization() for _ in blocks]
        self.dense_layers_dec_0 = [layers.Dense(self.n_units, activation="relu") for _ in blocks]
        self.dense_layers_dec_1 = [layers.Dense(self.embed_size) for _ in blocks]
        self.norm_layers_dec_ffn = [layers.LayerNormalization() for _ in blocks]

        self.dense_output = layers.Dense(self.vocab_size, activation="softmax")

    def _new_attention(self):
        """Create a fresh multi-head attention layer with this model's settings."""
        return tf.keras.layers.MultiHeadAttention(
            num_heads=self.num_heads, key_dim=self.embed_size, dropout=self.dropout_rate)

    def call(self, inputs):
        """Compute the token probabilities for every decoder position.

        Args:
            inputs: pair ``(encoder sentences, decoder sentences)`` of string batches.

        Returns:
            Softmax probabilities over ``vocab_size`` tokens per decoder position.
        """
        encoder_inputs = inputs[0]
        decoder_inputs = inputs[1]

        # Encoder and decoder inputs tokenization
        # At this point tokenizers are already adapted above
        encoder_input_ids = text_vec_layer_en(encoder_inputs)
        decoder_input_ids = text_vec_layer_es(decoder_inputs)

        # NOTE(review): the ids are cast to float32 before the embedding lookup; kept
        # from the original implementation - confirm whether it is still required
        encoder_input_ids = tf.cast(encoder_input_ids, tf.float32)
        decoder_input_ids = tf.cast(decoder_input_ids, tf.float32)

        encoder_embeddings = self.encoder_embedding_layer(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding_layer(decoder_input_ids)

        batch_max_len_dec = tf.shape(decoder_embeddings)[1]

        encoder_in = self.pos_embed_layer(encoder_embeddings)
        decoder_in = self.pos_embed_layer(decoder_embeddings)

        # Token id 0 is padding; the extra axis lets the mask broadcast over queries
        encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]

        # Encoder blocks
        Z = encoder_in
        for k in range(self.N):
            skip = Z
            Z = self.attn_layers_enc[k](Z, value=Z, attention_mask=encoder_pad_mask)
            Z = self.norm_layers_enc_attn[k](self.add_layer([Z, skip]))
            skip = Z
            Z = self.dense_layers_enc_0[k](Z)
            Z = self.dense_layers_enc_1[k](Z)
            Z = self.dropout(Z)
            Z = self.norm_layers_enc_ffn[k](self.add_layer([Z, skip]))

        decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
        # Lower-triangular mask: a position may only attend to itself and earlier ones
        causal_mask = tf.linalg.band_part(
                tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

        encoder_outputs = Z
        Z = decoder_in

        # Decoder blocks
        for k in range(self.N):
            skip = Z
            Z = self.attn_layers_dec_0[k](Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
            Z = self.norm_layers_dec_0[k](self.add_layer([Z, skip]))
            skip = Z
            # Cross-Attention: Query from decoder, Key and Value from Encoder
            Z = self.attn_layers_dec_1[k](Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
            Z = self.norm_layers_dec_1[k](self.add_layer([Z, skip]))
            skip = Z
            Z = self.dense_layers_dec_0[k](Z)
            Z = self.dense_layers_dec_1[k](Z)
            # Same dropout as in the encoder feed-forward part
            Z = self.dropout(Z)
            Z = self.norm_layers_dec_ffn[k](self.add_layer([Z, skip]))

        Y_proba = self.dense_output(Z)

        # Patching wrong weights shape used in loss: give the propagated Keras mask
        # a trailing axis
        Y_proba._keras_mask = Y_proba._keras_mask[:, :, tf.newaxis]

        return Y_proba


In [47]:
# Build and compile the model inside the scope of the selected distribution strategy
with strategy.scope():
    # Pass the notebook-level hyperparameters explicitly, so the model cannot drift
    # away from the tokenizers (vocab_size / max_length) when they are changed
    model = Transformer(vocab_size=vocab_size,
                        embed_size=embed_size,
                        max_length=max_length)

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="nadam",
                  metrics=["accuracy"])

In [None]:
%%time

# Train for a single epoch; the length of an epoch and of a validation run is given
# by the explicit step counts
history = model.fit(train_ds,
                   steps_per_epoch=steps_per_epoch,
                   epochs=1,
                   validation_data=val_ds,
                   validation_steps=validation_steps,)

## Inference (after just one epoch, no expectations)

In [56]:
def translate(sentence_en):
    """Greedily translate one English sentence, one token per model call.

    Starting from the start marker alone, the most probable next token is appended
    to the decoder input until 'endofseq' is predicted or max_length tokens have
    been generated.

    Args:
        sentence_en: the English sentence (str).

    Returns:
        The predicted tokens joined by single spaces.
    """
    # Loop invariants: the encoder input and the id -> token table never change
    X = np.array([sentence_en])
    vocabulary = text_vec_layer_es.get_vocabulary()

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # verbose=0: predict() runs once per generated token and would otherwise
        # print a progress bar every time
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
    return translation.strip()

In [59]:
translate("I like soccer")



'me gusta el fútbol'