In [1]:
# Importing Necessary Libraries
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import tensorflow_text as tf_text
import matplotlib.pyplot as plt
import numpy as np
import os

2024-03-01 02:45:29.373947: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-01 02:45:29.374114: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-01 02:45:29.585680: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Reading the data
data=pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

In [3]:
data.shape

(175621, 2)

In [5]:
data.head(10)

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
5,Fire!,Au feu !
6,Help!,À l'aide !
7,Jump.,Saute.
8,Stop!,Ça suffit !
9,Stop!,Stop !


In [6]:
os.cpu_count()

4

We are shuffling the data to reduce the bias occurs in the data and to reduce the overfitting.

In [7]:
# Shuffle dataset

data = data.sample(frac=1).reset_index(drop=True)

In [8]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,"At last, they ceased working.",Ils cessèrent enfin de travailler.
1,The campers were hard up for water because the...,Les campeurs étaient à court d'eau parce que l...
2,That's what I'd want.,C'est ce que je voudrais.
3,We don't really know anything about death.,Nous ne savons vraiment rien de la mort.
4,"The higher he rose in social rank, the more mo...",Il est devenu de plus en plus humble alors qu'...


2. **CREATE DATASET**

We are splitting the dataset into 5% testing and 95% training part.

In [9]:
test_pct = 0.05  #testing percentage
n_samples = len(data)
n_test = int(n_samples * test_pct)
n_train = n_samples - n_test

print(f"Total samples: {n_samples}")
print(f"Test samples: {n_test}")
print(f"Train samples: {n_train}")

Total samples: 175621
Test samples: 8781
Train samples: 166840


In [10]:
data.shape

(175621, 2)

We are changing the names of columns for ease.

In [11]:
data.rename(columns={"English words/sentences":"English","French words/sentences":"French"},inplace=True)

Converting the data of both columns into numpy array.

In [12]:
english_text = data["English"].to_numpy()
french_text = data["French"].to_numpy()

In [13]:
data.head()

Unnamed: 0,English,French
0,"At last, they ceased working.",Ils cessèrent enfin de travailler.
1,The campers were hard up for water because the...,Les campeurs étaient à court d'eau parce que l...
2,That's what I'd want.,C'est ce que je voudrais.
3,We don't really know anything about death.,Nous ne savons vraiment rien de la mort.
4,"The higher he rose in social rank, the more mo...",Il est devenu de plus en plus humble alors qu'...


We used zip function to take both the data i.e.,english_text & french_text together by using tensorflow dataset. After that, two variables created which shuffled the data taken from ds with declared buffer and batch size.

In [14]:
BUFFER_SIZE = 1000
BATCH_SIZE = 64

ds = tf.data.Dataset.zip(
    tf.data.Dataset.from_tensor_slices(english_text),
    tf.data.Dataset.from_tensor_slices(french_text)
)

test_raw = ds.take(n_test).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
train_raw = ds.skip(n_test).shuffle(BUFFER_SIZE).batch(BATCH_SIZE) # skip the n_test

Iterate over train_raw and prints the english and french texts in numpy array with batch of only 1.

In [15]:
for english_batch, french_batch in train_raw.take(1):
    print("English")
    print(english_batch[0:5].numpy())
    print("\nFrench")
    print(french_batch[0:5].numpy())

English
[b'I look forward to seeing them this spring.' b'He removed his shirt.'
 b'You were right, too.' b'She is expecting a baby in June.'
 b'Do you listen to the radio at home every day?']

French
[b"J'attends avec impatience de les voir ce printemps."
 b'Il a retir\xc3\xa9 sa chemise.' b'Vous aviez aussi raison.'
 b'Elle attend un b\xc3\xa9b\xc3\xa9 pour juin.'
 b'\xc3\x89coutes-tu tous les jours la radio \xc3\xa0 la maison\xe2\x80\xaf?']


3. **TEXT VECTORIZATION**

Models don't understand text, so we need to find a way to convert words into numbers.

TextVectorization maps each word to an integer. In the process it constructs a vocabulary (dictionary), mapping each word to a unique integer.

The Pythonfunction, tf_lower_and_split_punct, processes text data before vectorization. 

In [16]:
# Preparing vectorizers
def tf_lower_and_split_punct(text):
    
    # French text contains special symbols. Unicode normalization:
    text = tf_text.normalize_utf8(text, 'NFKD')
    # Lowercase
    text = tf.strings.lower(text)
    # Keep space, a to z, and select punctuation.
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    # Add spaces around punctuation.
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    # Strip whitespace.
    text = tf.strings.strip(text)
    # start and end tokens
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

Preparing english vectorizers.

In [17]:
# maximum amount of words in the vocabulary
max_vocab_size = 50000 

english_vectorizer = TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size, 
    ragged=True, # ragged=True allows variable length input sequences
)

# fit vectorization on training dataset english only
english_vectorizer.adapt(train_raw.map(lambda english, french: english))

Now, we take a glimpse of vectorized representation of words or tokens in english vectorizer.

In [18]:
# vectorize example sentence
example_sentence = "Example sentence"
print(f"Input: {example_sentence}")
print(f"Vectorized: {english_vectorizer(example_sentence)}")

Input: Example sentence
Vectorized: [   2 1618 1056    3]


The reason there are 4 tokens in above output is because there is a \ token at the start and an \ token at the end.

In [19]:
# get vocabulary size
vocab_size = english_vectorizer.vocabulary_size()
print(f"English Vocabulary size: {vocab_size}")

English Vocabulary size: 14175


In [20]:
# get first 10 words in the English vocabulary
print(english_vectorizer.get_vocabulary()[0:10])

['', '[UNK]', '[START]', '[END]', '.', 'i', 'you', 'to', 'the', '?']


Special tokens:

*  '' : Padding
* [UNK] : Unknown token, for words which are not in our vocab.
* [START] : Start token, precedes every sentence
* [END] : End token, succeeds every sentence

Preparing French Vectorizers.

In [21]:
french_vectorizer = TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size, 
    ragged=True, # ragged=True allows variable length input sequences
)

# fit vectorization on training dataset french only
french_vectorizer.adapt(train_raw.map(lambda english, french: french))

In [22]:
# vectorize example sentence
example_sentence = "Comment vas-tu?"
print(f"Input: {example_sentence}")
print(f"Vectorized: {english_vectorizer(example_sentence)}")

Input: Comment vas-tu?
Vectorized: [   2 2237    1    9    3]


In [23]:
# get vocabulary size
vocab_size = french_vectorizer.vocabulary_size()
print(f"French Vocabulary size: {vocab_size}")

French Vocabulary size: 27477


In [24]:
# get first 10 words in the French vocabulary
print(french_vectorizer.get_vocabulary()[0:10])

['', '[UNK]', '[START]', '[END]', '.', 'je', 'de', 'a', '?', 'pas']


Now,let's take a sample example from dataset to test both the vectorizers we are created.

Here,we are checking the mappings that both the vectorizers did from text to integers.

In [25]:
for english_b, french_b in train_raw.take(1):
    english = english_b[0]
    french = french_b[0]
    print("\n\nEnglish (Text)\n")
    print(english)
    print("\n\nEnglish (Tokens)\n")
    print(english_vectorizer(english))
    print("\n\nFrench (Text)\n")
    print(french)
    print("\n\nFrench (Tokens)\n")
    print(french_vectorizer(french))



English (Text)

tf.Tensor(b'I liked what you said at the meeting.', shape=(), dtype=string)


English (Tokens)

tf.Tensor([  2   5 692  30   6 139  44   8 346   4   3], shape=(11,), dtype=int64)


French (Text)

tf.Tensor(b"J'ai aim\xc3\xa9 ce que tu as dit pendant la r\xc3\xa9union.", shape=(), dtype=string)


French (Tokens)

tf.Tensor([  2  26 240  20  10  21 115  64 252  11 392   4   3], shape=(13,), dtype=int64)


**CREATE NEW DATASETS WITH WORD INDICES**

Now, we create the new datasets with the new word mappings(indices).

    Convert english and french to word indices (tokens).
    Extract french_in and french_out from summary.
    The difference between french_in and french_out is that they are shifted by one step relative to eachother, so that at each location the label is the next token.

In [26]:
def process_text(english, french):
    
    english_tok = english_vectorizer(english)
    french_tok = french_vectorizer(french)
    french_tok_in = french_tok[:,:-1]
    french_tok_out = french_tok[:, 1:] 
    return (english_tok, french_tok_in), french_tok_out

train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
test_ds = test_raw.map(process_text, tf.data.AUTOTUNE)

Below code iterates over English-French sentence pairs, printing out the first 10 tokens of each English sentence, French input sentence, and shifted French output sentence.

In [27]:
for (english_tok, french_in), french_out in train_ds.take(1):
    print("\nEnglish tokens:")
    print(english_tok[0, :10].numpy()) 
    print("\nFrench_in tokens:")
    print(french_in[0, :10].numpy())
    print("\nFrench_out tokens (shifted):")
    print(french_out[0, :10].numpy())


English tokens:
[  2  31 670   6   4   3]

French_in tokens:
[  2  14  22 136 586   4]

French_out tokens (shifted):
[ 14  22 136 586   4   3]


As we can see above, the French_out tokens are equivalent to the French_in tokens except they are shifted forward by 1.

This automatically creates labels for us, as each token in French_in is matched to the following token in French_out.

4. **BUILDING UP THE ENCODER-DECODER MODEL**

In [29]:
UNITS = 256

**Encoder**:-

Purpose: Process the english tokens.

Input: English tokens.

Output: English encodings.

Following steps to be done:-

1.Convert English tokens to word embeddings.

2.Feed embeddings through Bi-directional RNN.

3.Return final English encodings.

In [30]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vectorizer, units):
        super(Encoder, self).__init__()
        self.vectorizer = vectorizer
        self.vocab_size = vectorizer.vocabulary_size()
        self.units = units
        
        # The embedding layer converts tokens into vectors
        self.embedding = tf.keras.layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=units,
        )
        
        # The RNN layer processes those vectors sequentially
        self.rnn = tf.keras.layers.Bidirectional(
            merge_mode='sum', # sum forward and backward activation
            layer=tf.keras.layers.GRU(
                units,
                return_sequences=True,
                recurrent_initializer='glorot_uniform'
            )
        )
    
    def call(self, x):
        # 1. The embedding layer looks up the embedding vector for each token.
        x = self.embedding(x)
        # 2. The GRU processes the sequence of embeddings.
        x = self.rnn(x)
        # 3. Return the new sequence of embeddings.
        return x
    
    def encode_text(self, texts):
        """
        Converts a list of english texts into encodings
        """
        texts = tf.convert_to_tensor(texts)
        if len(texts.shape) == 0:
            texts = tf.convert_to_tensor(texts)[tf.newaxis]
        tokens = self.vectorizer(texts).to_tensor()
        encodings = self(tokens)
        return encodings

In [32]:
encoder = Encoder(english_vectorizer, UNITS)

# pass example english tokens
english_enc = encoder(english_tok)

print(f'english tokens, shape (batch, s): {english_tok.shape}')
print(f'english encodings, shape (batch, s, units): {english_enc.shape}')

english tokens, shape (batch, s): (64, None)
english encodings, shape (batch, s, units): (64, None, 256)


The reason that the shapes contain None is because each sentence has a variable length.

**Cross-Attention**:-

Purpose: The attention layer lets the decoder access the information extracted by the encoder. It essentially computes contextually aware word embeddings.

Inputs: English encodings

Outputs: Attention vectors (contextually aware English encodings)

Steps we are taken:-

1.Compute Multi-head Attention.

2.Add Skip Connection.

3.Layer Normalization.

4.Return Attention vectors.

In [33]:
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, french_enc, english_enc):
        # compute attention vectors
        attn_output, attn_scores = self.mha(
            query=french_enc, # query: french encodings
            value=english_enc, # value: condition on english encodings
            return_attention_scores=True)
        
        # skip connection to preserve input signals
        x = self.add([french_enc, attn_output])
        # layer normalization
        x = self.layernorm(x)

        return x

In [34]:
attention_layer = CrossAttention(UNITS)

# simulate French embeddings
embed = tf.keras.layers.Embedding(french_vectorizer.vocabulary_size(),
                                  output_dim=UNITS)
french_embed = embed(french_in)

# pass French embeddings and English encodings
result = attention_layer(french_embed, english_enc)

print(f'English encodings, shape (batch, s, units): {english_enc.shape}')
print(f'French embeddings, shape (batch, t, units): {french_embed.shape}')
print(f'Attention result, shape (batch, t, units): {result.shape}')

English encodings, shape (batch, s, units): (64, None, 256)
French embeddings, shape (batch, t, units): (64, None, 256)
Attention result, shape (batch, t, units): (64, None, 256)


**Decoder**:-

Purpose: Predict the next token given an input sequence.

Inputs: English encodings, French input tokens.

Outputs: Logit predictions for next tokens.

Steps we are taken:-

1.Convert French tokens to word embeddings.

2.Feed word embeddings through Uni-directional RNN.

3.Use RNN output as Query for Cross-Attention on English encodings.

4.Generate logit predictions for next token.

In [35]:
class Decoder(tf.keras.layers.Layer):
    @classmethod
    def add_method(cls, fun):
        """
        This will allows us to add additional methods to the class later.
        """
        setattr(cls, fun.__name__, fun)
        return fun
    
    def __init__(self, vectorizer, units):
        super(Decoder, self).__init__()
        self.vectorizer = vectorizer
        self.vocab_size = vectorizer.vocabulary_size()
        
        self.word_to_id = tf.keras.layers.StringLookup(
            vocabulary=vectorizer.get_vocabulary(),
            mask_token="", oov_token="[UNK]"
        )
        
        self.id_to_word = tf.keras.layers.StringLookup(
            vocabulary=vectorizer.get_vocabulary(),
            mask_token="", oov_token="[UNK]",
            invert=True
        )
        
        self.start_token = self.word_to_id("[START]")
        self.end_token = self.word_to_id("[END]")

        # 1. The embedding layer converts token indices to vectors
        self.units = units
        self.embedding = tf.keras.layers.Embedding(
            self.vocab_size,
            units,
        )

        # 2. The RNN keeps track of what's been generated so far
        self.rnn = tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer="glorot_uniform",
        )
        
        # 3. The RNN output will be the query for the attention layer
        self.attention = CrossAttention(units)
        
        # 4. This fully connected layer produces the logits for each output token
        self.output_layer = tf.keras.layers.Dense(self.vocab_size)
        
    def call(
            self, 
            english_enc, 
            french_in, 
            state=None, 
            return_state=False):
        
        # 1. Convert french tokens to embeddings
        x = self.embedding(french_in)
        
        # 2. Process the french embeddings
        x, state = self.rnn(x, initial_state=state)
        
        # 3. Use the RNN output as the query for the attention over the english encodings
        # Essentially condition the french encodings on the english encodings
        x = self.attention(x, english_enc)
        
        # 4. Generate logit predictions for the next token
        logits = self.output_layer(x)
        
        if return_state:
            return logits, state,
        else:
            return logits

In [36]:
decoder = Decoder(french_vectorizer, UNITS)

# use example English encodings and French input tokens
logits = decoder(english_enc, french_in)

print(f'English encodings shape (encoder output and decoder input): (batch, s, units) {english_enc.shape}')
print(f'French input tokens shape (decoder input): (batch, t) {french_in.shape}')
print(f'Logits shape (decoder output): (batch, french_vocabulary_size) {logits.shape}')

English encodings shape (encoder output and decoder input): (batch, s, units) (64, None, 256)
French input tokens shape (decoder input): (batch, t) (64, None)
Logits shape (decoder output): (batch, french_vocabulary_size) (64, None, 27477)


Now, we are ready for training this network but for inference we are going to add a couple more methods.

In [37]:
@Decoder.add_method
def get_initial_state(self, english_encodings):
    batch_size = tf.shape(english_encodings)[0]
    # create tensor of n=batch_size start tokens [START]
    start_tokens = tf.fill([batch_size, 1], self.start_token)
    done = tf.zeros([batch_size, 1], dtype=tf.bool)
    embedded = self.embedding(start_tokens)
    return start_tokens, done, self.rnn.get_initial_state(embedded)[0]

In [38]:
@Decoder.add_method
def tokens_to_text(self, tokens):
    """
    Convert tokens (word indices) to text
    """
    words = self.id_to_word(tokens)
    result = tf.strings.reduce_join(words, axis=-1, separator=' ')
    result = tf.strings.regex_replace(result, '^ *\[START\] *', '')
    result = tf.strings.regex_replace(result, ' *\[END\] *$', '')
    return result

In [39]:
@Decoder.add_method
def get_next_token(
        self, 
        english_encodings, 
        next_token, 
        done, 
        state, 
        temperature=0.0):
    """
    Note: Temperature is a hyperparameter that regulates the randomness or creativity of the AI's responses in language models.
    """
    # running self() automatically runs the call() method
    logits, state = self(
        english_encodings,
        next_token,
        state=state,
        return_state=True
    )
    
    if temperature == 0.00:
        next_token = tf.argmax(logits, axis=-1)
    else:
        logits = logits[:, -1, :]/temperature
        next_token = tf.random.categorical(logits, num_samples=1)
        
    # if a sequence produces an end_token, set it "done"
    done = done | (next_token == self.end_token)
    # once a sequence is done it only produces 0-padding
    next_token = tf.where(done, tf.constant(0, dtype=tf.int64), next_token)
    
    return next_token, done, state

With these extra functions, we can write a generation loop.

In [40]:
next_token, done, state = decoder.get_initial_state(english_enc)
tokens = []

for n in range(10):
    # run one step
    next_token, done, state = decoder.get_next_token(
        english_enc, next_token, done, state, temperature=1.0
    )
    # add the token to the output
    tokens.append(next_token)

# stack all the tokens together
tokens = tf.concat(tokens, axis=-1) # (batch, t)

# Convert the tokens back to strings
result = decoder.tokens_to_text(tokens)
result

<tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'mimportune mes jattendis realisable regrettez mintimidez muffins oceanographe eastwood presidente',
       b'symptomes tuba limpression canaux boistu frapperais damis prennentils branches eloignees',
       b'debat impudent cuit leurrons lancien allongiez verticales doreilles jules economise',
       b'animes precieuse maccusezvous gobe rattachee abaisser cuismoi bigamie laccompagner presque',
       b'allumettes projeter er explosifs kg ajoutezy projection exprimait personnalite reconquerir',
       b'demarrera necessiter perdirent aviezvous agression preferee donnera joignions forum bu',
       b'probabilites impresario discutezvous comportiez diego amenee appliqua surmene otages devines',
       b'finissent hypothecaire sortes vertus nelevez trente derangeait dinacheve representezvous heureuse',
       b'malbouffe lempechait cents voyageaient precise datteindre suiveur parapet batir floutee',
       b'evident concentra frequence

Now the model is untrained, so the outputs are uniformly random items from the vocabulary.

**Combining Encoder and Decoder into Translator**:-

Purpose: Translate English to French.

Inputs: English tokens, French input tokens.

Outputs: French translation.

Steps we are taken:-

1.Feed English tokens through Encoder, generate English encodings.

2.Feed English encodings and French input tokens to Decoder, generate prediction logits.

In [41]:
class Translator(tf.keras.Model):
    @classmethod
    def add_method(cls, fun):
        setattr(cls, fun.__name__, fun)
        return fun

    def __init__(self, units, english_vectorizer, french_vectorizer):
        super().__init__()
        # build the encoder and decoder
        encoder = Encoder(english_vectorizer, units)
        decoder = Decoder(french_vectorizer, units)
        
        self.encoder = encoder
        self.decoder = decoder
        
    def call(self, inputs):
        # extract english tokens and french input tokens
        english_tok, french_in = inputs
        # convert english tokens to encodings
        english_enc = self.encoder(english_tok)
        # compute logits from english encodings and french input tokens
        logits = self.decoder(english_enc, french_in)
        return logits

In [42]:
model = Translator(UNITS, english_vectorizer, french_vectorizer)

# pass English tokens and French input tokens
logits = model((english_tok, french_in))

print(f'English tokens shape (encoder input): (batch, s, units) {english_tok.shape}')
print(f'English encodings shape (encoder output and decoder input): (batch, s, units) {english_enc.shape}')
print(f'French tokens shape (decoder input): (batch, t) {french_in.shape}')
print(f'Logits shape (decoder output): (batch, french_vocabulary_size) {logits.shape}')

English tokens shape (encoder input): (batch, s, units) (64, None)
English encodings shape (encoder output and decoder input): (batch, s, units) (64, None, 256)
French tokens shape (decoder input): (batch, t) (64, None)
Logits shape (decoder output): (batch, french_vocabulary_size) (64, None, 27477)


5. **MODEL TRAINING**

For training, we need to implement our own masked loss and accuracy functions:

In [43]:
def masked_loss(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)

    # Mask off the losses on padding.
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    # Return the total.
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

In [44]:
def masked_acc(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

Finalize the configuration of the model for training

In [45]:
model.compile(optimizer='adam',
              loss=masked_loss, 
              metrics=[masked_acc, masked_loss])

Calculating some expected values related to a vocab size

In [46]:
# Results in floating point number
vocab_size = 1.0 * french_vectorizer.vocabulary_size()

{
    "expected_loss": tf.math.log(vocab_size).numpy(),
    "expected_acc": 1/vocab_size
}

{'expected_loss': 10.221105, 'expected_acc': 3.639407504458274e-05}

Above values should roughly match the values returned by running a few steps of evaluation:

In [47]:
model.evaluate(test_ds, steps=20,return_dict=True)



{'loss': 10.220355033874512,
 'masked_acc': 0.019607946276664734,
 'masked_loss': 5618.1396484375}

In [48]:
history = model.fit(
    train_ds.repeat(), # .repeat() makes it an infinite dataset
    validation_data=test_ds,
    epochs=20,
    steps_per_epoch = 100, # since we are using an infinite dataset, we need to specify the number of steps per epoch
    validation_steps = 20,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3)
    ]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


**INFERENCE** (Make Predictions from model on unseen data)

In [49]:
@Translator.add_method
def translate(self,
              texts, *,
              max_length=50,
              temperature=0.0):
    # Process the input texts
    context = self.encoder.encode_text(texts)
    batch_size = tf.shape(texts)[0]

    # Setup the loop inputs
    tokens = []
    next_token, done, state = self.decoder.get_initial_state(context)

    for _ in range(max_length):
        # Generate the next token
        next_token, done, state = self.decoder.get_next_token(context, next_token, done,  state, temperature)

        # Collect the generated tokens
        tokens.append(next_token)

        if tf.executing_eagerly() and tf.reduce_all(done):
            break

    # Stack the lists of tokens and attention weights.
    tokens = tf.concat(tokens, axis=-1)   # t*[(batch 1)] -> (batch, t)

    result = self.decoder.tokens_to_text(tokens)
    return result

In [55]:
result = model.translate(["Hi!"]) 
result[0].numpy().decode()

'salut ! '