In [6]:
import math
import os
import gc
import time
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import backend as K
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
SRC_TEXT_COLUMN = 'source_text'
TGT_TEXT_COLUMN = 'translated_text'
DATASET_SIZE = 80000  
VOCABULARY_LIMIT = 2**14

TRAIN_BATCH_SIZE = 64  # samples per training batch
TRAIN_EPOCHS = 10 # training iterations
SEQ_MAX_LENGTH = 15  # Maximum sequence length

MODEL_CHECKPOINT_DIR = "model_saves/"

In [9]:
from datasets import load_dataset

# Load the translation dataset
translation_data = load_dataset('Helsinki-NLP/tatoeba_mt', 'ara-eng', trust_remote_code=True)

# Splitting dataset into training and testing sets
validation_set = translation_data['validation']
test_set = translation_data['test']

# Extract Arabic and English texts
source_texts = [sample['sourceString'] for sample in validation_set]
translated_texts = [sample['targetString'] for sample in validation_set]


README.md:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

tatoeba_mt.py:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tatoeba-test.ara-eng.tsv:   0%|          | 0.00/938k [00:00<?, ?B/s]

tatoeba-dev.ara-eng.tsv:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10304 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19528 [00:00<?, ? examples/s]

In [10]:
def apply_subword_tokenization(text_corpus, vocab_limit, sequence_max_length):
    # Create the vacabulary using subword tokenization
    subword_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        text_corpus, target_vocab_size=vocab_limit)

    # Define the vocabulary size, including special tokens
    total_vocab_size = subword_tokenizer.vocab_size + 2

    # Define Start of Sequence (SOS) and End of Sequence (EOS)
    sos_marker = [total_vocab_size - 2]
    eos_marker = [total_vocab_size - 1]

    # Tokenize each sentence in the corpus
    tokenized_sentences = [sos_marker + subword_tokenizer.encode(sentence) + eos_marker
                           for sentence in text_corpus]

    # Identify indices of sentences exceeding the maximum length
    exceeding_indices = [index for index, sent in enumerate(tokenized_sentences)
                         if len(sent) > sequence_max_length]

    # Pad tokenized sequences to ensure uniform length
    tokenized_sentences = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_sentences, value=0, padding='post', maxlen=sequence_max_length)

    return tokenized_sentences, subword_tokenizer, total_vocab_size, sos_marker, eos_marker, exceeding_indices


In [11]:
# Tokenize and pad the input sequences 
encoded_source_texts, source_tokenizer, source_vocab_size, start_token_source, end_token_source, removal_indices_source = apply_subword_tokenization(
    source_texts, VOCABULARY_LIMIT, SEQ_MAX_LENGTH)

# Tokenize and pad the output sequences
encoded_target_texts, target_tokenizer, target_vocab_size, start_token_target, end_token_target, removal_indices_target = apply_subword_tokenization(
    translated_texts, VOCABULARY_LIMIT, SEQ_MAX_LENGTH)


In [12]:
# Verifying the tokenization process
print(encoded_source_texts[:7], start_token_source, end_token_source)
print(encoded_target_texts[:7], start_token_target, end_token_target)

[[11136  4556 10912  1964  6615     4 11137     0     0     0     0     0
      0     0     0]
 [11136  8356    82  1038  8570   721  5420   572 10926 11137     0     0
      0     0     0]
 [11136   721  6440  5134  1069   743   737  7961  6709 10926 11137     0
      0     0     0]
 [11136     7  1860   862   200 10926 11137     0     0     0     0     0
      0     0     0]
 [11136  7838   508  4876  1762  1944 10376    42  2330 10926 11137     0
      0     0     0]
 [11136    29  2031  5531 10926 11137     0     0     0     0     0     0
      0     0     0]
 [11136  4261    39  2483  3658   151 10926 11137     0     0     0     0
      0     0     0]] [11136] [11137]
[[11223   236     7   317    90     2  3103 11030 11224     0     0     0
      0     0     0]
 [11223    41   279    18    44    13  2418    18    43   111    35  6910
  11013 11224     0]
 [11223    27   126   373   322   157   163   330  1084 11013 11224     0
      0     0     0]
 [11223    88    36 11006     9  

In [13]:
print('Size of Source Vocabulary: ', source_vocab_size)
print('Size of Target Vocabulary: ', target_vocab_size)


Size of Source Vocabulary:  11138
Size of Target Vocabulary:  11225


In [14]:
# Defining a TensorFlow dataset
data_pipeline = tf.data.Dataset.from_tensor_slices(
    (encoded_source_texts, encoded_target_texts))

data_pipeline = data_pipeline.shuffle(len(source_texts), reshuffle_each_iteration=True).batch(
    TRAIN_BATCH_SIZE, drop_remainder=True)

# Optimizing dataset performance
data_pipeline = data_pipeline.prefetch(tf.data.experimental.AUTOTUNE)

In [15]:
def compute_scaled_attention(q_vectors, k_vectors, v_vectors, attention_mask):
    # Compute the dot product of queries and keys
    score_matrix = tf.matmul(q_vectors, k_vectors, transpose_b=True)

    # Determine scaling factor
    scaling_factor = tf.cast(tf.shape(k_vectors)[-1], tf.float32)

    # Scale the score matrix
    scaled_scores = score_matrix / tf.math.sqrt(scaling_factor)

    # Apply masking if provided
    if attention_mask is not None:
        scaled_scores += (attention_mask * -1e9)

    # Compute attention-weighted values
    attention_output = tf.matmul(tf.nn.softmax(scaled_scores, axis=-1), v_vectors)

    return attention_output


In [16]:
class MultiHeadSelfAttention(layers.Layer):

    def __init__(self, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads

    def build(self, input_dims):
        self.model_dim = input_dims[-1]
        assert self.model_dim % self.num_heads == 0, "Model dimension must be divisible by number of heads."

        # Calculate head dimension
        self.head_dim = self.model_dim // self.num_heads

        # Linear transformations for Q, K, V
        self.query_transform = layers.Dense(units=self.model_dim)
        self.key_transform = layers.Dense(units=self.model_dim)
        self.value_transform = layers.Dense(units=self.model_dim)

        # Final linear transformation after multi-head attention
        self.output_transform = layers.Dense(units=self.model_dim)

    def split_heads(self, tensor_input, batch_size):
        reshaped_tensor = tf.reshape(tensor_input, shape=(batch_size, -1, self.num_heads, self.head_dim))
        return tf.transpose(reshaped_tensor, perm=[0, 2, 1, 3])  # (batch_size, num_heads, seq_length, head_dim)

    def call(self, q_input, k_input, v_input, attention_mask):
        batch_size = tf.shape(q_input)[0]

        # Compute Q, K, and V matrices
        q_matrix = self.query_transform(q_input)
        k_matrix = self.key_transform(k_input)
        v_matrix = self.value_transform(v_input)

        # Split into multiple heads
        q_matrix = self.split_heads(q_matrix, batch_size)
        k_matrix = self.split_heads(k_matrix, batch_size)
        v_matrix = self.split_heads(v_matrix, batch_size)

        # Compute attention using scaled dot-product function
        attention_scores = compute_scaled_attention(q_matrix, k_matrix, v_matrix, attention_mask)

        # Rearrange dimensions
        attention_scores = tf.transpose(attention_scores, perm=[0, 2, 1, 3])

        # Concatenate multiple heads
        concatenated_attention = tf.reshape(attention_scores, shape=(batch_size, -1, self.model_dim))

        # Final transformation
        final_output = self.output_transform(concatenated_attention)

        return final_output


In [17]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def compute_angles(self, position, index, model_dim):
        # Compute the angle rates for different dimensions
        angle_rates = 1 / np.power(10000., (2 * (index // 2)) / np.float32(model_dim))
        return position * angle_rates  # Shape: (sequence_length, model_dim)

    def call(self, inputs):
        # Extract sequence length and model dimension
        sequence_length = inputs.shape[-2]
        model_dim = inputs.shape[-1]

        # Compute the angles using broadcasting
        angle_matrix = self.compute_angles(
            np.arange(sequence_length)[:, np.newaxis],  # Position indices
            np.arange(model_dim)[np.newaxis, :],       # Dimension indices
            model_dim
        )

        # Apply sine to even indices and cosine to odd indices
        angle_matrix[:, 0::2] = np.sin(angle_matrix[:, 0::2])
        angle_matrix[:, 1::2] = np.cos(angle_matrix[:, 1::2])

        # Expand for batch compatibility
        pos_encoding = angle_matrix[np.newaxis, ...]

        return inputs + tf.cast(pos_encoding, dtype=tf.float32)


In [18]:
class EncoderLayer(layers.Layer):
    def __init__(self, ffn_units, num_heads, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.ffn_units = ffn_units  # Feed Forward Network units
        self.num_heads = num_heads  # Number of attention heads
        self.dropout_rate = dropout_rate  # Dropout rate

        # Multi-head Attention layer
        self.multi_head_attention = MultiHeadSelfAttention(self.num_heads)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)  # Layer Norm 1

        # Feed Forward Network (FFN)
        self.ffn_dense1 = layers.Dense(units=self.ffn_units, activation="relu")
        self.ffn_dense2 = layers.Dense(units=None)  # Output size is determined in `build`
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)  # Layer Norm 2

    def build(self, input_shape):
        self.d_model = input_shape[-1]  # Get embedding dimension dynamically
        self.ffn_dense2.units = self.d_model  # Set FFN output units dynamically

    def call(self, inputs, mask=None, training=False):
        # Multi-head self-attention
        attention_output = self.multi_head_attention(inputs, inputs, inputs, mask)
        attention_output = self.dropout_1(attention_output, training=training)
        attention_output = self.norm_1(inputs + attention_output)  # Add & Norm

        # Feed Forward Network
        ffn_output = self.ffn_dense1(attention_output)
        ffn_output = self.ffn_dense2(ffn_output)
        ffn_output = self.dropout_2(ffn_output, training=training)
        ffn_output = self.norm_2(attention_output + ffn_output)  # Add & Norm

        return ffn_output


In [19]:
class Encoder(layers.Layer):
    def __init__(self, n_layers, ffn_units, num_heads, dropout_rate, vocab_size, d_model, name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.n_layers = n_layers
        self.d_model = d_model

        # Embedding and Positional Encoding
        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Stacking multiple Encoder Layers
        self.enc_layers = [
            EncoderLayer(ffn_units, num_heads, dropout_rate) for _ in range(n_layers)
        ]

    def call(self, inputs, mask=None, training=False):
        # Convert tokens into embeddings
        embeddings = self.embedding(inputs)
        embeddings *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # Scale embeddings
        embeddings = self.pos_encoding(embeddings)  # Add positional encodings
        embeddings = self.dropout(embeddings, training=training)

        # Pass through stacked encoder layers
        output = embeddings
        for layer in self.enc_layers:
            output = layer(output, mask, training=training)

        return output


In [20]:
class DecoderLayer(layers.Layer):
    def __init__(self, ffn_units, num_heads, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.ffn_units = ffn_units
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        # Self-attention (masked causal attention)
        self.self_attention = MultiHeadSelfAttention(self.num_heads)
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Encoder-decoder attention
        self.enc_dec_attention = MultiHeadSelfAttention(self.num_heads)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        # Feedforward network
        self.ffn1 = layers.Dense(units=self.ffn_units, activation="relu")
        self.ffn2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1=None, mask_2=None, training=False):
        # Masked self-attention (causal attention)
        attn_output = self.self_attention(inputs, inputs, inputs, mask_1)
        attn_output = self.dropout_1(attn_output, training=training)
        attn_output = self.norm_1(attn_output + inputs)  # Residual connection

        # Encoder-decoder attention
        attn_output_2 = self.enc_dec_attention(attn_output, enc_outputs, enc_outputs, mask_2)
        attn_output_2 = self.dropout_2(attn_output_2, training=training)
        attn_output_2 = self.norm_2(attn_output_2 + attn_output)  # Residual connection

        # Feedforward network
        ffn_output = self.ffn1(attn_output_2)
        ffn_output = self.ffn2(ffn_output)
        ffn_output = self.dropout_3(ffn_output, training=training)
        output = self.norm_3(ffn_output + attn_output_2)  # Residual connection

        return output


In [21]:
class Decoder(layers.Layer):
    def __init__(self,
                 n_layers,
                 ffn_units,
                 num_heads,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.n_layers = n_layers

        # Embedding layer
        self.embedding = layers.Embedding(vocab_size, d_model)

        # Positional encoding
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Stacked Decoder layers
        self.dec_layers = [DecoderLayer(ffn_units, num_heads, dropout_rate)
                           for _ in range(n_layers)]

    def call(self, inputs, enc_outputs, mask_1=None, mask_2=None, training=False):
        # Compute embeddings and scale
        x = self.embedding(inputs)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Apply positional encoding
        x = self.pos_encoding(x)
        x = self.dropout(x, training=training)

        # Pass through stacked decoder layers
        for layer in self.dec_layers:
            x = layer(x, enc_outputs, mask_1, mask_2, training=training)

        return x


In [22]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 n_layers,
                 ffn_units,
                 num_heads,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)

        # Encoder
        self.encoder = Encoder(n_layers, ffn_units, num_heads, dropout_rate, vocab_size_enc, d_model)

        # Decoder
        self.decoder = Decoder(n_layers, ffn_units, num_heads, dropout_rate, vocab_size_dec, d_model)

        # Final linear layer
        self.final_layer = layers.Dense(units=vocab_size_dec, name="output_layer")

    def create_padding_mask(self, seq):
        """Create a mask for padding tokens (0s)."""
        return tf.cast(tf.math.equal(seq, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        """Create a look-ahead mask for causal attention."""
        seq_len = tf.shape(seq)[1]
        return 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

    def call(self, enc_inputs, dec_inputs, training=False):
        """Forward pass through the Transformer model."""
        # Create padding masks
        enc_padding_mask = self.create_padding_mask(enc_inputs)
        dec_padding_mask = self.create_padding_mask(enc_inputs)  # Encoder-decoder attention

        # Create look-ahead mask
        look_ahead_mask = self.create_look_ahead_mask(dec_inputs)
        combined_mask = tf.maximum(self.create_padding_mask(dec_inputs), look_ahead_mask)

        # Pass through encoder
        enc_outputs = self.encoder(enc_inputs, enc_padding_mask, training=training)

        # Pass through decoder
        dec_outputs = self.decoder(dec_inputs, enc_outputs, combined_mask, dec_padding_mask, training=training)

        # Final output projection
        return self.final_layer(dec_outputs)


In [23]:
def loss_function(target, pred):
    """Computes masked loss for sequence modeling."""
    mask = tf.math.logical_not(tf.math.equal(target, 0))  # Ignore padding (0s)

    loss = loss_object(target, pred)  # Compute loss using the defined loss function
    mask = tf.cast(mask, dtype=loss.dtype)  # Convert mask to match loss dtype
    loss *= mask  # Apply the mask

    return tf.reduce_sum(loss) / tf.reduce_sum(mask)  # Normalize by non-masked elements


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Custom learning rate schedule following the Transformer paper."""

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Computes the learning rate for a given step."""
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [24]:
import time
import tensorflow as tf

def main_train(dataset, transformer, n_epochs, print_every=50):
    """Train the Transformer model for n_epochs using the dataset."""

    losses = []
    accuracies = []

    for epoch in range(1, n_epochs + 1):  # 1-based indexing for better readability
        print(f"Starting epoch {epoch}")
        start_time = time.time()

        # Reset loss and accuracy for each epoch
        train_loss.reset_state()
        train_accuracy.reset_state()

        for batch, (enc_inputs, targets) in enumerate(dataset):
            # Prepare decoder inputs and outputs
            dec_inputs = targets[:, :-1]  # Remove last token for input
            dec_outputs_real = targets[:, 1:]  # Shift right for expected output

            with tf.GradientTape() as tape:
                predictions = transformer(enc_inputs, dec_inputs, training=True)
                loss = loss_function(dec_outputs_real, predictions)

            # Compute gradients and apply optimizer
            gradients = tape.gradient(loss, transformer.trainable_variables)
            optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

            # Update metrics
            train_loss(loss)
            train_accuracy(dec_outputs_real, predictions)

            # Logging at intervals
            if batch % print_every == 0:
                losses.append(train_loss.result().numpy())
                accuracies.append(train_accuracy.result().numpy())
                print(f"Epoch {epoch} Batch {batch} Loss: {train_loss.result():.4f} Accuracy: {train_accuracy.result():.4f}")

        # Epoch summary
        epoch_time = time.time() - start_time
        print(f"Epoch {epoch} completed in {epoch_time:.2f} seconds.\n")

    return losses, accuracies


In [25]:
# Set hyperparamters for the model
D_MODEL = 512 # 512
N_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
N_HEADS = 8 # 8
DROPOUT_RATE = 0.1 # 0.1



In [29]:
# Clean the session
tf.keras.backend.clear_session()

# Create the Transformer model
transformer = Transformer(
    vocab_size_enc=source_vocab_size,
    vocab_size_dec=target_vocab_size,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    ffn_units=FFN_UNITS,
    num_heads=N_HEADS,
    dropout_rate=DROPOUT_RATE
)
 

# Define loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)

# Define metrics for tracking training progress
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

# Create a learning rate scheduler
learning_rate = CustomSchedule(D_MODEL)

# Define the Adam optimizer with decay and numerical stability tweaks
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)


In [38]:
# Train the Transformer model
losses, accuracies = main_train(data_pipeline, transformer, TRAIN_EPOCHS, print_every=100)

Starting epoch 1
Epoch 1 Batch 0 Loss: 2.7004 Accuracy: 0.3225
Epoch 1 Batch 100 Loss: 2.8332 Accuracy: 0.3154
Epoch 1 Batch 200 Loss: 2.8360 Accuracy: 0.3144
Epoch 1 Batch 300 Loss: 2.8401 Accuracy: 0.3149
Epoch 1 completed in 350.68 seconds.

Starting epoch 2
Epoch 2 Batch 0 Loss: 2.5082 Accuracy: 0.3426
Epoch 2 Batch 100 Loss: 2.5152 Accuracy: 0.3376
Epoch 2 Batch 200 Loss: 2.5285 Accuracy: 0.3366
Epoch 2 Batch 300 Loss: 2.5464 Accuracy: 0.3362
Epoch 2 completed in 355.59 seconds.

Starting epoch 3
Epoch 3 Batch 0 Loss: 2.2536 Accuracy: 0.3661
Epoch 3 Batch 100 Loss: 2.2321 Accuracy: 0.3594
Epoch 3 Batch 200 Loss: 2.2573 Accuracy: 0.3573
Epoch 3 Batch 300 Loss: 2.2783 Accuracy: 0.3564
Epoch 3 completed in 354.70 seconds.

Starting epoch 4
Epoch 4 Batch 0 Loss: 1.7051 Accuracy: 0.3951
Epoch 4 Batch 100 Loss: 1.9418 Accuracy: 0.3824
Epoch 4 Batch 200 Loss: 2.0023 Accuracy: 0.3765
Epoch 4 Batch 300 Loss: 2.0463 Accuracy: 0.3738
Epoch 4 completed in 353.95 seconds.

Starting epoch 5
Epo

In [39]:
def predict(inp_sentence, tokenizer_in, tokenizer_out, target_max_len):
    # Tokenize and add special tokens
    inp_sentence = start_token_source + tokenizer_in.encode(inp_sentence) + end_token_source
    enc_input = tf.expand_dims(inp_sentence, axis=0)

    # Initialize decoder input with SOS token
    out_sentence = start_token_target
    output = tf.expand_dims(out_sentence, axis=0)

    for _ in range(target_max_len):
        # Get predictions from Transformer
        predictions = transformer(enc_input, output, training=False)

        # Get the highest probability token
        prediction = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

        # Check for EOS token and break
        if predicted_id == end_token_target:
            return tf.squeeze(output, axis=0)

        # Append predicted token to output
        output = tf.concat([output, predicted_id], axis=-1)

    # Decode token IDs to text
    return tf.squeeze(output, axis=0)


In [40]:
def translate(sentence):
    # Get the predicted sequence for the input sentence
    output = predict(sentence, source_tokenizer, target_tokenizer,SEQ_MAX_LENGTH)

    # Convert token IDs back to text, stopping at the EOS token
    predicted_sentence = target_tokenizer.decode(
        [i for i in output if i < start_token_target]  # Stop at EOS token
    )

    return predicted_sentence

In [46]:
# Extract text data from training data
ara_texts = [ex['sourceString'] for ex in validation_set]  # Arabic inputs
eng_texts = [ex['targetString'] for ex in validation_set]  # Expected English outputs

print("--------EXAMPLES OF TRANSLATION--------")
for i in range(10):
    predicted_sentence = translate(ara_texts[i])  # Get model prediction
    print(f"Arabic Text: {ara_texts[i]}")
    print(f"Predicted English: {predicted_sentence}")  # Model's output
    print(f"Expected English: {eng_texts[i]}")  # Ground truth
    print("-" * 40)

--------EXAMPLES OF TRANSLATION--------
Arabic Text: عمرك رايح المكسيك؟
Predicted English: Where are you going to Mexico?
Expected English: Have you ever been to Mexico?
----------------------------------------
Arabic Text: فكرنا انه طبيعي لازم يتعاقب.
Predicted English: It thought that we should be here.
Expected English: We thought that it was natural that he should be punished.
----------------------------------------
Arabic Text: لازم تترك الامور تاخذ مجراها الطبيعي.
Predicted English: You must take things to things things things things things know.
Expected English: You must let things take their own course.
----------------------------------------
Arabic Text: لا يريدون استخ.
Predicted English: They don't want to use it.
Expected English: They don't want to use it.
----------------------------------------
Arabic Text: مقبلت تحجي شي اكثر عن الموضوع.
Predicted English: I would like to get something like I would like it.
Expected English: She declined to say more about it.
---------

In [43]:
# Save the trained tokenizers using their native method
source_tokenizer.save_to_file('source_tokenizer.subword')
target_tokenizer.save_to_file('target_tokenizer.subword')

In [44]:
transformer.save_weights("arabic_to_english_transformer_weights.weights.h5")