In [2]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [19]:
import math
import random
import numpy as np
import os
import tensorflow
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Lambda, Embedding, LSTM, Dense, TimeDistributed, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy



# from Helper_Functions import *

In [5]:
import math
import numpy as np


def filter_by_length(max_length, min_length=0, length_keys=None, length_axis=0):
    assert max_length is not None or min_length is not None
    length_keys = length_keys or [0, 1]
    length_fn = lambda x: _length_fn(x, length_axis, length_keys)

    def filtered(gen):
        for example in gen:
            example_len = length_fn(example)

            # Checking max length boundary.
            if max_length is not None:
                if example_len > max_length:
                    continue
            # Checking min length boundary.
            if min_length is not None:
                if example_len < min_length:
                    continue
            # Within bounds.
            yield example

    return filtered


def _length_fn(example, length_axis, length_keys):
    """Length is the maximum of shape on length_axis over length_keys."""
    if isinstance(example, (list, tuple)):
        return max([example[i].shape[length_axis] for i in length_keys])
    return example.shape[length_axis]


def Bucket_By_Length(boundaries, batch_sizes,
                     length_keys=None, length_axis=0, strict_pad_on_len=False):
    """Returns a function for bucketing inputs, see `bucket_by_length`."""
    length_keys = length_keys or [0, 1]
    # In all cases so far, we use a length function of the following form.
    length_fn = lambda x: _length_fn(x, length_axis, length_keys)
    return lambda g: bucket_by_length(  # pylint: disable=g-long-lambda
        g, length_fn, boundaries, batch_sizes, strict_pad_on_len)


def bucket_by_length(generator, length_fn, boundaries, batch_sizes,
                     strict_pad_on_len=False):
    """Bucket by length, like tf.data.experimental.bucket_by_sequence_length.

  This function draws examples from the provided `generator` and puts an
  example into a bucket depending on `l = length_fn(example)`. Which bucket
  is used depends on between which `boundaries` is l. When a bucket reaches
  its batch size, as specified by `batch_sizes`, generates a batch of
  padded examples from this bucket.

  Args:
    generator: python generator to draw data from.
    length_fn: a function taking the example and returning the length.
    boundaries: a list of bucket boundaries.
    batch_sizes: a list of batch sizes.
    strict_pad_on_len: bool; if true we pad on the length dimension, dim[0]
      strictly as a multiple of boundary.

  Yields:
    An input batch, which comes from one of the buckets.
  """
    buckets = [[] for _ in range(len(batch_sizes))]
    boundaries = boundaries + [math.inf]  # Max boundary is unlimited.
    for example in generator:
        length = length_fn(example)
        # `bucket_idx` will always be < len(boundaries), since boundaries is right
        # padded by `math.inf`.
        bucket_idx = min([i for i, b in enumerate(boundaries) if length <= b])
        buckets[bucket_idx].append(example)
        if len(buckets[bucket_idx]) == batch_sizes[bucket_idx]:
            batched = zip(*buckets[bucket_idx])
            boundary = boundaries[bucket_idx]
            boundary = None if boundary == math.inf else boundary
            padded_batch = tuple(
                pad_to_max_dims(x, boundary, strict_pad_on_len) for x in batched)
            yield padded_batch
            buckets[bucket_idx] = []


def pad_to_max_dims(tensors, boundary=None, strict_pad_on_len=False):
    """Pad a tuple of tensors to a joint dimension and return their batch.

  For example, a pair of tensors of shape (2, 10) and (3, 9) will be padded
  to (3, 10) both and the returned tensor will have shape (2, 3, 10).

  When boundary is specified, we try to pad all unknown dimensions to boundary
  if possible, which can help reduce the number of different shapes occurring
  in the tensors and speed up XLA compilation. So, for example, a pair of
  tensors of shapes (8, 10), (8, 9) with boundary=12 will be padded to (8, 12).

  One special case occurs when boundary is much higher than the padding length
  that we'd use without boundary. For example, tensors (2, 10) and (3, 9) with
  boundary=12 could end up padded to (12, 12), but this is very wasteful in
  the first dimension. In that case, we will use the closest power-of-2 instead
  of the boundary, so the we will end up padding to (4, 12) instead of (12, 12).

  Args:
    tensors: a tuple or list of tensors to pad
    boundary: int or None; if given, expand the padded dimensions to this size
    strict_pad_on_len: bool; if true we pad on the length dimension, dim[0]
      strictly as a multiple of boundary.

  Returns:
    a tensor, the tensors padded together
  """
    # TODO(afrozm): Unify this later.
    if ((boundary is not None) and
            (strict_pad_on_len or isinstance(boundary, (list, tuple)))):
        ndim = tensors[0].ndim
        if not isinstance(boundary, (list, tuple)):
            boundary = [boundary] * ndim

        if ndim != len(boundary):
            raise ValueError(f'ndim != len(boundary) - '
                             f'ndim({ndim}) vs boundary({boundary}) '
                             f'len(boundary) = {len(boundary)}.')

        max_len_per_dim = [0] * ndim
        for tensor in tensors:
            max_len_per_dim = [
                max(e, s) for e, s in zip(tensor.shape, max_len_per_dim)]

        # Round everything up to a multiple of boundary in the respective dimension.
        len_per_dim = [
            max_len_per_dim[i] if not b else b * math.ceil(max_len_per_dim[i] / b)
            for i, b in enumerate(boundary)]

        padded_tensors = [
            np.pad(t, [(0, len_per_dim[i] - t.shape[i]) for i in range(ndim)],
                   mode='constant', constant_values=t.dtype.type(0))
            for t in tensors]

        return np.stack(padded_tensors)

    max_len_to_pad = []
    padding_needed = False
    dim = len(tensors[0].shape)
    for i in range(dim):
        max_len = max([t.shape[i] for t in tensors])
        min_len = min([t.shape[i] for t in tensors])
        if max_len == min_len and max_len == boundary:  # No padding needed.
            max_len_to_pad.append(max_len)
        elif boundary is None:
            max_len_to_pad.append(max_len)
            padding_needed = True
        else:
            padding_needed = True
            cur_boundary = max(max_len, boundary)
            if 2 * max_len < cur_boundary:
                cur_boundary = 2 ** int(np.ceil(np.log2(max_len)))
            max_len_to_pad.append(cur_boundary)
    if not padding_needed:
        return np.stack(tensors)
    padded_tensors = []
    for t in tensors:
        pad_widths = [(0, max_len_to_pad[i] - t.shape[i]) for i in range(dim)]
        padded_t = np.pad(t, pad_widths, mode='constant',
                          constant_values=t.dtype.type(0))
        padded_tensors.append(padded_t)
    return np.stack(padded_tensors)


def Add_Loss_Weights(id_to_mask=None):  # pylint: disable=invalid-name
    """Returns a function to add loss weights; see `add_loss_weights`."""
    return lambda g: add_loss_weights(g, id_to_mask=id_to_mask)


def add_loss_weights(generator, id_to_mask=None):
    """Add weights to inputs without weights and masks by id if requested.

  The generator stream is augmented in the following way:

  - If the stream consists of pairs `(inputs, targets)`, a loss mask is added
    that is creates as a tensor of ones of the same shape as targets.
  - If `id_to_mask` is not `None`, and the stream (after the previous point)
    has triples `(inputs, targets, weights)`, the weights are multiplied by a
    0/1 mask that is 0 iff targets is equal to `id_to_mask` (1 otherwise).

  Args:
    generator: Stream of tuples.
    id_to_mask: If not None, int-valued id that represents padding, as opposed
        to true target IDs.

  Yields:
    Examples from the augmented stream.
  """
    for example in generator:
        if len(example) > 3 or len(example) < 2:
            assert id_to_mask is None, 'Cannot automatically mask this stream.'
            yield example
        else:
            if len(example) == 2:
                weights = np.ones_like(example[1]).astype(np.float32)
            else:
                weights = example[2].astype(np.float32)
            mask = 1.0 - np.equal(example[1], id_to_mask).astype(np.float32)
            weights *= mask
            output = (example[0], example[1], weights)
            yield output


import tensorflow as tf


class ShiftRightLayer(tf.keras.layers.Layer):
    def __init__(self, n_positions=1, mode='train', **kwargs):
        super(ShiftRightLayer, self).__init__(**kwargs)
        self.n_positions = n_positions
        self.mode = mode

    def call(self, x):
        if self.mode == 'predict':
            return x
        else:
            # Calculate padding widths
            pad_widths = [[0, 0], [self.n_positions, 0],
                          [0, 0]]  # Assuming input shape [batch_size, seq_length, features]

            # Pad the input tensor with zeros
            padded = tf.pad(x, pad_widths, constant_values=0)

            # Slice to remove the added padding
            return padded[:, :-self.n_positions, :]


In [6]:
# Creating a Tokenizer class for converting the input to ouput

class TokenizedDataStream:
    def __init__(self, examples, vocab_size=10000, max_length=512):
        self.examples = examples
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.EOS = 1

        self.pt_lines, self.en_lines = self.extract_lines(examples)
        self.pt_tokenizer, self.en_tokenizer = self.create_tokenizers(self.pt_lines, self.en_lines)
        self.pt_sequences, self.en_sequences = self.convert_to_sequences(self.pt_lines, self.en_lines)

    def extract_lines(self, examples):
        pt_lines = [pt.numpy().decode('utf-8') for pt, _ in examples]
        en_lines = [en.numpy().decode('utf-8') for _, en in examples]
        return pt_lines, en_lines

    def create_tokenizers(self, pt_lines, en_lines):
        pt_tokenizer = self.create_tokenizer(pt_lines, self.vocab_size)
        en_tokenizer = self.create_tokenizer(en_lines, self.vocab_size)
        return pt_tokenizer, en_tokenizer

    def create_tokenizer(self, lines, vocab_size):
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(lines)
        return tokenizer

    def convert_to_sequences(self, pt_lines, en_lines):
        pt_sequences = self.pt_tokenizer.texts_to_sequences(pt_lines)
        en_sequences = self.en_tokenizer.texts_to_sequences(en_lines)
        return pt_sequences, en_sequences

    def append_eos(self, inputs, targets):
        for input, target in zip(inputs, targets):
            # Append EOS to each sentence
            input_seq = list(input) + [self.EOS]
            target_seq = list(target) + [self.EOS]
            yield np.array(input_seq), np.array(target_seq)

    def get_tokenized_stream(self):
        pt_sequences, en_sequences = self.pt_sequences, self.en_sequences
        return self.append_eos(en_sequences, pt_sequences)

    def detokenize(self, integers, type):
        integers = list(np.squeeze(integers))

        EOS = 1

        if EOS in integers:
            integers = integers[:integers.index(EOS)]

        if type == "Input":
            # Convert integer sequences back to text using tokenizers
            return self.en_tokenizer.sequences_to_texts([integers])[0]

        if type == "Target":
            # Convert integer sequences back to text using tokenizers
            return self.pt_tokenizer.sequences_to_texts([integers])[0]


In [7]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

len(train_examples), len(val_examples)

(51785, 1193)

In [8]:
# Define the boundairs and batch_sizes for the bucketing
boundaries = [8, 16, 32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

train_tokenizer = TokenizedDataStream(train_examples)
train_stream = train_tokenizer.get_tokenized_stream()

val_tokenizer = TokenizedDataStream(val_examples)
val_stream = val_tokenizer.get_tokenized_stream()

# Defining the vocab size
target_vocab_size = train_tokenizer.en_tokenizer.num_words
input_vocab_size = train_tokenizer.pt_tokenizer.num_words

# Creating the bucket -- it is the technique to group the data of similary length in one batch and process the whole data like this
train_batch_stream = Bucket_By_Length(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(train_stream)

eval_batch_stream = Bucket_By_Length(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(val_stream)


In [9]:
train_data_list = []
eval_data_list = []
# Load train data
for data_point in train_batch_stream:
    input_data,  target_data = data_point
    input_data = tf.convert_to_tensor(input_data, dtype=tf.float32)
    target_data = tf.convert_to_tensor(target_data, dtype=tf.float32)
    train_data_list.append((input_data, target_data))

# Load eval data
for data_point in eval_batch_stream:
    input_data, target_data = data_point
    input_data = tf.convert_to_tensor(input_data, dtype=tf.float32)
    target_data = tf.convert_to_tensor(target_data, dtype=tf.float32)
    eval_data_list.append((input_data, target_data))

In [10]:
def data_generator(data):
    for input_data, target_data in data:
        yield [input_data, target_data], target_data

In [11]:
# Out input encoder is defined like this  ---
# input_token -> embeddings -> token_embeddings -> LSTM*(how many encoder layers we want) -> It will return these encoder outputs

def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):

    """ Input encoder runs on the input sentence and creates
    activations that will be the keys and values for attention.

    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int:  depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tf.keras.Model: The input encoder
    """

    # Define input layer
    inputs = Input(shape=(None,))  # we are defining the None shape so that it can pass the variable length batch_Size

    # Create an embedding layer to convert tokens to vectors
    embedding = Embedding(input_dim=input_vocab_size, output_dim=d_model)(inputs)

    # Create a list of LSTM layers
    encoder_layers = [LSTM(units=d_model, return_sequences=True) for _ in range(n_encoder_layers)]

    # Apply LSTM layers in sequence
    activations = embedding
    for layer in encoder_layers:
        activations = layer(activations)

    # Create the model
    input_encoder = Model(inputs, activations)

    return input_encoder

In [12]:
# Now we want a pre-attention-decoder
# This will run on targets and creates a output that we will use as Queries in the attention layer.
# Target_token -> ShiftRight(for teacher forcing) -> <SOS>+targret token -> Embeddings -> token Embeddings -> LSTM(only single layer used) -> output
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """ Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.

    Args:
        mode: str: 'train' or 'eval'
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
    Returns:
        tf.keras.Model: The pre-attention decoder
    """

    # Define input layer
    inputs = Input(shape=(None,))

    # Shift right to insert start-of-sentence token and implement teacher forcing during training
    # shifted_right = ShiftRightLayer(mode=mode,n_positions=1)(inputs) # This will shift the right to insert the start of sentence token and implement teacher forcing during training
    # model will be training or Inference -- it will behave different
    # TO-DO -- need to add the mode function.(Here)
    # Create a tensor with zeros of the same batch size and sequence length
    batch_size = tf.shape(inputs)[0]
    zero_padding = tf.zeros((batch_size,1), dtype = tf.float32)
    # print(zero_padding)
    shifted_right = tf.concat([zero_padding, inputs[:, :-1]], axis=1)

    # Create an embedding layer to convert tokens to vectors
    embedding = Embedding(input_dim=target_vocab_size, output_dim=d_model)(shifted_right)

    # Create an LSTM layer
    lstm = LSTM(units=d_model, return_sequences=True)(embedding)

    # Create the model
    pre_attention_decoder = Model(inputs, lstm)

    return pre_attention_decoder



In [13]:
# This function will prepare the inputs to the attention layer.
# We want to take in the encoder and pre-attention decoder activations and assign it to the queries, keys, and values
# TO-DO -- Need to add the mask token here so that ( the masked token will not have effect when calculating the softmax for probabilites)

def prepare_attention_input(encoder_activations, decoder_activations):
    """Prepare queries, keys, values, and mask for attention.

    Args:
        encoder_activations: tf.Tensor (batch_size, padded_input_length, d_model): output from the input encoder
        decoder_activations: tf.Tensor (batch_size, padded_input_length, d_model): output from the pre-attention decoder
        inputs: tf.Tensor (batch_size, padded_input_length): padded input tokens

    Returns:
        queries, keys, values, and mask for attention.
    """

    # Set the keys and values to the encoder activations
    keys = encoder_activations
    values = encoder_activations

    # Set the queries to the decoder activations
    queries = decoder_activations

    return queries, keys, values

In [25]:
def NMTAttn(input_vocab_size=1000,
            target_vocab_size=1000,
            d_model=1024,
            n_encoder_layers=4,
            n_decoder_layers=4,
            n_attention_heads=8,
            attention_dropout=0.2,
            mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
        input_vocab_size: int: vocab size of the input
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
        n_decoder_layers: int: number of LSTM layers in the decoder after attention
        n_attention_heads: int: number of attention heads
        attention_dropout: float, dropout for the attention layer
        mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
        A LSTM sequence-to-sequence model with attention.
    """

    # Step 0: Define input layers
    input_tokens = Input(shape=(None,))
    target_tokens = Input(shape=(None,))
    # masked_tokens = Input(shape=(None,))


    # Step 1: call the helper function to create layers for the input encoder
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # Step 1: call the helper function to create layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)


    # Step 2: Copy input tokens and target tokens as they will be needed later
    concatenated_tokens = Concatenate(axis=1)([input_tokens, target_tokens]) ## Concate and stored in concated tokens.

    # Step 3: Run input encoder on the input and pre-attention decoder on the target
    input_encoder_output = input_encoder(input_tokens[:, :, tf.newaxis])  ## New-axis will just add the new dimension (B,T) now we will have (B,T,1)
    pre_attention_decoder_output = pre_attention_decoder(target_tokens[:, :, tf.newaxis])


    # Step 4: Prepare queries, keys, values, and mask for attention
    queries, keys, values = prepare_attention_input(input_encoder_output,
                                                    pre_attention_decoder_output)

    # Step 5: Run the AttentionQKV layer and nest it inside a Residual layer
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=n_attention_heads, key_dim=d_model, dropout=attention_dropout)

    attention_output = attention_layer(queries, keys, values) + queries  ## Will calculate the attention and add the residual connection(queries back to model)

    # TO-DO -- add the mask here
    # Step 6: Drop attention mask (i.e., index = None)
    # attention_output = attention_output[:, :, :, :d_model]
    # need to add then mask will be added

    # Step 7: Run the rest of the LSTM decoder ( This will be our final decoder which will take the activation_ouput and return the probabilites)
    lstm_layers = [LSTM(units=d_model, return_sequences=True) for _ in range(n_decoder_layers)]
    decoder_output = attention_output
    for layer in lstm_layers:
        decoder_output = layer(decoder_output)

    # Step 8: Prepare output by making it the right size
    output = Dense(target_vocab_size)(decoder_output) ## Added the dense layer

    # output = Dense(1)(output)

    # Step 9: Log-softmax for output
    output = tf.math.log(tf.nn.softmax(output, axis=1)) #Taking logSoftmax

    # output = tf.squeeze(output, axis=-1)

    # Create the final model
    model = Model(inputs=[input_tokens, target_tokens], outputs=output)  # Defining the final model

    return model

In [26]:
print(input_vocab_size, target_vocab_size)
model = NMTAttn(input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, d_model=1024, mode='train')
print(model.summary())

10000 10000
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 tf.__operators__.getitem_7 (Sl  (None, None, 1)     0           ['input_6[0][0]']                
 icingOpLambda)                                                                                   
                                                                                                  
 tf.__operators__.getitem_6 (Sl  (None, None, 1)     0           ['input_5[0][0]

In [27]:
# Define the loss function
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
# Define the optimizer with the specified learning rate
optimizer = Adam(learning_rate=0.01)
# Define a metric for accuracy
accuracy_metric = SparseCategoricalAccuracy()

In [28]:
# Define the output directory
output_dir = 'output_dir/'

#Remove old model if it exists to restart training
import os

if os.path.exists(output_dir + 'model.h5'):
    os.remove(output_dir + 'model.h5')

In [29]:
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

In [30]:
num_epochs = 500
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_data_generator = data_generator(train_data_list)
    eval_data_generator = data_generator(eval_data_list)

    # # Training step
    history = model.fit(
        train_data_generator,
        epochs=1,  # One epoch at a time,
        verbose=1  # Set to 1 for progress updates
    )

    # Extract training loss and accuracy from the history object
    train_loss = history.history['loss'][0]
    train_accuracy = history.history['sparse_categorical_accuracy'][0]

    # Validation step
    eval_loss, eval_accuracy = model.evaluate(eval_data_generator, verbose=1)

    if epoch % 10 == 0 or epoch == num_epochs - 1:
        # Print training and validation metrics
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Validation Loss: {eval_loss:.4f}, Validation Accuracy: {eval_accuracy:.4f}")
    
        # Save the model checkpoint after each epoch
        model.save(output_dir + f"model_epoch_{epoch + 1}.h5")

Epoch 1/500
Train Loss: 5.3923, Train Accuracy: 0.3465
Validation Loss: 5.3364, Validation Accuracy: 0.3407
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Train Loss: nan, Train Accuracy: 0.3480
Validation Loss: nan, Validation Accuracy: 0.3407
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
     34/Unknown - 3s 80ms/step - loss: nan - sparse_categorical_accuracy: 0.3498

KeyboardInterrupt: 