In [1]:
# Implementing the assignment 1 in Keras

In [6]:
import math
import random
import numpy as np
import tensorflow
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from Helper_Functions import *

In [7]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.layers import Input, Lambda, Embedding, LSTM, Dense, TimeDistributed, Concatenate
from tensorflow.keras.models import Model

In [8]:
# Creating a Tokenizer class for converting the input to ouput

class TokenizedDataStream:
    def __init__(self, examples, vocab_size=10000, max_length=512):
        self.examples = examples
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.EOS = 1

        self.pt_lines, self.en_lines = self.extract_lines(examples)
        self.pt_tokenizer, self.en_tokenizer = self.create_tokenizers(self.pt_lines, self.en_lines)
        self.pt_sequences, self.en_sequences = self.convert_to_sequences(self.pt_lines, self.en_lines)

    def extract_lines(self, examples):
        pt_lines = [pt.numpy().decode('utf-8') for pt, _ in examples]
        en_lines = [en.numpy().decode('utf-8') for _, en in examples]
        return pt_lines, en_lines

    def create_tokenizers(self, pt_lines, en_lines):
        pt_tokenizer = self.create_tokenizer(pt_lines, self.vocab_size)
        en_tokenizer = self.create_tokenizer(en_lines, self.vocab_size)
        return pt_tokenizer, en_tokenizer

    def create_tokenizer(self, lines, vocab_size):
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(lines)
        return tokenizer

    def convert_to_sequences(self, pt_lines, en_lines):
        pt_sequences = self.pt_tokenizer.texts_to_sequences(pt_lines)
        en_sequences = self.en_tokenizer.texts_to_sequences(en_lines)
        return pt_sequences, en_sequences

    def append_eos(self, inputs, targets):
        for input, target in zip(inputs, targets):
            # Append EOS to each sentence
            input_seq = list(input) + [self.EOS]
            target_seq = list(target) + [self.EOS]
            yield np.array(input_seq), np.array(target_seq)

    def get_tokenized_stream(self):
        pt_sequences, en_sequences = self.pt_sequences, self.en_sequences
        return self.append_eos(en_sequences, pt_sequences)

    def detokenize(self, integers, type):
        integers = list(np.squeeze(integers))

        EOS = 1

        if EOS in integers:
            integers = integers[:integers.index(EOS)]

        if type == "Input":
            # Convert integer sequences back to text using tokenizers
            return self.en_tokenizer.sequences_to_texts([integers])[0]

        if type == "Target":
            # Convert integer sequences back to text using tokenizers
            return self.pt_tokenizer.sequences_to_texts([integers])[0]


In [9]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)

In [10]:
train_examples, val_examples = examples['train'], examples['validation']

In [11]:
len(train_examples), len(val_examples)

(51785, 1193)

In [12]:
# Define the boundairs and batch_sizes for the bucketing
boundaries = [8, 16, 32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

train_tokenizer = TokenizedDataStream(train_examples)
train_stream = train_tokenizer.get_tokenized_stream()

val_tokenizer = TokenizedDataStream(val_examples)
val_stream = val_tokenizer.get_tokenized_stream()

# Defining the vocab size
target_vocab_size = train_tokenizer.en_tokenizer.num_words
input_vocab_size = train_tokenizer.pt_tokenizer.num_words

# Creating the bucket -- it is the technique to group the data of similary length in one batch and process the whole data like this
train_batch_stream = Bucket_By_Length(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(train_stream)

eval_batch_stream = Bucket_By_Length(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(val_stream)


In [13]:
train_data_list = []
eval_data_list = []

In [14]:
# Load train data
for data_point in train_batch_stream:
    input_data,  target_data = data_point
    input_data = tf.convert_to_tensor(input_data, dtype=tf.int32)
    target_data = tf.convert_to_tensor(target_data, dtype=tf.int32)
    train_data_list.append((input_data, target_data))

# Load eval data
for data_point in eval_batch_stream:
    input_data, target_data = data_point
    input_data = tf.convert_to_tensor(input_data, dtype=tf.int32)
    target_data = tf.convert_to_tensor(target_data, dtype=tf.int32)
    eval_data_list.append((input_data, target_data))

In [15]:
def data_generator(data):
    for input_data, target_data in data:
        yield [input_data, target_data], target_data

In [16]:
# Let now define the Modules of our model. We are using the functional Api of keras. 

In [17]:
# Out input encoder is defined like this  --- 
# input_token -> embeddings -> token_embeddings -> LSTM*(how many encoder layers we want) -> It will return these encoder outputs

def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):

    """ Input encoder runs on the input sentence and creates
    activations that will be the keys and values for attention.
    
    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int:  depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tf.keras.Model: The input encoder
    """

    # Define input layer
    inputs = Input(shape=(None,))  # we are defining the None shape so that it can pass the variable length batch_Size

    # Create an embedding layer to convert tokens to vectors
    embedding = Embedding(input_dim=input_vocab_size, output_dim=d_model)(inputs)

    # Create a list of LSTM layers
    encoder_layers = [LSTM(units=d_model, return_sequences=True) for _ in range(n_encoder_layers)]

    # Apply LSTM layers in sequence
    activations = embedding
    for layer in encoder_layers:
        activations = layer(activations)

    # Create the model
    input_encoder = Model(inputs, activations)

    return input_encoder

In [129]:
# Now we want a pre-attention-decoder 
# This will run on targets and creates a output that we will use as Queries in the attention layer.
# Target_token -> ShiftRight(for teacher forcing) -> <SOS>+targret token -> Embeddings -> token Embeddings -> LSTM(only single layer used) -> output
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """ Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.

    Args:
        mode: str: 'train' or 'eval'
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
    Returns:
        tf.keras.Model: The pre-attention decoder
    """

    # Define input layer
    inputs = Input(shape=(None,))

    # Shift right to insert start-of-sentence token and implement teacher forcing during training
    # shifted_right = ShiftRightLayer(mode=mode,n_positions=1)(inputs) # This will shift the right to insert the start of sentence token and implement teacher forcing during training
    # model will be training or Inference -- it will behave different
    # TO-DO -- need to add the mode function.(Here)
    # Value to append in front
    value_to_append = tf.constant(0, dtype=tf.int32)
    
    # Create a tensor with zeros of the same batch size and sequence length
    batch_size, sequence_length, _ = target.shape
    zero_padding = tf.zeros((batch_size,1, 1), dtype=tf.int32)
    # print(zero_padding)
    shifted_right = tf.concat([zero_padding, target], axis=1)

    # Create an embedding layer to convert tokens to vectors
    embedding = Embedding(input_dim=target_vocab_size, output_dim=d_model)(shifted_right)

    # Create an LSTM layer
    lstm = LSTM(units=d_model, return_sequences=True)(embedding)

    # Create the model
    pre_attention_decoder = Model(inputs, lstm)

    return pre_attention_decoder



In [130]:
# This function will prepare the inputs to the attention layer. 
# We want to take in the encoder and pre-attention decoder activations and assign it to the queries, keys, and values
# TO-DO -- Need to add the mask token here so that ( the masked token will not have effect when calculating the softmax for probabilites)

def prepare_attention_input(encoder_activations, decoder_activations):
    """Prepare queries, keys, values, and mask for attention.

    Args:
        encoder_activations: tf.Tensor (batch_size, padded_input_length, d_model): output from the input encoder
        decoder_activations: tf.Tensor (batch_size, padded_input_length, d_model): output from the pre-attention decoder
        inputs: tf.Tensor (batch_size, padded_input_length): padded input tokens

    Returns:
        queries, keys, values, and mask for attention.
    """

    # Set the keys and values to the encoder activations
    keys = encoder_activations
    values = encoder_activations

    # Set the queries to the decoder activations
    queries = decoder_activations

    return queries, keys, values

In [137]:
from layers.input_encoder import input_encoder_fn
from layers.pre_attention_inputs import pre_attention_decoder_fn
def NMTAttn(input_vocab_size=1000,
            target_vocab_size=1000,
            d_model=1024,
            n_encoder_layers=1,
            n_decoder_layers=1,
            n_attention_heads=1,
            attention_dropout=0.0,
            mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
        input_vocab_size: int: vocab size of the input
        target_vocab_size: int: vocab size of the target
        d_model: int:  depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
        n_decoder_layers: int: number of LSTM layers in the decoder after attention
        n_attention_heads: int: number of attention heads
        attention_dropout: float, dropout for the attention layer
        mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
        A LSTM sequence-to-sequence model with attention.
    """

    # Step 0: Define input layers
    input_tokens = Input(shape=(None,))
    target_tokens = Input(shape=(None,))
    # masked_tokens = Input(shape=(None,))

    
    # Step 1: call the helper function to create layers for the input encoder
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # Step 1: call the helper function to create layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    
    # Step 2: Copy input tokens and target tokens as they will be needed later
    concatenated_tokens = Concatenate(axis=1)([input_tokens, target_tokens]) ## Concate and stored in concated tokens.

    # Step 3: Run input encoder on the input and pre-attention decoder on the target
    input_encoder_output = input_encoder(input_tokens[:, :, tf.newaxis])  ## New-axis will just add the new dimension (B,T) now we will have (B,T,1)
    pre_attention_decoder_output = pre_attention_decoder(target_tokens[:, :, tf.newaxis])

    
    # Step 4: Prepare queries, keys, values, and mask for attention
    queries, keys, values = prepare_attention_input(input_encoder_output,
                                                    pre_attention_decoder_output)

    # Step 5: Run the AttentionQKV layer and nest it inside a Residual layer
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=n_attention_heads, key_dim=d_model, dropout=attention_dropout)
    
    attention_output = attention_layer(queries, keys, values) + queries  ## Will calculate the attention and add the residual connection(queries back to model)

    # TO-DO -- add the mask here 
    # Step 6: Drop attention mask (i.e., index = None)
    # attention_output = attention_output[:, :, :, :d_model]
    # need to add then mask will be added

    # Step 7: Run the rest of the LSTM decoder ( This will be our final decoder which will take the activation_ouput and return the probabilites)
    lstm_layers = [LSTM(units=d_model, return_sequences=True) for _ in range(n_decoder_layers)]
    decoder_output = attention_output
    for layer in lstm_layers:
        decoder_output = layer(decoder_output)

    # Step 8: Prepare output by making it the right size  
    output = Dense(target_vocab_size)(decoder_output) ## Added the dense layer

    # Step 9: Log-softmax for output
    output = tf.math.log(tf.nn.softmax(output)) #Taking logSoftmax

    # Create the final model
    model = Model(inputs=[input_tokens, target_tokens], outputs=output)  # Defining the final model 

    return model

In [143]:
print(input_vocab_size, target_vocab_size)
model = NMTAttn(input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, d_model=1024, mode='train')
print(model.summary())

10000 10000
Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_26 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_25 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 tf.__operators__.getitem_13 (S  (None, None, 1)     0           ['input_26[0][0]']               
 licingOpLambda)                                                                                  
                                                                                                  
 tf.__operators__.getitem_12 (S  (None, None, 1)     0           ['input_25[0][

In [144]:
# Define the loss function
loss_fn = SparseCategoricalCrossentropy()
# Define the optimizer with the specified learning rate
optimizer = Adam(learning_rate=0.01)
# Define a metric for accuracy
accuracy_metric = SparseCategoricalAccuracy()

In [145]:
# Define the output directory
output_dir = 'output_dir/'

#Remove old model if it exists to restart training
import os

if os.path.exists(output_dir + 'model.h5'):
    os.remove(output_dir + 'model.h5')

In [146]:
## Compile the model
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

In [147]:
num_epochs = 100
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_data_generator = data_generator(train_data_list[:20])
    eval_data_generator = data_generator(eval_data_list[:20])

    # # Training step
    history = model.fit(
        train_data_generator,
        epochs=1,  # One epoch at a time,
        verbose=1  # Set to 1 for progress updates
    )
    
    # Extract training loss and accuracy from the history object
    train_loss = history.history['loss'][0]
    train_accuracy = history.history['sparse_categorical_accuracy'][0]
    
    # Validation step
    eval_loss, eval_accuracy = model.evaluate(eval_data_generator, verbose=1)
    
    # Print training and validation metrics
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {eval_loss:.4f}, Validation Accuracy: {eval_accuracy:.4f}")
    
    # Save the model checkpoint after each epoch
    model.save(output_dir + f"model_epoch_{epoch + 1}.h5")

Epoch 1/100
((<tf.Tensor 'IteratorGetNext:0' shape=(None, None) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, None) dtype=int32>), <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=int32>)
(<tf.Tensor 'IteratorGetNext:0' shape=(None, None) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, None) dtype=int32>) Tensor("IteratorGetNext:2", shape=(None, None), dtype=int32) None
((<tf.Tensor 'IteratorGetNext:0' shape=(None, None) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, None) dtype=int32>), <tf.Tensor 'IteratorGetNext:2' shape=(None, None) dtype=int32>)
(<tf.Tensor 'IteratorGetNext:0' shape=(None, None) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, None) dtype=int32>) Tensor("IteratorGetNext:2", shape=(None, None), dtype=int32) None
Train Loss: 9.2103, Train Accuracy: 0.0001
Validation Loss: 9.2103, Validation Accuracy: 0.0000
Epoch 2/100
Train Loss: 9.2103, Train Accuracy: 0.0001
Validation Loss: 9.2103, Validation Accuracy: 0.0000
Epoc