#Import Statement

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

#Data Preparation

##Download the data

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2024-04-24 14:30:07--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-04-24 14:30:08 (166 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


#Kaggle dataset

In [4]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [00:30<00:00, 140MB/s]
100% 2.54G/2.54G [00:30<00:00, 89.9MB/s]


In [5]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: /content/dataset/en-fr.csv  


In [6]:
dataset = tf.data.experimental.CsvDataset(
  "/content/dataset/en-fr.csv",
  [
    tf.string,
    tf.string
  ],
)

#Data Processing

In [7]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

In [8]:
VOCAB_SIZE=20000
ENGLISH_SEQUENCE_LENGTH=64
FRENCH_SEQUENCE_LENGTH=64
EMBEDDING_DIM=300
BATCH_SIZE=64

In [9]:
english_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [10]:
french_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [11]:
def selector(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

In [12]:
split_dataset=text_dataset.map(selector)

In [13]:
def separator(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return split_text[0:1],'starttoken '+split_text[1:2]+' endtoken'

In [14]:
init_dataset=text_dataset.map(separator)

In [15]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [16]:
english_training_data=init_dataset.map(lambda x,y:x)
english_vectorize_layer.adapt(english_training_data)

In [17]:
french_training_data=init_dataset.map(lambda x,y:y)
french_vectorize_layer.adapt(french_training_data)

In [18]:
def vectorizer(inputs,output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

In [19]:
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [20]:
dataset=split_dataset.map(vectorizer)

In [21]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [22]:
for i in dataset.take(1):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

In [23]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [24]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [25]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [26]:
NUM_BATCHES=int(200000/BATCH_SIZE)

In [27]:
train_dataset=dataset.take(int(0.9*NUM_BATCHES))
val_dataset=dataset.skip(int(0.9*NUM_BATCHES))

In [28]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

#Modeling

##Encoder

In [29]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, units):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.units=units

  def build(self, input_shape):
    self.embedding = Embedding(self.vocab_size, self.embedding_dim)
    self.lstm = LSTM(self.units, return_sequences=True)

  def call(self, x):
     x = self.embedding(x)
     output = self.lstm(x)
     return output


The `Encoder` class is designed to implement an encoder component using an embedding layer followed by an LSTM layer. Here's a breakdown of the key components and functionality of your `Encoder` class:

1. **Initialization**:
   - In the `__init__` method, you initialize the `Encoder` class with parameters such as `vocab_size` (vocabulary size), `embedding_dim` (dimensionality of the embedding space), and `units` (number of units/neurons in the LSTM layer).

2. **Building Layers**:
   - In the `build` method, we define the layers that will be used in the encoder.
   - we create an `Embedding` layer with `self.vocab_size` as the input size and `self.embedding_dim` as the output dimension.
   - we create an `LSTM` layer with `self.units` as the number of units, configured to return sequences (`return_sequences=True`). This configuration is useful if the encoder is part of a sequence-to-sequence model, where the output sequences of the encoder are used by the decoder.

3. **Forward Pass (Call Method)**:
   - In the `call` method, we define the forward pass of the `Encoder`.
   - Given an input tensor `x`, which represents a batch of input sequences (where each sequence is represented as a sequence of integer tokens), you pass `x` through the `Embedding` layer to convert each token into its corresponding dense embedding representation.
   - The output of the `Embedding` layer (`x`) is then passed through the `LSTM` layer (`self.lstm`), which processes the sequence of embeddings and produces a sequence of hidden states (`output`).

4. **Output**:
   - The output of the `LSTM` layer (`output`) represents the encoded representation of the input sequence. If `return_sequences=True`, the output will be a sequence of hidden states corresponding to each timestep of the input sequence; otherwise, it will be the hidden state at the last timestep.

To use this `Encoder` class, we would create an instance of it, passing the required parameters (`vocab_size`, `embedding_dim`, `units`), and then call the instance with input data to obtain the encoded representations of input sequences. This encoded representation can be further used as input to a decoder or any subsequent layers in my sequence-to-sequence model.

In [30]:
HIDDEN_UNITS = 256
EMBEDDING_DIM = 256
encoder=Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
encoder_output=encoder(tf.zeros([128,8]))
print(encoder_output.shape)

(128, 8, 256)


In the above code, we are creating an instance of the `Encoder` class and passing a tensor of zeros as input to obtain the encoded output shape. Here's a breakdown of what each part of your code does:

1. **Initialization**:
   - We define constants `HIDDEN_UNITS` and `EMBEDDING_DIM` with values of 256.
   - We instantiate an `Encoder` object named `encoder` by passing `VOCAB_SIZE`, `EMBEDDING_DIM`, and `HIDDEN_UNITS` as parameters to the `Encoder` constructor.

2. **Calling the Encoder**:
   - We call the `encoder` object with a tensor of shape `[128, 8]` filled with zeros (`tf.zeros([128, 8])`).
   - This tensor represents a batch of input sequences, where:
     - `128` is the batch size (number of sequences in the batch).
     - `8` is the sequence length (number of tokens in each sequence).

3. **Output Shape**:
   - The output of calling the `encoder` with the zero tensor (`encoder(tf.zeros([128, 8]))`) will be the encoded representation of the input sequences.
   - The `print(encoder_output.shape)` statement displays the shape of the `encoder_output`, which corresponds to the shape of the encoded representation.

Given the parameters and input tensor used in your example, here's what happens:
- The input tensor `[128, 8]` represents a batch of 128 sequences, each with a length of 8 tokens.
- The `encoder` processes this input through its `call` method (defined in the `Encoder` class), which involves passing the input tensor through an `Embedding` layer followed by an `LSTM` layer.
- The resulting `encoder_output` will have a shape determined by the `LSTM` layer's configuration (`return_sequences=True` in this case), resulting in a tensor of shape `[128, 8, HIDDEN_UNITS]`, where `HIDDEN_UNITS` is 256 (the number of units specified in your `Encoder` initialization).

Therefore, `encoder_output.shape` will be `(128, 8, 256)`, indicating a batch size of 128, sequence length of 8, and each token represented by a vector of 256 dimensions in the encoded output. This encoded output can be used as input to subsequent layers or models in Our sequence-to-sequence architecture.

##Bahdanau Attention

In [33]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.units = units

  def build(self, input_shape):
    self.w_1 =tf.keras.layers.Dense(self.units)
    self.w_2 =tf.keras.layers.Dense(self.units)
    self.w =tf.keras.layers.Dense(1)

  def call(self, prev_dec_state, enc_states):
    scores =self.w(
        tf.nn.tanh(
            self.w_1(tf.expand_dims(prev_dec_state, -2)) +
            self.w_2(enc_states)
        )
    )

    attention_weights=tf.nn.softmax(scores, axis=1)
    context_vector = attention_weights*enc_states
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

The `BahdanauAttention` class defined is an implementation of the Bahdanau attention mechanism. This mechanism is commonly used in sequence-to-sequence models, especially in the context of neural machine translation.

Here's a breakdown of the components and functionality of this class:

- **Initialization (`__init__`)**:
  - The `__init__` method initializes the attention layer with a specified number of `units`, which determines the dimensionality of the attention mechanism.

- **Building the Layer (`build`)**:
  - The `build` method is called when the layer is first used in a model, allowing you to define the internal variables (`Dense` layers in this case) based on the input shapes.
  - Inside `build`, three `Dense` layers (`self.w_1`, `self.w_2`, and `self.w`) are created:
    - `self.w_1`: A dense layer used to transform the previous decoder state (`prev_dec_state`).
    - `self.w_2`: A dense layer used to transform the encoder states (`enc_states`).
    - `self.w`: A dense layer used to compute attention scores based on the combined transformed states.

- **Applying Attention (`call`)**:
  - The `call` method applies the Bahdanau attention mechanism:
    - It computes the attention scores (`scores`) by feeding the transformed previous decoder state (`prev_dec_state`) and the encoder states (`enc_states`) through the `self.w_1` and `self.w_2` layers and then applies a hyperbolic tangent (`tanh`) activation.
    - The resulting scores are passed through the `self.w` layer to produce attention weights (`attention_weights`) using a softmax activation along the sequence dimension (`axis=1`).
    - The context vector (`context_vector`) is computed by element-wise multiplication of the attention weights and the encoder states (`enc_states`), followed by summing along the sequence dimension (`axis=1`).

- **Output**:
  - The `call` method returns the `context_vector` and `attention_weights`, where:
    - `context_vector` represents the attended context information from the encoder states based on the attention weights.
    - `attention_weights` represents the normalized attention weights indicating the importance of each encoder state for the current decoding step.

This `BahdanauAttention` layer can be integrated into a sequence-to-sequence model, typically within a custom decoder or attention mechanism to enhance the model's ability to focus on relevant parts of the input sequence during decoding. This attention mechanism helps improve the performance of tasks like machine translation by enabling the model to selectively attend to different parts of the input sequence.

In [34]:
bahdanau_attention=BahdanauAttention(256)
context_vector,attention_weights=bahdanau_attention(tf.zeros([128,32]),tf.zeros([128,8,32]))
print(context_vector.shape)
print(attention_weights.shape)

(128, 32)
(128, 8, 1)


In the provided code snippet, we are using an instance of `BahdanauAttention` to compute attention weights and a context vector. Let's break down what happens in this code:


- **Instantiation**:
  - `bahdanau_attention = BahdanauAttention(256)`: Creates an instance of the `BahdanauAttention` class with `units=256`.

- **Applying Attention**:
  - `context_vector, attention_weights = bahdanau_attention(tf.zeros([128, 32]), tf.zeros([128, 8, 32]))`:
    - `tf.zeros([128, 32])`: Represents the previous decoder state (`prev_dec_state`). This tensor has a shape of `(batch_size, decoder_state_dim)`, where `batch_size=128` and `decoder_state_dim=32`.
    - `tf.zeros([128, 8, 32])`: Represents the encoder states (`enc_states`). This tensor has a shape of `(batch_size, sequence_length, encoder_state_dim)`, where `batch_size=128`, `sequence_length=8`, and `encoder_state_dim=32`.

- **Computing Attention**:
  - Inside the `call` method of `BahdanauAttention`, the provided `prev_dec_state` and `enc_states` are used to compute attention weights (`attention_weights`) and a context vector (`context_vector`).
  - The attention scores (`scores`) are computed using `self.w_1` and `self.w_2` layers, followed by applying a `tanh` activation and passing through the `self.w` layer to get the attention weights (`attention_weights`).
  - The context vector (`context_vector`) is computed by applying the attention weights to the encoder states (`enc_states`), resulting in a weighted sum along the sequence dimension.

- **Output Shapes**:
  - `print(context_vector.shape)`: Displays the shape of the computed `context_vector`. The shape will be `(batch_size, encoder_state_dim)`, which is `(128, 32)` based on the provided input shapes.
  - `print(attention_weights.shape)`: Displays the shape of the computed `attention_weights`. The shape will be `(batch_size, sequence_length, 1)`, which is `(128, 8, 1)` based on the provided input shapes.

In summary, the `BahdanauAttention` instance (`bahdanau_attention`) is used to compute attention weights and a context vector based on given previous decoder states and encoder states. The resulting `context_vector` represents the attended context information from the encoder states, and `attention_weights` represent the normalized attention weights indicating the importance of each encoder state for the current decoding step.

##Decoder

In [35]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
    super(Decoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.dec_units = dec_units
    self.sequence_length = sequence_length

  def build(self, input_shape):
    self.dense = Dense(self.vocab_size, activation='softmax')
    self.gru = GRU(
        self.dec_units, return_sequences=True, return_state=True
    )
    self.attention=BahdanauAttention(self.dec_units)
    self.embedding=Embedding(self.vocab_size,self.embedding_dim)

  def call(self, x, hidden, shifted_target):
    outputs = []
    context_vector = []
    attention_weights = []
    shifted_target=self.embedding(shifted_target)

    for t in range(0,self.sequence_length):

      context_vector,attention_weights=self.attention(hidden,x)
      dec_input=context_vector+shifted_target[:,t]
      output,hidden=self.gru(tf.expand_dims(dec_input,1))
      outputs.append(output[:,0])

    outputs=tf.convert_to_tensor(outputs)
    outputs=tf.transpose(outputs, perm=[1,0,2])

    outputs=self.dense(outputs)
    return outputs,attention_weights


The `Decoder` class provided is designed to decode input sequences using an attention mechanism. Let's break down this class and its `call` method:


- **Initialization**:
  - The `Decoder` class is initialized with parameters such as `vocab_size`, `embedding_dim`, `dec_units`, and `sequence_length`. These parameters define the architecture and behavior of the decoder.

- **Building the Model**:
  - In the `build` method, layers for the decoder are defined:
    - `self.dense`: A dense layer with softmax activation that outputs probabilities over the vocabulary.
    - `self.gru`: A GRU layer that operates in a sequence-to-sequence manner, returning sequences (`return_sequences=True`) and states (`return_state=True`).
    - `self.attention`: An instance of `BahdanauAttention`, which computes attention weights and context vectors.
    - `self.embedding`: An embedding layer that converts integer indices into dense vectors.

- **Decoding Process (`call` method)**:
  - During the `call` method:
    - `shifted_target` is embedded to get the target embeddings.
    - Iteratively compute context vectors and attention weights using `BahdanauAttention` for each timestep in the sequence.
    - Construct the decoder input (`dec_input`) by combining the context vector and the corresponding target embedding at each timestep.
    - Pass the `dec_input` through the GRU layer (`self.gru`) to get decoder outputs (`output`) and updated hidden states (`hidden`).
    - Store the outputs (`output[:, 0]`) at each timestep in the `outputs` list.
    - Convert the list of outputs into a tensor and transpose it to the correct shape (`[batch_size, sequence_length, vocab_size]`).
    - Apply the final dense layer (`self.dense`) to the output tensor to obtain logits representing the probability distribution over the vocabulary for each timestep.

In summary, the `Decoder` class defines a decoder model that utilizes an attention mechanism (`BahdanauAttention`) to decode input sequences into output sequences. The decoder processes input embeddings alongside context vectors derived from attention to generate output sequences with probabilities over the target vocabulary.

In [36]:
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
outputs,attention_weights=decoder(encoder_output,tf.zeros([128,HIDDEN_UNITS]),tf.zeros([128,64]))
print(outputs.shape)
print(attention_weights.shape)

(128, 64, 20000)
(128, 8, 1)


We're initializing an instance of the `Decoder` class and using it to decode an `encoder_output` sequence. Let's analyze the provided code snippet:


In this snippet:

- `VOCAB_SIZE`: Size of the vocabulary used for decoding.
- `EMBEDDING_DIM`: Dimension of the embedding space.
- `HIDDEN_UNITS`: Number of units in the decoder's GRU layer.
- `FRENCH_SEQUENCE_LENGTH`: Length of the French sequence (output sequence length).

The `Decoder` instance (`decoder`) is initialized with the specified parameters.

- **Inputs to `decoder`**:
  - `encoder_output`: Output from the encoder, representing the encoded input sequence (shape `[128, 8, 256]` based on your previous example).
  - `tf.zeros([128, HIDDEN_UNITS])`: Initial hidden state for the GRU layer in the decoder. This is a zero tensor with shape `[128, HIDDEN_UNITS]`.
  - `tf.zeros([128, 64])`: Shifted target tensor for the decoder. This is a zero tensor representing the shifted target sequence, with shape `[128, 64]`.

- **Calling the `Decoder` instance**:
  - `outputs, attention_weights = decoder(encoder_output, tf.zeros([128, HIDDEN_UNITS]), tf.zeros([128, 64]))`: Calls the `decoder` instance with the `encoder_output`, initial hidden state, and shifted target.
  - The `Decoder` processes these inputs to produce `outputs` (logits representing the probability distribution over the vocabulary for each timestep) and `attention_weights` (attention weights computed during decoding).

- **Output shapes**:
  - `outputs.shape`: Shape of the decoder outputs, representing `[batch_size, sequence_length, vocab_size]` (e.g., `[128, 64, VOCAB_SIZE]` based on your setup).
  - `attention_weights.shape`: Shape of the attention weights computed during decoding, representing `[batch_size, sequence_length, 1]` (e.g., `[128, 64, 1]`).

This usage demonstrates how to use the `Decoder` class within a sequence-to-sequence model, where the decoder takes encoded inputs from the encoder along with initial states and target sequences to produce output logits and attention weights for each timestep in the output sequence.

In [37]:
### ENCODER
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
encoder=Encoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS)
encoder_output=encoder(input)

### DECODER
shifted_target=Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decoder=Decoder(VOCAB_SIZE,EMBEDDING_DIM,HIDDEN_UNITS,FRENCH_SEQUENCE_LENGTH)
decoder_output,attention_weightss=decoder(encoder_output,tf.zeros([1,HIDDEN_UNITS]),shifted_target)

### OUTPUT
bahdanau=Model([input,shifted_target],decoder_output)
bahdanau.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 encoder_1 (Encoder)         (None, 64, 256)              5645312   ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 decoder_1 (Decoder)         ((None, 64, 20000),          1078659   ['encoder_1[0][0]',           
                              (None, 64, 1))              3          'input_2[0][0]']         

The code snippet provided sets up an encoder-decoder model with Bahdanau attention. Let's break down what each part does:

1. **Encoder Setup**:
   - **Input Layer**: Define an input layer (`input`) for English sequences with a specified sequence length (`ENGLISH_SEQUENCE_LENGTH`).
   - **Encoder Instance**: Initialize an instance of the `Encoder` class (`encoder`) with parameters `VOCAB_SIZE`, `EMBEDDING_DIM`, and `HIDDEN_UNITS`.
   - **Encoder Output**: Pass the input through the encoder (`encoder(input)`) to obtain the encoder output (`encoder_output`).

```python
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(input)
```

2. **Decoder Setup**:
   - **Shifted Target Input**: Define an input layer (`shifted_target`) for shifted French sequences with a specified sequence length (`FRENCH_SEQUENCE_LENGTH`).
   - **Decoder Instance**: Initialize an instance of the `Decoder` class (`decoder`) with parameters `VOCAB_SIZE`, `EMBEDDING_DIM`, `HIDDEN_UNITS`, and `FRENCH_SEQUENCE_LENGTH`.
   - **Decoder Output and Attention Weights**: Pass the encoder output, initial hidden state (`tf.zeros([1, HIDDEN_UNITS])`), and shifted target sequences to the decoder (`decoder(encoder_output, tf.zeros([1, HIDDEN_UNITS]), shifted_target)`) to obtain the decoder output (`decoder_output`) and attention weights (`attention_weights`).

```python
shifted_target = Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
decoder_output, attention_weights = decoder(encoder_output, tf.zeros([1, HIDDEN_UNITS]), shifted_target)
```

3. **Model Construction**:
   - **Bahdanau Attention Model**: Define a `Model` (`bahdanau`) that takes both the input (`input`) and shifted target (`shifted_target`) and outputs the decoder output (`decoder_output`).
   - **Model Summary**: Display the summary of the `bahdanau` model.

```python
bahdanau = Model([input, shifted_target], decoder_output)
bahdanau.summary()
```

In summary, this code sets up an encoder-decoder model with Bahdanau attention. The encoder processes English sequences, and the decoder uses Bahdanau attention to generate French sequences based on the encoded representations from the encoder. The `bahdanau` model represents the complete sequence-to-sequence architecture.

In [38]:
class BLEU(tf.keras.metrics.Metric):
    def __init__(self,name='bleu_score'):
        super(BLEU,self).__init__()
        self.bleu_score=0

    def update_state(self,y_true,y_pred,sample_weight=None):
      y_pred=tf.argmax(y_pred,-1)
      self.bleu_score=0
      for i,j in zip(y_pred,y_true):
        tf.autograph.experimental.set_loop_options()

        total_words=tf.math.count_nonzero(i)
        total_matches=0
        for word in i:
          if word==0:
            break
          for q in range(len(j)):
            if j[q]==0:
              break
            if word==j[q]:
              total_matches+=1
              j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
              break

        self.bleu_score+=total_matches/total_words

    def result(self):
        return self.bleu_score/BATCH_SIZE

In [39]:
bahdanau.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-4),
    metrics=[BLEU()],
    run_eagerly=True)

In [40]:
checkpoint_filepath = '/content/drive/MyDrive/NLP Repository/Projects/Neural_machine_translation/bahdanau_attention_1.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [None]:
history=bahdanau.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,
    # )
    callbacks=[model_checkpoint_callback])

Epoch 1/30
    137/Unknown - 2088s 15s/step - loss: 0.5509 - bleu: nan

KeyboardInterrupt: 

In [41]:
history=bahdanau.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,)
    #callbacks=[model_checkpoint_callback])

Epoch 1/30


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: not enough values to unpack (expected 2, got 0)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: not enough values to unpack (expected 2, got 0)
    391/Unknown - 6077s 15s/step - loss: 1.2041 - bleu: nan

KeyboardInterrupt: 

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
bahdanau.evaluate(val_dataset)

#Testing

In [None]:
index_to_word={x:y for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

In [None]:
def translator(english_sentence):
  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  shifted_target='starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):
    tokenized_shifted_target=french_vectorize_layer([shifted_target])
    output=bahdanau.predict([tokenized_english_sentence,tokenized_shifted_target])
    french_word_index=tf.argmax(output,axis=-1)[0][i].numpy()
    current_word=index_to_word[french_word_index]
    if current_word=='endtoken':
      break
    shifted_target+=' '+current_word
  return shifted_target[11:]

In [None]:
translator('What makes you think that is not true?')

In [None]:
translator('Have you ever watched a soccer under the rain?')

In [None]:
translator('Great trees do not grow with ease, the stronger the winds, the stronger the trees')

In [None]:
translator('Everyone should water his or her tomato plants')

In [None]:
word_to_index={y:x for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}