#Import statements

In [None]:
!pip install --upgrade tensorflow

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,LayerNormalization,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input,MultiHeadAttention,Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

#Dataset Preparation

##Download the data

In [None]:
!wget https://www.manythings.org/anki/fra-eng.zip

In [None]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

##Preprocessing of the data

In [None]:
# Hyperparameters shared by the preprocessing and modeling cells.
VOCAB_SIZE = 20000              # max_tokens for both TextVectorization layers
ENGLISH_SEQUENCE_LENGTH = 32    # padded/truncated length of English token sequences
FRENCH_SEQUENCE_LENGTH = 32     # padded/truncated length of French token sequences
EMBEDDING_DIM = 256             # embedding width (presumably used by later model cells)
BATCH_SIZE = 128                # examples per batch in the tf.data pipeline

In [None]:
# Tokenizer for the English (source) side: lower-cases, strips punctuation,
# and emits fixed-length integer id sequences.
english_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH,
    output_mode='int',
    standardize='lower_and_strip_punctuation',
)

In [None]:
# Tokenizer for the French (target) side; same settings as the English one
# apart from the output length constant.
french_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=FRENCH_SEQUENCE_LENGTH,
    output_mode='int',
    standardize='lower_and_strip_punctuation',
)

In [None]:
def selector(input_text):
  """Turn one tab-separated line into (model inputs, target) for teacher forcing.

  Returns a tuple whose first item is a dict with the English text under
  'input_1' and the French text prefixed with 'starttoken ' under 'input_2',
  and whose second item is the French text suffixed with ' endtoken'.
  Slices (rather than scalar indexing) are used, so each piece keeps a
  leading dimension of size 1.
  """
  fields = tf.strings.split(input_text, '\t')
  english = fields[0:1]
  french = fields[1:2]
  model_inputs = {'input_1': english, 'input_2': 'starttoken ' + french}
  target = french + ' endtoken'
  return model_inputs, target

In [None]:
# `text_dataset` is not created anywhere else in this notebook, so build it
# here from the file extracted by the unzip cell (one tab-separated
# English/French pair per line).
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")
# Map every raw line to ({'input_1', 'input_2'}, target) string tensors.
split_dataset=text_dataset.map(selector)

In [None]:
def separator(input_text):
  """Split one tab-separated line into (english, french) text for vocabulary fitting.

  The French text is wrapped as 'starttoken ... endtoken' so both marker
  words are present in the data the French vectorizer is adapted on.
  """
  fields = tf.strings.split(input_text, '\t')
  english = fields[0:1]
  french = 'starttoken ' + fields[1:2] + ' endtoken'
  return english, french

In [None]:
# (english, french-with-markers) pairs, used only to adapt the vectorizers.
init_dataset = text_dataset.map(separator)

In [None]:
# Print the first three raw (inputs, target) string examples as a sanity check.
for i in split_dataset.take(3):
  print(i)

In [None]:
# Keep only the English half of each pair and fit the English vocabulary on it.
english_training_data = init_dataset.map(lambda english, french: english)
english_vectorize_layer.adapt(english_training_data)

In [None]:
# Keep only the French half of each pair and fit the French vocabulary on it.
french_training_data = init_dataset.map(lambda english, french: french)
french_vectorize_layer.adapt(french_training_data)

In [None]:
def vectorizer(inputs,output):
  """Convert the string tensors produced by `selector` into integer token ids.

  'input_1' goes through the English vectorizer; 'input_2' and the target
  both go through the French vectorizer. The dict keys are preserved.
  """
  encoder_tokens = english_vectorize_layer(inputs['input_1'])
  decoder_tokens = french_vectorize_layer(inputs['input_2'])
  target_tokens = french_vectorize_layer(output)
  return {'input_1': encoder_tokens, 'input_2': decoder_tokens}, target_tokens

In [None]:
# Notebook display of the string-level dataset object.
split_dataset

In [None]:
# Tokenize every example: strings -> fixed-length integer id sequences.
dataset = split_dataset.map(vectorizer)

In [None]:
# Print three raw string examples again, for comparison with the tokenized data below.
for i in split_dataset.take(3):
  print(i)

In [None]:
# Print one tokenized (inputs, target) example.
for i in dataset.take(1):
  print(i)

In [None]:
# Notebook display of the tokenized dataset object, before batching.
dataset

In [None]:
# Shuffle, drop the size-1 leading dimension left by the slicing in `selector`,
# then batch and prefetch.
# reshuffle_each_iteration=False pins the shuffled order: the train/validation
# split below is done with take()/skip(), and with the default per-epoch
# reshuffle, examples would migrate between the two splits on every pass,
# leaking training data into validation.
dataset=(
    dataset
    .shuffle(2048,reshuffle_each_iteration=False)
    .unbatch()
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Notebook display of the dataset object after batching.
dataset

In [None]:
# Number of full batches, based on a hard-coded example count of 200000
# (presumably an approximation of the corpus size; verify against the data).
NUM_BATCHES = 200000 // BATCH_SIZE

In [None]:
# 90/10 split by batch count: the first 90% of batches train, the rest validate.
num_train_batches = int(0.9 * NUM_BATCHES)
train_dataset = dataset.take(num_train_batches)
val_dataset = dataset.skip(num_train_batches)

In [None]:
# Notebook display of the training split.
train_dataset

#Modeling

##Embedding

In [None]:
def positional_encoding(model_size,SEQUENCE_LENGTH):
  """Build the fixed sinusoidal positional-encoding table.

  For position `pos` and feature index `i`:
    even i -> sin(pos / 10000 ** (i / model_size))
    odd  i -> cos(pos / 10000 ** ((i - 1) / model_size))

  Args:
    model_size: number of features per position (embedding width).
    SEQUENCE_LENGTH: number of positions to encode.

  Returns:
    A float32 tensor of shape (1, SEQUENCE_LENGTH, model_size); the leading
    axis of size 1 lets it broadcast over a batch of embeddings.
  """
  # Column vector of positions, so the division below broadcasts to
  # (SEQUENCE_LENGTH, model_size) in one vectorized step.
  positions = np.arange(SEQUENCE_LENGTH, dtype=np.float64)[:, np.newaxis]
  dims = np.arange(model_size)
  # Each (even, odd) pair of features shares the exponent of its even index.
  exponents = (dims - dims % 2) / model_size
  angles = positions / np.power(10000.0, exponents)
  table = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

Let's explain the purpose and implementation of the positional encoding function step by step.

**Purpose of Positional Encoding:**

In sequence processing tasks like natural language processing (NLP) or machine translation, Transformer-based models don't have inherent notions of word order because they process tokens in parallel. To provide positional information to the model, positional encoding is added to the input embeddings. This helps the model differentiate between tokens based on their position in the sequence.

**Implementation of Positional Encoding:**

1. **Initialization and Setup:**
   ```python
   import tensorflow as tf
   import numpy as np
   ```

2. **Function Definition:**
   ```python
   def positional_encoding(model_size, sequence_length):
       position = np.arange(sequence_length)
       model_dims = np.arange(model_size)
       position_enc = np.zeros((sequence_length, model_size))
   ```

   Here:
   - `position`: Array of integers from `0` to `sequence_length - 1`, representing the positions in the sequence.
   - `model_dims`: Array of integers from `0` to `model_size - 1`, representing the dimensions of the positional encoding.
   - `position_enc`: Initialized as a matrix of zeros with shape `(sequence_length, model_size)`.

3. **Calculating Positional Encodings:**
   ```python
       for pos in position:
           for i in model_dims:
               if i % 2 == 0:
                   position_enc[pos, i] = np.sin(pos / (10000 ** (i / model_size)))
               else:
                   position_enc[pos, i] = np.cos(pos / (10000 ** ((i - 1) / model_size)))
   ```

   This nested loop computes the sine and cosine values for each position (`pos`) and dimension (`i`) in the positional encoding matrix using the formula:
   - For even indices (`i % 2 == 0`): \( \text{positional\_enc}[pos, i] = \sin(\text{pos} / 10000^{(i / \text{model\_size})}) \)
   - For odd indices (`i % 2 != 0`): \( \text{positional\_enc}[pos, i] = \cos(\text{pos} / 10000^{((i - 1) / \text{model\_size})}) \)

   The use of sine and cosine functions with varying frequencies ensures that each dimension of the positional encoding captures a unique position-related pattern.

4. **Tensor Conversion and Reshaping:**
   ```python
       position_enc = tf.convert_to_tensor(position_enc, dtype=tf.float32)
       position_enc = tf.expand_dims(position_enc, axis=0)
       return position_enc
   ```

   - `tf.convert_to_tensor`: Converts the positional encoding matrix (`position_enc`) from NumPy array to a TensorFlow tensor.
   - `tf.expand_dims`: Adds a batch dimension (`axis=0`) to the positional encoding tensor. This is typically required to align the shape with the input data expected by Transformer-based models.

**Usage:**

Once the positional encoding function is defined, you can use it to generate positional encodings for sequences of a specific length (`sequence_length`) and model dimension (`model_size`). The resulting positional encodings can then be added to input embeddings before feeding them into the Transformer model.

For example:
```python
model_size = 128
sequence_length = 10

pos_encodings = positional_encoding(model_size, sequence_length)
print(pos_encodings.shape)  # Output: (1, sequence_length, model_size)
```

This `pos_encodings` tensor can be concatenated or added to input embeddings in your Transformer-based model to incorporate positional information into the model's input representation, enabling it to learn dependencies based on the order of tokens in the sequence.

In [None]:
# Sanity check: expect a (1, 64, 256) table.
print(positional_encoding(model_size=256, SEQUENCE_LENGTH=64).shape)

The shape of the positional encodings generated by `positional_encoding(256, 64)` would be `(1, 64, 256)`.

Here's why:

- The `positional_encoding` function generates positional encodings for a sequence length of `64` with each encoding having a dimensionality of `256`.
- Inside the function, the encoding vectors for all positions are stacked along the sequence axis into a `(64, 256)` matrix, and a leading batch dimension of size `1` is then added.
- Therefore, the resulting shape of the tensor is `(1, 64, 256)`:

  - `1` corresponds to the batch dimension.
  - `64` corresponds to the sequence length.
  - `256` corresponds to the dimensionality of each positional encoding vector.

This shape represents a batch of positional encodings for a sequence of length `64`, each encoded with a vector of size `256`. Each position in the sequence has a unique positional encoding represented by the `256`-dimensional vector.

In [None]:
class Embeddings(Layer):
  """Token embeddings summed with fixed sinusoidal positional encodings.

  Args:
    sequence_length: number of positions passed to `positional_encoding`;
      inputs must be broadcast-compatible with this length.
    vocab_size: `input_dim` of the underlying `Embedding` layer.
    embed_dim: width of each embedding vector.
  """

  def __init__(self, sequence_length, vocab_size, embed_dim,):
    super().__init__()
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    self.token_embeddings = Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
    )

  def call(self, inputs):
    """Return the token embeddings of `inputs` plus the positional table."""
    positions = positional_encoding(self.embed_dim, self.sequence_length)
    tokens = self.token_embeddings(inputs)
    return tokens + positions

  def compute_mask(self, inputs, mask=None):
    """Boolean mask that is True wherever the token id is non-zero (id 0 = padding)."""
    return tf.math.not_equal(inputs, 0)

The `Embeddings` class you've defined is used to create embeddings for tokens in a sequence along with positional encodings. Here's an explanation of the key components:

- **Initialization (`__init__`)**:
  - `sequence_length`: Length of the input sequence.
  - `vocab_size`: Size of the vocabulary (number of unique tokens).
  - `embed_dim`: Dimensionality of the token embeddings.

  In the `__init__` method:
  - `token_embeddings`: This is an `Embedding` layer that maps each token (represented as an integer index) to a dense vector of size `embed_dim`.
  - `sequence_length`, `vocab_size`, and `embed_dim` are stored as attributes of the class.

- **Call (`call`)**:
  The `call` method is used to perform the forward pass of the layer.
  - `inputs`: Represents the input sequence (a tensor of shape `[batch_size, sequence_length]` containing integer token indices).
  - `embedded_tokens`: Applies the `token_embeddings` layer to the input `inputs`, resulting in a tensor of shape `[batch_size, sequence_length, embed_dim]` where each token is replaced by its corresponding dense embedding vector.
  - `embedded_positions`: Calls the `positional_encoding` function to generate positional encodings for the sequence with the specified `embed_dim` and `sequence_length`.
  - The final result returned by `call` is the sum of `embedded_tokens` and `embedded_positions`, effectively combining token embeddings with their respective positional encodings.

- **Compute Mask (`compute_mask`)**:
  The `compute_mask` method is used to generate a mask tensor based on the input `inputs`.
  - It returns a mask tensor that is `True` for positions where the input tokens are non-zero (indicating valid tokens) and `False` for positions where the input tokens are zero (indicating padding).
  - This mask is typically used in subsequent layers (e.g., in attention mechanisms or masking layers) to ignore padded positions during computation.

Overall, this `Embeddings` class encapsulates the process of generating token embeddings along with positional encodings, which is a common technique used in sequence-based models like transformers for natural language processing tasks. The positional encodings help the model distinguish between tokens based on their position in the sequence, providing important positional information to the model.

In [None]:
# Smoke test: one sequence of 8 token ids, the last two being padding (id 0).
test_input = tf.constant([[2, 4, 7, 21, 3, 5, 0, 0]])
emb = Embeddings(sequence_length=8, vocab_size=20000, embed_dim=512)
emb_out = emb(test_input)
print(emb_out.shape)

The `Embeddings` layer defined takes an input tensor of token indices and produces embeddings enhanced with positional encodings. Let's break down how your code works:

1. **Initialization**:
   - `Embeddings` is initialized with:
     - `sequence_length`: `8`, which represents the length of the input sequence.
     - `vocab_size`: `20000`, indicating the size of the vocabulary (number of unique tokens).
     - `embed_dim`: `512`, specifying the dimensionality of the embedding vectors.

2. **Call (`call` method)**:
   - When you pass `test_input` (`[[2,4,7,21,3,5,0,0]]`) through the `Embeddings` layer (`emb`), the `call` method is invoked.
   - `token_embeddings` (`self.token_embeddings`) is an `Embedding` layer initialized with `vocab_size=20000` and `output_dim=512`. It converts each token index in `test_input` to its corresponding dense embedding vector.
   - `embedded_positions` is generated using the `positional_encoding` function, which creates positional encodings based on the `embed_dim` (512) and `sequence_length` (8) parameters.
   - The `embedded_tokens` tensor obtained from the `token_embeddings` layer has shape `[1, 8, 512]` (batch size of 1, sequence length of 8, and embedding dimension of 512).
   - The `embedded_positions` tensor also has shape `[1, 8, 512]`.
   - Finally, the `call` method returns `embedded_tokens + embedded_positions`, resulting in a tensor `emb_out` of shape `[1, 8, 512]`.

3. **Output**:
   - `emb_out` represents the processed embeddings for the input sequence `[[2,4,7,21,3,5,0,0]]`, where each token is enhanced with positional encoding.
   - The shape of `emb_out` is `[1, 8, 512]`, indicating a batch size of 1, sequence length of 8, and embedding dimension of 512.

In summary, the `Embeddings` layer combines token embeddings (derived from the `Embedding` layer) with positional encodings (generated by the `positional_encoding` function) to produce enriched embeddings for input sequences, which can be used as inputs to various sequence-based models in natural language processing tasks.

In [None]:
# Boolean padding mask for the test sequence: True where the token id is non-zero.
mask = emb.compute_mask(test_input)
print(mask)

# Copy the single mask row once per query position and convert it to 0/1
# integers, giving a square (sequence_length x sequence_length) attention mask.
padding_mask = tf.repeat(mask, repeats=tf.shape(mask)[1], axis=0)
padding_mask = tf.cast(padding_mask, dtype=tf.int32)
print(padding_mask)


1. **Computing Mask**:
   ```python
   mask = emb.compute_mask(test_input)
   print(mask)
   ```

   This code snippet computes a mask using the `compute_mask` method of the `Embeddings` instance `emb` with `test_input` as the input.

   In the `Embeddings` class, the `compute_mask` method is defined as follows:
   ```python
   def compute_mask(self, inputs, mask=None):
       return tf.math.not_equal(inputs, 0)
   ```

   - `inputs`: The input tensor containing token indices (`test_input`).
   - The `compute_mask` method creates a boolean mask where `True` indicates valid tokens (non-padding), and `False` indicates padding tokens (tokens with value `0`).

   Therefore, `mask` will be a boolean tensor with the same shape as `test_input` that indicates which tokens are not padding (`True`) and which are padding (`False`).

2. **Creating Padding Mask**:
   ```python
   padding_mask = tf.cast(
       tf.repeat(mask, repeats=tf.shape(mask)[1], axis=0),
       dtype=tf.int32)
   print(padding_mask)
   ```

   This code snippet creates a padding mask based on the computed `mask` using `tf.repeat`.

   - `tf.shape(mask)[1]` retrieves the sequence length from the `mask`.
   - `tf.repeat(mask, repeats=tf.shape(mask)[1], axis=0)` repeats the single row of `mask` once per sequence position along axis `0`, turning the `(1, 8)` mask into an `(8, 8)` matrix in which every row is a copy of the padding mask.
   - `tf.cast(..., dtype=tf.int32)` casts the boolean mask into integers (0s and 1s), where `1` represents valid tokens and `0` represents padding tokens.

   The resulting `padding_mask` tensor has shape `(sequence_length, sequence_length)` — here `(8, 8)` — rather than the shape of `test_input`: row `i` tells query position `i` which key positions are real tokens (`1`) and which are padding (`0`). Note that this construction assumes a batch size of 1, as in `test_input`.

The use of padding masks is common in sequence processing tasks, particularly in Transformer-based models, to mask out padding tokens during computation. This helps in handling variable-length sequences efficiently and accurately within the model.

In [None]:
# Lower-triangular (causal) mask: keep the diagonal and everything below it.
print(
    tf.linalg.band_part(
        tf.ones([1, 8, 8], dtype=tf.int32),
        num_lower=-1,
        num_upper=0,
    )
)

The code snippet provided uses `tf.linalg.band_part` to create a mask matrix that masks out (sets to zero) elements above the main diagonal of a square matrix. Let's break down the usage and purpose of this function call:

```python
print(tf.linalg.band_part(tf.ones([1, 8, 8], dtype=tf.int32), -1, 0))
```

Here's what each part of this function call does:

- `tf.ones([1, 8, 8], dtype=tf.int32)`: This creates a 3D tensor (matrix) filled with ones. The shape of this tensor is `[1, 8, 8]`, meaning it's a batch of one matrix where each matrix is 8x8 filled with ones. The `dtype=tf.int32` specifies the data type of the tensor as integers.

- `tf.linalg.band_part(...)`: This function is used to mask out elements of a matrix based on the specified upper and lower bands.

  - `tf.linalg.band_part(matrix, num_lower, num_upper)`:
    - `matrix`: The input matrix or tensor.
    - `num_lower`: The number of subdiagonals below the main diagonal to keep (inclusive of the diagonal). Here, `-1` means keep all subdiagonals below the main diagonal.
    - `num_upper`: The number of superdiagonals above the main diagonal to keep. Here, `0` means mask out all elements above the main diagonal.

In the provided code, the `tf.ones([1, 8, 8], dtype=tf.int32)` creates an 8x8 matrix filled with ones, and then `tf.linalg.band_part(..., -1, 0)` is applied to this matrix. This function call keeps all elements below the main diagonal (inclusive) and masks out all elements above the main diagonal.

The resulting output will be a 3D tensor (matrix) of shape `[1, 8, 8]` where:
- The main diagonal and all elements below it are set to `1` (not masked).
- All elements above the main diagonal are set to `0` (masked).

This type of matrix is commonly used as an attention mask in Transformer models, where it helps define which positions in the input sequence should be attended to during computation and which positions should be ignored (typically padding or future tokens in the case of self-attention).

The printed output will show the resulting masked matrix, visualizing how the `tf.linalg.band_part` function applies masking to the input matrix based on the specified `num_lower` and `num_upper` parameters.

##Custom MultiHeadAttention


In [None]:
class CustomSelfAttention(Layer):
  """Single-head scaled dot-product attention with a 0/1 mask.

  Args:
    model_size: value whose square root divides the raw attention scores.
  """

  def __init__(self,model_size):
    super().__init__()
    self.model_size=model_size

  def call(self,query,key,value,masking):
    """Attend from `query` over `key`/`value`.

    `masking` holds 1 for positions that may be attended to and 0 for
    positions to ignore; it must broadcast against the score matrix.
    """
    keep = tf.cast(masking, dtype=tf.float32)
    scale = tf.math.sqrt(tf.cast(self.model_size, tf.float32))

    # Scaled query-key similarity.
    logits = tf.matmul(query, key, transpose_b=True) / scale
    # Push masked positions towards -inf so softmax gives them ~0 weight...
    logits = logits + (1. - keep) * -1e10
    # ...then zero them out explicitly after normalization.
    weights = tf.nn.softmax(logits, axis=-1) * keep

    return tf.matmul(weights, value)

This `CustomSelfAttention` class is designed to implement a self-attention mechanism, commonly used in Transformer models for processing sequences. Let's break down the functionality of this class and how it computes self-attention.

### Class Initialization:
```python
class CustomSelfAttention(Layer):
    def __init__(self, model_size):
        super(CustomSelfAttention, self).__init__()
        self.model_size = model_size
```
- `model_size`: The dimensionality used for scaling. The layer does not project or resize the query, key, and value tensors; it only divides the attention scores by `sqrt(model_size)`.

### `call` Method:
```python
def call(self, query, key, value, masking):
    # Compute scores
    score = tf.matmul(query, key, transpose_b=True)

    # Scaling
    score /= tf.math.sqrt(tf.cast(self.model_size, tf.float32))

    # Masking
    masking = tf.cast(masking, dtype=tf.float32)
    score += (1. - masking) * -1e10

    # Attention weights
    attention = tf.nn.softmax(score, axis=-1) * masking

    # Output
    head = tf.matmul(attention, value)
    return head
```

### Breakdown of `call` Method:
1. **Compute Scores**:
   ```python
   score = tf.matmul(query, key, transpose_b=True)
   ```
   - Computes the dot product of `query` and `key` matrices. The result is a tensor of shape `(batch_size, num_queries, num_keys)`.

2. **Scaling**:
   ```python
   score /= tf.math.sqrt(tf.cast(self.model_size, tf.float32))
   ```
   - Scales the `score` matrix by dividing by the square root of `self.model_size`. This scaling helps stabilize the gradients during training.

3. **Masking**:
   ```python
   masking = tf.cast(masking, dtype=tf.float32)
   score += (1. - masking) * -1e10
   ```
   - `masking` is a binary mask where `0` indicates positions to be masked (ignored) and `1` indicates valid positions.
   - Adds a large negative value (`-1e10`) to the positions indicated by `masking` that should be masked out (ignored) during softmax computation. This effectively sets the attention scores for masked positions to `-inf` (approaching zero probability after softmax).

4. **Attention Weights**:
   ```python
   attention = tf.nn.softmax(score, axis=-1) * masking
   ```
   - Computes the attention weights by applying softmax along the last axis (`-1`) of the `score` matrix. The softmax operation converts scores into probabilities while respecting the mask (`masking`) to zero out masked positions.

5. **Output**:
   ```python
   head = tf.matmul(attention, value)
   ```
   - Computes the weighted sum of `value` vectors using the computed `attention` weights (`softmax scores`), resulting in the `head` tensor.

### Input Arguments:
- `query`: The query tensor representing the queries for the self-attention mechanism.
- `key`: The key tensor representing the keys for the self-attention mechanism.
- `value`: The value tensor representing the values for the self-attention mechanism.
- `masking`: A binary mask tensor indicating which positions in the sequence should be masked (ignored).

### Output:
- `head`: The output tensor after applying the self-attention mechanism, representing the attended values based on the input queries, keys, values, and mask.

This `CustomSelfAttention` class encapsulates the key components of a self-attention layer, providing a customizable and reusable implementation suitable for integration into larger Transformer architectures for sequence processing tasks.

In [None]:
# Smoke test: all-ones query/key/value with the padding mask built earlier.
attention = CustomSelfAttention(256)
attention(
    tf.ones([1, 8, 256]),
    tf.ones([1, 8, 256]),
    tf.ones([1, 8, 256]),
    padding_mask,
)

To use the `CustomSelfAttention` layer we've defined and apply it with example input tensors, let's walk through how to create an instance of `CustomSelfAttention` and pass input tensors (`query`, `key`, `value`) along with a padding mask (`padding_mask`) to compute the self-attention mechanism.

Given the `CustomSelfAttention` class definition and an example call:

```python
attention = CustomSelfAttention(256)
query = tf.ones([1, 8, 256])
key = tf.ones([1, 8, 256])
value = tf.ones([1, 8, 256])
padding_mask = padding_mask  # Assuming you have defined `padding_mask` appropriately

result = attention(query, key, value, padding_mask)
print(result.shape)
```

Here's what happens step by step:

1. **Create `CustomSelfAttention` Instance**:
   ```python
   attention = CustomSelfAttention(256)
   ```
   - Initialize an instance of `CustomSelfAttention` with `model_size = 256`.

2. **Define Input Tensors**:
   ```python
   query = tf.ones([1, 8, 256])   # Shape: [batch_size=1, num_queries=8, model_size=256]
   key = tf.ones([1, 8, 256])     # Shape: [batch_size=1, num_keys=8, model_size=256]
   value = tf.ones([1, 8, 256])   # Shape: [batch_size=1, num_values=8, model_size=256]
   ```
   - Create example input tensors (`query`, `key`, `value`) each with a batch size of `1`, 8 queries/keys/values, and a model size of `256`.

3. **Compute Self-Attention**:
   ```python
   result = attention(query, key, value, padding_mask)
   ```
   - Pass the input tensors (`query`, `key`, `value`) and the padding mask (`padding_mask`) to the `attention` layer.
   - Inside the `call` method of `CustomSelfAttention`:
     - Compute attention scores (`score`) using matrix multiplication between `query` and `key`.
     - Scale the scores by dividing by the square root of `model_size`.
     - Apply masking by adding a large negative value to masked positions (`padding_mask`) to ignore them during softmax computation.
     - Compute attention weights (`attention`) using softmax over the masked scores.
     - Calculate the attended values (`head`) by applying the attention weights to `value` through matrix multiplication.
   - `result` will be the output of the self-attention mechanism based on the input tensors and padding mask.

4. **Print Result Shape**:
   ```python
   print(result.shape)
   ```
   - This prints the shape of `result`, which represents the output of the self-attention mechanism.
   - The shape will be `[1, 8, 256]`, indicating a batch size of `1`, `8` attended values (corresponding to `num_queries`), and a model size of `256`.

In [None]:
class CustomMultiHeadAttention(Layer):
  """Multi-head attention built from per-head Dense projections.

  Args:
    num_heads: number of attention heads.
    key_dim: total model width; each head projects query/key/value to
      `key_dim // num_heads` features and the concatenated heads are
      projected back to `key_dim`.
  """

  def __init__(self,num_heads,key_dim):
    super(CustomMultiHeadAttention,self).__init__()

    self.num_heads=num_heads
    self.dense_q=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_k=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_v=[Dense(key_dim//num_heads) for _ in range(num_heads)]
    self.dense_o=Dense(key_dim)
    # NOTE(review): scores are scaled by sqrt(key_dim), not by the per-head
    # width sqrt(key_dim // num_heads); kept as-is to preserve behavior.
    self.self_attention=CustomSelfAttention(key_dim)

  def call(self,query,key,value,attention_mask):
    """Run every head, concatenate along the feature axis, and project.

    `attention_mask` is passed unchanged to each head (1 = attend, 0 = ignore).
    """
    # One attention pass per head, each with its own q/k/v projections.
    # (Debug prints removed: they ran an extra projection per head and
    # stacked all heads on every call just to show a shape.)
    heads=[
        self.self_attention(dense_q(query),dense_k(key),dense_v(value),
                            attention_mask)
        for dense_q,dense_k,dense_v
        in zip(self.dense_q,self.dense_k,self.dense_v)
    ]
    heads=tf.concat(heads,axis=2)
    return self.dense_o(heads)

The `CustomMultiHeadAttention` class defined is designed to implement the multi-head attention mechanism using a set of dense layers for queries, keys, and values. Let's break down the functionality of this class and how it applies multi-head attention.

### Class Initialization:
```python
class CustomMultiHeadAttention(Layer):
    def __init__(self, num_heads, key_dim):
        super(CustomMultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.dense_q = [Dense(key_dim // num_heads) for _ in range(num_heads)]
        self.dense_k = [Dense(key_dim // num_heads) for _ in range(num_heads)]
        self.dense_v = [Dense(key_dim // num_heads) for _ in range(num_heads)]
        self.dense_o = Dense(key_dim)
        self.self_attention = CustomSelfAttention(key_dim)
```
- `num_heads`: Number of attention heads.
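- `key_dim`: Total dimensionality of the attention layer. Each head projects queries, keys, and values to `key_dim // num_heads` features, and the final output has `key_dim` features.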

In the `__init__` method:
- `self.dense_q`, `self.dense_k`, and `self.dense_v` are lists of `Dense` layers used to project the query, key, and value vectors into `key_dim // num_heads` dimensions for each attention head.
- `self.dense_o` is a `Dense` layer used to project concatenated multi-head attention outputs back to the original `key_dim`.
- `self.self_attention` is an instance of `CustomSelfAttention` with `key_dim` as the model size.

### `call` Method:
```python
def call(self, query, key, value, attention_mask):
    heads = []

    for i in range(self.num_heads):
        q = self.dense_q[i](query)
        k = self.dense_k[i](key)
        v = self.dense_v[i](value)
        head = self.self_attention(q, k, v, attention_mask)
        heads.append(head)

    heads = tf.concat(heads, axis=2)
    heads = self.dense_o(heads)
    return heads
```
- `query`, `key`, `value`: Input tensors representing queries, keys, and values.
- `attention_mask`: Mask tensor used to mask out padding tokens during attention computation.

In the `call` method:
1. **Loop Over Attention Heads**:
   - Iterate over each attention head (`num_heads` times).
   - For each head `i`, project `query`, `key`, and `value` using `dense_q[i]`, `dense_k[i]`, and `dense_v[i]` respectively to reduce dimensionality (`key_dim // num_heads`).

2. **Apply Self-Attention**:
   - Pass the projected `query`, `key`, and `value` tensors along with the `attention_mask` to the `self_attention` layer (an instance of `CustomSelfAttention`).

3. **Concatenate Attention Heads**:
   - Concatenate the output heads along the last dimension (`axis=2`), resulting in a tensor of shape `(batch_size, num_queries, key_dim)`.

4. **Output Projection**:
   - Project the concatenated heads back to the original `key_dim` using the `dense_o` layer.

5. **Return Output**:
   - Return the final output tensor after applying multi-head attention and projection.

### Usage Example:
```python
# Create an instance of CustomMultiHeadAttention
multihead_attention = CustomMultiHeadAttention(num_heads=8, key_dim=512)

# Example usage with query, key, value, and attention_mask tensors
query = tf.ones([1, 8, 512])   # Shape: [batch_size=1, num_queries=8, key_dim=512]
key = tf.ones([1, 8, 512])     # Shape: [batch_size=1, num_keys=8, key_dim=512]
value = tf.ones([1, 8, 512])   # Shape: [batch_size=1, num_values=8, key_dim=512]
attention_mask = padding_mask  # Assuming you have defined `padding_mask`

# Apply multi-head attention
result = multihead_attention(query, key, value, attention_mask)
print(result.shape)  # Output shape: (1, 8, 512)
```
- `query`, `key`, and `value` are input tensors with shape `(batch_size=1, num_queries=8, key_dim=512)`.
- `attention_mask` is a mask tensor used to mask out padding tokens during attention computation.
- `result` will be the output tensor after applying multi-head attention and projection, with shape `(1, 8, 512)`.

##Encoder

In [None]:
class TransformerEncoder(Layer):
    """One Transformer encoder block: self-attention and a feed-forward
    network, each followed by a residual connection and layer normalization.

    Args:
        embed_dim: width of the inputs and of the block's output.
        dense_dim: hidden width of the feed-forward network.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads,):
        super(TransformerEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = CustomMultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim,
        )
        self.dense_proj=tf.keras.Sequential(
            [Dense(dense_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply the encoder block.

        Args:
            inputs: embeddings, indexed here as (batch, sequence, features).
            mask: optional (batch, sequence) padding mask, truthy for real
                tokens. When omitted, every position may be attended to.
        """
        if mask is not None:
            # (batch, seq) -> (batch, 1, seq), then repeat per query position
            # to get a (batch, seq, seq) attention mask.
            mask = tf.cast(
                mask[:,tf.newaxis, :], dtype="int32")
            T = tf.shape(mask)[2]
            padding_mask = tf.repeat(mask,T,axis=1)
        else:
            # The attention layer always needs a mask; without one,
            # `padding_mask` used to be unbound here (or silently resolved
            # to an unrelated notebook global). Use an all-ones mask instead.
            input_shape = tf.shape(inputs)
            padding_mask = tf.ones(
                (input_shape[0], input_shape[1], input_shape[1]),
                dtype="int32")
        attention_output = self.attention(
            query=inputs, key=inputs,value=inputs,
            attention_mask=padding_mask
        )

        # Residual + norm around attention, then around the feed-forward net.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

The `TransformerEncoder` class defined is a part of the Transformer model architecture and represents a single layer of the Transformer encoder. Let's break down the functionality of this class and how it processes input sequences through attention mechanisms and feed-forward networks.

### Class Initialization:
```python
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads):
        super(TransformerEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # Multi-head self-attention mechanism
        self.attention = CustomMultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Feed-forward network
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])

        # Layer normalization
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()

        # Set masking support
        self.supports_masking = True
```
- `embed_dim`: The dimensionality of the input embeddings and attention mechanisms.
- `dense_dim`: The dimensionality of the intermediate dense layer in the feed-forward network.
- `num_heads`: The number of attention heads used in the multi-head attention mechanism.

In the `__init__` method:
- `self.attention`: Initializes a `CustomMultiHeadAttention` layer with the specified `num_heads` and `key_dim` (equal to `embed_dim`).
- `self.dense_proj`: Defines a sequential feed-forward network with two dense layers: the first layer (`Dense(dense_dim, activation="relu")`) applies a ReLU activation function, and the second layer (`Dense(embed_dim)`) projects the output back to the original embedding dimension.
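- `self.layernorm_1` and `self.layernorm_2`: Layer normalization layers applied after each residual connection — `layernorm_1` to the sum of the inputs and the attention output, and `layernorm_2` to the sum of that result and the feed-forward output.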

### `call` Method:
```python
def call(self, inputs, mask=None):
    if mask is not None:
        mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        T = tf.shape(mask)[2]
        padding_mask = tf.repeat(mask, T, axis=1)

    # Apply multi-head self-attention
    attention_output = self.attention(
        query=inputs, key=inputs, value=inputs, attention_mask=padding_mask
    )

    # Residual connection and layer normalization
    proj_input = self.layernorm_1(inputs + attention_output)

    # Apply feed-forward network
    proj_output = self.dense_proj(proj_input)

    # Residual connection and layer normalization
    return self.layernorm_2(proj_input + proj_output)
```
- `inputs`: The input tensor representing the sequence of embeddings.
- `mask`: An optional mask tensor indicating which positions in the input sequence should be ignored during computation.

In the `call` method:
1. **Masking Preparation**:
   - If `mask` is provided, convert it to the appropriate format (`padding_mask`) to use with the attention mechanism.

2. **Apply Multi-Head Self-Attention**:
   - Use the `CustomMultiHeadAttention` layer (`self.attention`) to compute attention over the input sequence (`query=inputs`, `key=inputs`, `value=inputs`) with the provided `attention_mask`.

3. **Residual Connection and Layer Normalization (1st)**:
   - Add the attention output (`attention_output`) to the input (`inputs`) to form a residual connection.
   - Normalize the result using `self.layernorm_1`.

4. **Apply Feed-Forward Network**:
   - Pass the normalized output (`proj_input`) through the feed-forward network (`self.dense_proj`) to compute the intermediate projection (`proj_output`).

5. **Residual Connection and Layer Normalization (2nd)**:
   - Add the feed-forward output (`proj_output`) to the normalized input (`proj_input`) to form another residual connection.
   - Normalize the final result using `self.layernorm_2` and return the output.

### Usage Example:
```python
# Create an instance of TransformerEncoder
encoder_layer = TransformerEncoder(embed_dim=512, dense_dim=2048, num_heads=8)

# Example usage with input tensor (batch_size=1, sequence_length=8, embed_dim=512)
inputs = tf.random.normal([1, 8, 512])
output = encoder_layer(inputs)
print(output.shape)  # Output shape: (1, 8, 512)
```
- `inputs`: Input tensor representing a sequence of embeddings with shape `(batch_size=1, sequence_length=8, embed_dim=512)`.
- `output`: The encoded output tensor after passing through the `TransformerEncoder` layer.
- The `TransformerEncoder` layer applies multi-head self-attention and feed-forward networks to process the input sequence and returns the encoded output.

In [None]:
# Smoke test: run `emb_out` (presumably the embedded batch produced in an
# earlier cell) through a freshly built encoder layer and print the shape.
encoder_outputs = TransformerEncoder(512,2048,8)(emb_out)
print(encoder_outputs.shape)


In the provided code snippet, you're instantiating a `TransformerEncoder` layer and applying it to some input tensor `emb_out` to obtain the encoder outputs.
### Code Snippet:
```python
encoder_outputs = TransformerEncoder(512, 2048, 8)(emb_out)
print(encoder_outputs.shape)
```

### Explanation:
1. **Instantiate `TransformerEncoder`**:
   ```python
   TransformerEncoder(512, 2048, 8)
   ```
   - Creates an instance of `TransformerEncoder` with the following parameters:
     - `embed_dim=512`: The dimensionality of the input embeddings and attention mechanisms.
     - `dense_dim=2048`: The dimensionality of the intermediate dense layer in the feed-forward network.
     - `num_heads=8`: The number of attention heads used in the multi-head attention mechanism.

2. **Apply `TransformerEncoder` to `emb_out`**:
   ```python
   TransformerEncoder(512, 2048, 8)(emb_out)
   ```
   - Calls the `TransformerEncoder` instance as a function with `emb_out` as the input tensor.
   - `emb_out` is assumed to be a tensor representing the output of an embedding layer, with shape `(batch_size, sequence_length, embed_dim)`.

3. **Compute Encoder Outputs**:
   - The `TransformerEncoder` layer processes the input `emb_out` through multi-head self-attention and feed-forward networks according to its defined `call` method.
   - The resulting `encoder_outputs` represent the encoded sequence after passing through the `TransformerEncoder` layer.

4. **Print Encoder Outputs Shape**:
   ```python
   print(encoder_outputs.shape)
   ```
   - Outputs the shape of `encoder_outputs`, which reflects the shape of the encoded sequence after processing through the `TransformerEncoder` layer.
   - The shape of `encoder_outputs` will typically be `(batch_size, sequence_length, embed_dim)`, where:
     - `batch_size` is the number of input sequences.
     - `sequence_length` is the length of each input sequence.
     - `embed_dim` is the dimensionality of the encoded representations.

### Usage Example:
```python
# Assuming emb_out is an input tensor with shape (batch_size, sequence_length, embed_dim)
import tensorflow as tf

# Define TransformerEncoder instance and apply to emb_out
encoder_outputs = TransformerEncoder(512, 2048, 8)(emb_out)

# Print the shape of the encoder outputs
print(encoder_outputs.shape)
```

### Output:
The printed `encoder_outputs.shape` will indicate the shape of the encoded outputs after passing through the `TransformerEncoder` layer. The specific shape will depend on the input `emb_out` and the parameters (`embed_dim`, `dense_dim`, `num_heads`) used to configure the `TransformerEncoder`.

 This example demonstrates how to apply the `TransformerEncoder` layer within a TensorFlow model to encode input sequences using Transformer-based architecture for tasks like natural language processing (NLP) or sequence modeling.

##Decoder

In [None]:
# Demo of the causal mask pattern: keep every sub-diagonal (num_lower=-1) and
# no super-diagonal (num_upper=0), i.e. a lower-triangular matrix of ones.
print(
    tf.linalg.band_part(
        tf.ones((1, 8, 8), dtype=tf.int32),
        num_lower=-1,
        num_upper=0,
    )
)

The `tf.linalg.band_part` function is used to create a matrix mask that masks out elements above the main diagonal of a square matrix. Let's break down the usage of this function and understand its output.

### Code Explanation:
```python
print(tf.linalg.band_part(tf.ones([1, 8, 8], dtype=tf.int32), -1, 0))
```

### Function Parameters:
- `tf.ones([1, 8, 8], dtype=tf.int32)`: Creates a 3D tensor (matrix) filled with ones.
  - Shape: `[1, 8, 8]` - Represents a batch of one matrix where each matrix is 8x8 and filled with ones.
  - `dtype=tf.int32`: Specifies the data type of the tensor as integers.

- `tf.linalg.band_part(matrix, num_lower, num_upper)`:
  - `matrix`: The input matrix or tensor.
  - `num_lower`: Number of subdiagonals to keep (including the main diagonal).
    - `-1`: Keep all subdiagonals below the main diagonal.
  - `num_upper`: Number of superdiagonals to keep (including the main diagonal).
    - `0`: Keep only the main diagonal and elements below it.

### Output Explanation:
The `tf.linalg.band_part` function masks out elements above the main diagonal based on the provided parameters (`num_lower` and `num_upper`).

- `tf.linalg.band_part(..., -1, 0)`: Zeroes out all elements strictly above the main diagonal of the input matrix; the main diagonal itself and everything below it are kept.

### Example Output:
If we consider a sample input tensor `tf.ones([1, 8, 8], dtype=tf.int32)`, the output of `tf.linalg.band_part(..., -1, 0)` will be a tensor where:
- All elements on and below the main diagonal are retained as ones (`1`).
- All elements above the main diagonal are set to zero (`0`), effectively creating a lower triangular matrix of ones.

### Example Usage:
```python
import tensorflow as tf

# Create a tensor filled with ones and apply band_part to mask out elements above the main diagonal
masked_matrix = tf.linalg.band_part(tf.ones([1, 8, 8], dtype=tf.int32), -1, 0)

# Print the resulting masked matrix
print(masked_matrix)
```

### Output:
The output of `print(masked_matrix)` will be a tensor representing the masked matrix where all elements above the main diagonal are zeros (`0`), and elements on and below the main diagonal are ones (`1`).

This type of matrix masking is commonly used in sequence processing tasks, such as in Transformer models, to mask out future tokens or pad tokens during self-attention computation, ensuring that only valid positions are attended to during the model's processing.

In [None]:
class TransformerDecoder(Layer):
  """One Transformer decoder block.

  Applies causally masked self-attention over the decoder inputs,
  cross-attention over the encoder outputs and a two-layer feed-forward
  projection. Every sub-layer is followed by a residual connection and a
  layer normalization.

  Args:
    embed_dim: Width of the feed-forward output; also passed as `key_dim` to
      both attention layers.
    latent_dim: Width of the hidden (ReLU) layer of the feed-forward block.
    num_heads: Number of heads of both attention layers.
  """
  def __init__(self, embed_dim, latent_dim, num_heads,):
    super(TransformerDecoder, self).__init__()
    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads
    # attention_1: decoder self-attention, attention_2: encoder-decoder attention.
    self.attention_1=MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.attention_2=MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = tf.keras.Sequential(
        [Dense(latent_dim, activation="relu"),Dense(embed_dim),]
    )
    self.layernorm_1=LayerNormalization()
    self.layernorm_2=LayerNormalization()
    self.layernorm_3=LayerNormalization()
    self.supports_masking = True

  def call(self, inputs, encoder_outputs, enc_mask, mask=None):
    """Run the decoder block.

    Args:
      inputs: Decoder embeddings; indexed here as (batch, target_len, ...).
      encoder_outputs: Encoder output used as key and value of the
        cross-attention.
      enc_mask: Optional 2-D padding mask of the encoder sequence
        (batch, source_len); non-zero/True marks positions to attend to.
        `None` disables cross-attention masking.
      mask: Optional 2-D padding mask of the decoder sequence
        (batch, target_len). `None` means only the causal mask is applied.

    Returns:
      The output of the final layer normalization.
    """
    batch_size = tf.shape(inputs)[0]
    target_len = tf.shape(inputs)[1]

    # The causal mask is required no matter whether a padding mask exists:
    # position t may only attend to positions <= t.
    causal_mask = tf.linalg.band_part(
        tf.ones([batch_size, target_len, target_len], dtype=tf.int32), -1, 0)

    combined_mask = causal_mask
    if mask is not None:
      # (batch, target_len) -> (batch, target_len, target_len)
      padding_mask = tf.repeat(
          tf.cast(mask[:, tf.newaxis, :], dtype="int32"), target_len, axis=1)
      combined_mask = tf.minimum(padding_mask, causal_mask)

    cross_attn_mask = None
    if enc_mask is not None:
      # (batch, source_len) -> (batch, target_len, source_len)
      cross_attn_mask = tf.repeat(
          tf.cast(enc_mask[:, tf.newaxis, :], dtype="int32"), target_len, axis=1)

    attention_output_1 = self.attention_1(
        query=inputs,key=inputs,value=inputs,
        attention_mask=combined_mask,
    )
    out_1 = self.layernorm_1(inputs + attention_output_1)

    attention_output_2 = self.attention_2(
        query=out_1,key=encoder_outputs,value=encoder_outputs,
        attention_mask=cross_attn_mask,
    )
    out_2 = self.layernorm_2(out_1 + attention_output_2)

    proj_output = self.dense_proj(out_2)
    return self.layernorm_3(out_2 + proj_output)

The `TransformerDecoder` class we've implemented represents a decoder layer within a Transformer architecture. Let's break down the functionality of this class and understand how it processes inputs, performs self-attention, cross-attention with encoder outputs, and applies feed-forward transformations.

### Class Initialization:
```python
class TransformerDecoder(Layer):
    def __init__(self, embed_dim, latent_dim, num_heads):
        super(TransformerDecoder, self).__init__()
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        # Self-attention mechanism within the decoder
        self.attention_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Cross-attention mechanism between decoder and encoder
        self.attention_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Feed-forward network for projection
        self.dense_proj = tf.keras.Sequential([
            Dense(latent_dim, activation="relu"),
            Dense(embed_dim),
        ])

        # Layer normalization layers
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()

        # Set masking support
        self.supports_masking = True
```
- `embed_dim`: The dimensionality of the input embeddings and attention mechanisms.
- `latent_dim`: The dimensionality of the intermediate dense layer in the feed-forward network.
- `num_heads`: The number of attention heads used in the multi-head attention mechanisms.

In the `__init__` method:
- `self.attention_1`: Initializes a `MultiHeadAttention` layer for self-attention within the decoder.
- `self.attention_2`: Initializes another `MultiHeadAttention` layer for cross-attention between the decoder and encoder.
- `self.dense_proj`: Defines a sequential feed-forward network with two dense layers (`Dense(latent_dim, activation="relu")` followed by `Dense(embed_dim)`) to project the decoder outputs back to the original embedding dimension.
- `self.layernorm_1`, `self.layernorm_2`, `self.layernorm_3`: Layer normalization layers to normalize inputs before and after attention and feed-forward transformations.

### `call` Method:
```python
def call(self, inputs, encoder_outputs, enc_mask, mask=None):
    # Prepare masks for attention mechanisms
    if mask is not None:
        causal_mask = tf.linalg.band_part(
            tf.ones([tf.shape(inputs)[0], tf.shape(inputs)[1], tf.shape(inputs)[1]], dtype=tf.int32), -1, 0)
        mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        enc_mask = tf.cast(enc_mask[:, tf.newaxis, :], dtype="int32")
        T = tf.shape(mask)[2]
        padding_mask = tf.repeat(mask, T, axis=1)
        cross_attn_mask = tf.repeat(enc_mask, T, axis=1)
        combined_mask = tf.minimum(padding_mask, causal_mask)

    # Self-attention within the decoder
    attention_output_1 = self.attention_1(
        query=inputs, key=inputs, value=inputs, attention_mask=combined_mask
    )
    out_1 = self.layernorm_1(inputs + attention_output_1)

    # Cross-attention between decoder and encoder outputs
    attention_output_2 = self.attention_2(
        query=out_1, key=encoder_outputs, value=encoder_outputs, attention_mask=cross_attn_mask
    )
    out_2 = self.layernorm_2(out_1 + attention_output_2)

    # Feed-forward projection
    proj_output = self.dense_proj(out_2)
    return self.layernorm_3(out_2 + proj_output)
```
- `inputs`: The input tensor representing the sequence of decoder embeddings.
- `encoder_outputs`: The encoder outputs used for cross-attention.
- `enc_mask`: The mask tensor for encoder outputs.
- `mask`: An optional mask tensor indicating which positions in the input sequence should be ignored during computation.

In the `call` method:
1. **Prepare Masks**:
   - Constructs masks (`causal_mask`, `padding_mask`, `cross_attn_mask`) for use in attention mechanisms.
   - `causal_mask`: Masks out future tokens to ensure causal self-attention.
   - `padding_mask`, `cross_attn_mask`: Masks out padding tokens and applies cross-attention between decoder and encoder outputs.

2. **Self-Attention within Decoder** (`attention_1`):
   - Applies self-attention to the decoder inputs (`inputs`) using `attention_1` (multi-head attention within the decoder).
   - Normalizes the output using `layernorm_1`.

3. **Cross-Attention between Decoder and Encoder** (`attention_2`):
   - Uses cross-attention to attend to encoder outputs (`encoder_outputs`) from the decoder inputs (`out_1`) using `attention_2`.
   - Normalizes the output using `layernorm_2`.

4. **Feed-Forward Projection** (`dense_proj`):
   - Projects the normalized output (`out_2`) through the feed-forward network (`dense_proj`) to transform the decoder representations.
   - Normalizes the final output using `layernorm_3` and returns the result.

### Usage Example:
```python
import tensorflow as tf

# Create an instance of TransformerDecoder
decoder_layer = TransformerDecoder(embed_dim=512, latent_dim=2048, num_heads=8)

# Example usage with inputs, encoder_outputs, enc_mask, and mask tensors
inputs = tf.random.normal([1, 10, 512])        # Decoder inputs (batch_size=1, sequence_length=10, embed_dim=512)
encoder_outputs = tf.random.normal([1, 8, 512]) # Encoder outputs (batch_size=1, encoder_seq_length=8, embed_dim=512)
enc_mask = tf.ones([1, 8], dtype=tf.int32)       # Encoder padding mask (batch_size=1, encoder_seq_length=8)
mask = tf.ones([1, 10], dtype=tf.int32)          # Decoder padding mask (batch_size=1, sequence_length=10)

# Apply TransformerDecoder to process inputs
decoder_outputs = decoder_layer(inputs, encoder_outputs, enc_mask, mask)

# Print the shape of the decoder outputs
print(decoder_outputs.shape)  # Output shape: (1, 10, 512)
```

### Output:
The printed `decoder_outputs.shape` will indicate the shape of the decoder outputs after processing through the `TransformerDecoder` layer. The specific shape will depend on the input tensors (`inputs`, `encoder_outputs`) and the mask tensors (`enc_mask`, `mask`) used during the computation.

This example demonstrates how to use the `TransformerDecoder` layer within a Transformer model to decode sequence representations, perform self-attention and cross-attention, and transform decoder inputs into meaningful outputs based on encoder outputs.

In [None]:
# Smoke test of the decoder block. `mask` is presumably the padding mask
# computed in an earlier cell; it is reused as the encoder-side mask.
enc_mask = mask
decoder_outputs = TransformerDecoder(
    embed_dim=512, latent_dim=2048, num_heads=4
)(emb_out, encoder_outputs, enc_mask)
print(decoder_outputs.shape)

In the provided code snippet, we are applying the `TransformerDecoder` layer to decode sequence representations based on input embeddings (`emb_out`), encoder outputs (`encoder_outputs`), and a mask (`enc_mask`). Let's analyze how this process works:

### Code Explanation:
```python
enc_mask = mask
decoder_outputs = TransformerDecoder(512, 2048, 4)(emb_out, encoder_outputs, enc_mask)
print(decoder_outputs.shape)
```

### Steps:
1. **Assign Mask (`enc_mask = mask`)**:
   - The variable `enc_mask` is assigned the value of `mask`. Assuming `mask` is a tensor indicating positions to be masked (e.g., padding positions), `enc_mask` will be used as a mask for encoder outputs during cross-attention in the decoder.

2. **Apply `TransformerDecoder`**:
   - Create an instance of `TransformerDecoder` with specified parameters (`embed_dim=512`, `latent_dim=2048`, `num_heads=4`).
   - Call the `TransformerDecoder` instance as a function with input arguments:
     - `emb_out`: Input tensor representing decoder embeddings (shape: `[batch_size, sequence_length, embed_dim]`).
     - `encoder_outputs`: Tensor representing encoder outputs (used for cross-attention) (shape: `[batch_size, encoder_seq_length, embed_dim]`).
     - `enc_mask`: Padding mask for the encoder outputs (shape: `[batch_size, encoder_seq_length]`); the layer itself expands it to `[batch_size, sequence_length, encoder_seq_length]`.

3. **Compute Decoder Outputs**:
   - The `TransformerDecoder` processes the inputs (`emb_out`) with cross-attention to the `encoder_outputs` using the provided `enc_mask`.
   - The `decoder_outputs` represent the decoded sequence outputs after processing through the `TransformerDecoder` layer.

4. **Print Output Shape**:
   - Print the shape of `decoder_outputs` to inspect the dimensions of the decoded sequence.

### Example Usage:
```python
import tensorflow as tf

# Assuming emb_out and encoder_outputs are defined tensors
emb_out = tf.random.normal([1, 10, 512])        # Decoder inputs (batch_size=1, sequence_length=10, embed_dim=512)
encoder_outputs = tf.random.normal([1, 8, 512]) # Encoder outputs (batch_size=1, encoder_seq_length=8, embed_dim=512)
mask = tf.ones([1, 1, 10], dtype=tf.int32)       # Mask for decoder inputs (batch_size=1, num_heads=1, sequence_length=10)

# Assign mask to enc_mask
enc_mask = mask

# Apply TransformerDecoder to decode sequence representations
decoder_outputs = TransformerDecoder(512, 2048, 4)(emb_out, encoder_outputs, enc_mask)

# Print the shape of the decoder outputs
print(decoder_outputs.shape)  # Output shape: (1, 10, 512)
```

### Output:
The printed `decoder_outputs.shape` will represent the shape of the decoded sequence outputs after passing through the `TransformerDecoder` layer. The specific shape `(1, 10, 512)` indicates:
- `1`: Batch size.
- `10`: Sequence length (number of tokens in the decoded sequence).
- `512`: Embedding dimension (dimensionality of the decoded sequence representations).

##Transformer Model

In [None]:
# Transformer hyperparameters. EMBEDDING_DIM replaces the value of 256 that
# was set in the preprocessing section.
EMBEDDING_DIM = 512   # embedding / model width
D_FF = 2048           # hidden width of the feed-forward sub-layers
NUM_HEADS = 8         # attention heads per attention layer
NUM_LAYERS = 1        # number of encoder blocks and of decoder blocks
NUM_EPOCHS = 10       # training epochs

 Constants related to the dimensions and configurations of a Transformer-based model, including embedding dimension(`EMBEDDING_DIM`),
 feed-forward dimensions (`D_FF`), the number of attention heads (`NUM_HEADS`), the number of layers (`NUM_LAYERS`), and the number of training epochs (`NUM_EPOCHS`).

 These constants are typically used to configure and train a Transformer model for specific tasks, such as natural language processing (NLP) tasks like machine translation or text generation.




In [None]:
# ---- Encoder --------------------------------------------------------------
# The input names "input_1" / "input_2" match the dictionary keys produced by
# `selector`, so the dataset elements are routed to the right inputs.
encoder_inputs = Input(shape=(None,), dtype="int64", name="input_1")
emb = Embeddings(ENGLISH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)
x = emb(encoder_inputs)
# Source mask, handed explicitly to every decoder block for cross-attention.
enc_mask = emb.compute_mask(encoder_inputs)

for _ in range(NUM_LAYERS):
  x = TransformerEncoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x)
encoder_outputs = x

# ---- Decoder --------------------------------------------------------------
decoder_inputs = Input(shape=(None,), dtype="int64", name="input_2")

x = Embeddings(FRENCH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)(decoder_inputs)
for i in range(NUM_LAYERS):
  x = TransformerDecoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x, encoder_outputs, enc_mask)
x = Dropout(0.5)(x)
# Per-position probability distribution over the target vocabulary.
decoder_outputs = Dense(VOCAB_SIZE, activation="softmax")(x)

# ---- Full model -----------------------------------------------------------
transformer = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
    name="transformer",
)
transformer.summary()

The code provided outlines the construction of a Transformer model using TensorFlow/Keras for a sequence-to-sequence (seq2seq) task, likely machine translation given the encoder-decoder setup.
### Code Breakdown:

1. **Encoder Inputs**:
   - Define an input layer `encoder_inputs` to receive the encoder sequence data.
   - Shape: `(None,)` - Allows variable-length sequences.
   - Data type: `int64` - Represents integer indices of words in the vocabulary.

2. **Embedding Layer (Encoder)**:
   - Instantiate an `Embeddings` layer (`emb`) to convert input indices into dense embeddings.
   - `ENGLISH_SEQUENCE_LENGTH`: Length of the English input sequence.
   - `VOCAB_SIZE`: Size of the vocabulary.
   - `EMBEDDING_DIM`: Dimensionality of the word embeddings.

3. **Compute Mask (Encoder)**:
   - Compute the masking tensor (`enc_mask`) for the encoder inputs using `emb.compute_mask`.
   - The mask is used to ignore padding tokens during self-attention within the Transformer encoder.

4. **Transformer Encoder Layers**:
   - Apply multiple layers of `TransformerEncoder` to the embedded encoder inputs (`x`).
   - Each layer performs multi-head self-attention and feed-forward network transformations.
   - `EMBEDDING_DIM`: Dimensionality of the embeddings.
   - `D_FF`: Dimensionality of the feed-forward layer.
   - `NUM_HEADS`: Number of attention heads.

5. **Encoder Outputs**:
   - Store the final encoder outputs (`encoder_outputs`) after processing through all layers of the Transformer encoder.

6. **Decoder Inputs**:
   - Define an input layer `decoder_inputs` to receive the decoder sequence data.
   - Shape and data type are similar to `encoder_inputs`.

7. **Embedding Layer (Decoder)**:
   - Instantiate another `Embeddings` layer (`Embeddings(FRENCH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)`) to convert decoder input indices into dense embeddings.
   - `FRENCH_SEQUENCE_LENGTH`: Length of the French input sequence.

8. **Transformer Decoder Layers**:
   - Apply multiple layers of `TransformerDecoder` to the embedded decoder inputs (`x`).
   - Each layer performs self-attention and cross-attention with the encoder outputs (`encoder_outputs`).
   - `EMBEDDING_DIM`: Dimensionality of the embeddings.
   - `D_FF`: Dimensionality of the feed-forward layer.
   - `NUM_HEADS`: Number of attention heads.

9. **Dropout and Dense Output Layer (Decoder)**:
   - Apply dropout (`tf.keras.layers.Dropout(0.5)`) to prevent overfitting.
   - Project the decoder outputs to the vocabulary size using `Dense(VOCAB_SIZE, activation="softmax")` for probability distribution over vocabulary words.

10. **Define Transformer Model**:
    - Create a Keras `Model` with `encoder_inputs` and `decoder_inputs` as inputs and `decoder_outputs` as the output.
    - Name the model as "transformer".

11. **Print Model Summary**:
    - Use `transformer.summary()` to display the architecture of the Transformer model, including layer types, output shapes, and trainable parameters.

### Example Usage:
```python
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from YourCustomModules import Embeddings, TransformerEncoder, TransformerDecoder

# Constants
ENGLISH_SEQUENCE_LENGTH = 100
FRENCH_SEQUENCE_LENGTH = 120
VOCAB_SIZE = 10000
EMBEDDING_DIM = 512
D_FF = 2048
NUM_HEADS = 8
NUM_LAYERS = 6

# Encoder Inputs
encoder_inputs = Input(shape=(None,), dtype="int64", name="input_1")
emb = Embeddings(ENGLISH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)
x = emb(encoder_inputs)
enc_mask = emb.compute_mask(encoder_inputs)

# Transformer Encoder Layers
for _ in range(NUM_LAYERS):
    x = TransformerEncoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x)
encoder_outputs = x

# Decoder Inputs
decoder_inputs = Input(shape=(None,), dtype="int64", name="input_2")
x = Embeddings(FRENCH_SEQUENCE_LENGTH, VOCAB_SIZE, EMBEDDING_DIM)(decoder_inputs)

# Transformer Decoder Layers
for _ in range(NUM_LAYERS):
    x = TransformerDecoder(EMBEDDING_DIM, D_FF, NUM_HEADS)(x, encoder_outputs, enc_mask)
x = Dropout(0.5)(x)
decoder_outputs = Dense(VOCAB_SIZE, activation="softmax")(x)

# Define Transformer Model
transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
transformer.summary()
```


#Training

In [None]:
class BLEU(tf.keras.metrics.Metric):
    """Streaming clipped unigram precision (a BLEU-1 style score).

    For every sample the score is
    ``matched predicted tokens / non-padding predicted tokens``, where a
    reference token can be matched at most once (clipping) and token id 0 is
    treated as padding on both sides. No brevity penalty and no higher-order
    n-grams are used. The per-sample scores are averaged over every sample
    seen since the last reset.

    Args:
        name: Name of the metric.
        **kwargs: Forwarded to `tf.keras.metrics.Metric`.
    """

    def __init__(self, name='bleu_score', **kwargs):
        super(BLEU, self).__init__(name=name, **kwargs)
        # Accumulators are metric weights so that they survive across batches
        # and are zeroed by the base class between epochs.
        self.total_score = self.add_weight(
            name='total_score', shape=(), initializer='zeros')
        self.total_samples = self.add_weight(
            name='total_samples', shape=(), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate the scores of one batch.

        Args:
            y_true: Integer token ids of the reference; must hold as many
                elements as `argmax(y_pred, -1)`.
            y_pred: Per-token scores over the vocabulary (last axis).
            sample_weight: Accepted for API compatibility; ignored.
        """
        candidate = tf.cast(tf.argmax(y_pred, axis=-1), tf.int64)
        seq_len = tf.shape(candidate)[-1]
        candidate = tf.reshape(candidate, [-1, seq_len])
        reference = tf.reshape(tf.cast(y_true, tf.int64), [-1, seq_len])

        cand_valid = tf.not_equal(candidate, 0)
        ref_valid = tf.not_equal(reference, 0)

        # ref_count[b, p]: occurrences of candidate token p in the reference.
        same_as_ref = tf.logical_and(
            tf.equal(candidate[:, :, tf.newaxis], reference[:, tf.newaxis, :]),
            ref_valid[:, tf.newaxis, :])
        ref_count = tf.reduce_sum(tf.cast(same_as_ref, tf.int32), axis=-1)

        # rank[b, p]: how many earlier candidate positions hold the same token.
        positions = tf.range(seq_len)
        earlier = positions[tf.newaxis, :] < positions[:, tf.newaxis]
        same_as_cand = tf.logical_and(
            tf.equal(candidate[:, :, tf.newaxis], candidate[:, tf.newaxis, :]),
            cand_valid[:, tf.newaxis, :])
        rank = tf.reduce_sum(
            tf.cast(tf.logical_and(same_as_cand, earlier[tf.newaxis, :, :]),
                    tf.int32),
            axis=-1)

        # The k-th repetition of a token only counts while the reference still
        # has an unused occurrence of it (clipping).
        matched = tf.logical_and(cand_valid, rank < ref_count)

        total_matches = tf.reduce_sum(tf.cast(matched, tf.float32), axis=-1)
        total_words = tf.reduce_sum(tf.cast(cand_valid, tf.float32), axis=-1)
        # divide_no_nan: an all-padding prediction scores 0 instead of NaN.
        scores = tf.math.divide_no_nan(total_matches, total_words)

        dtype = self.total_score.dtype
        self.total_score.assign_add(tf.cast(tf.reduce_sum(scores), dtype))
        self.total_samples.assign_add(tf.cast(tf.shape(candidate)[0], dtype))

    def result(self):
        """Return the mean per-sample score (0 when nothing was seen yet)."""
        return tf.math.divide_no_nan(
            tf.convert_to_tensor(self.total_score),
            tf.convert_to_tensor(self.total_samples))

Custom implementation of a BLEU metric in TensorFlow/Keras. The BLEU (Bilingual Evaluation Understudy) score is commonly used to evaluate the quality of machine translation or text generation models.

### Issues with Current Implementation:

1. **Initialization of `bleu_score` in `__init__`**:
   - The `bleu_score` attribute is a plain Python number that is initialized to `0` in the constructor (`__init__`) and reset to `0` again at the start of every `update_state` call. BLEU should instead accumulate scores across batches and compute the final score in `result()`; as written, `result()` only reflects the most recent batch.

2. **Update State Function (`update_state`)**:
   - The `update_state` function is responsible for computing BLEU scores based on `y_true` (ground truth) and `y_pred` (predicted) sequences.
   - You are currently trying to compute BLEU scores by comparing each prediction (`y_pred`) with its corresponding ground truth (`y_true`). However, the logic for calculating BLEU scores and matching n-grams is incomplete and needs improvement.

3. **Result Calculation (`result`)**:
   - The `result` function should return the computed BLEU score based on accumulated statistics from `update_state`. However, the current implementation does not correctly accumulate or average BLEU scores across batches.

### Suggestions for Improvement:

Here's an updated version of the `BLEU` metric class with improved logic for computing BLEU scores:

```python
import tensorflow as tf
from collections import Counter

class BLEU(tf.keras.metrics.Metric):
    def __init__(self, name='bleu_score'):
        super(BLEU, self).__init__(name=name)
        self.total_score = self.add_weight(name='total_score', initializer='zeros')
        self.total_samples = self.add_weight(name='total_samples', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Convert y_pred to argmax indices
        y_pred_indices = tf.argmax(y_pred, axis=-1)

        batch_bleu = 0.0
        batch_size = tf.shape(y_true)[0]

        for i in range(batch_size):
            reference = y_true[i, :]
            candidate = y_pred_indices[i, :]

            # Calculate BLEU for individual example
            bleu = self.calculate_bleu(reference, candidate)
            batch_bleu += bleu
        
        # Update total score and total samples count
        self.total_score.assign_add(batch_bleu)
        self.total_samples.assign_add(tf.cast(batch_size, tf.float32))

    def result(self):
        # Compute average BLEU score over all samples
        return self.total_score / self.total_samples

    def calculate_bleu(self, reference, candidate):
        reference = tf.boolean_mask(reference, reference != 0)
        candidate = tf.boolean_mask(candidate, candidate != 0)

        if len(candidate) == 0:
            return 0.0

        clipped_counts = Counter(candidate) & Counter(reference)
        total_matches = sum(clipped_counts.values())
        total_words = len(candidate)

        bleu_score = total_matches / total_words
        return bleu_score
```

### Key Improvements and Changes:

- **Initialization of Accumulators (`total_score`, `total_samples`)**:
  - Use `self.add_weight` to initialize accumulators for total BLEU score and total number of samples processed.
  
- **Improved `update_state` Function**:
  - Iterate over each example in the batch.
  - Calculate BLEU score for each example using `calculate_bleu`.
  - Accumulate batch BLEU scores into `total_score` and update `total_samples`.

- **Correct Calculation of BLEU**:
  - The `calculate_bleu` function computes a clipped unigram precision (it matches individual tokens, not higher-order n-grams) between the reference and candidate sequences.
  - Use `Counter` to count the clipped token matches and compute the precision.

- **Result Calculation (`result`)**:
  - Compute the average BLEU score across all samples processed (`total_score / total_samples`).

### Usage Example:
```python
# Create an instance of the BLEU metric
bleu_metric = BLEU()

# Compute BLEU scores during model training or evaluation
for (encoder_inputs, decoder_inputs), targets in dataset:
    predictions = model([encoder_inputs, decoder_inputs])
    bleu_metric.update_state(targets, predictions)

# Get the final BLEU score
final_bleu_score = bleu_metric.result()
print("Final BLEU Score:", final_bleu_score.numpy())
```

In [None]:
class Scheduler(LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  The second term grows linearly with `step` and is the smaller one until
  `step` reaches `warmup_steps`; afterwards the inverse-square-root term
  takes over. At step 0 the result is 0.

  Args:
    d_model: Model width used to scale the learning rate.
    warmup_steps: Number of steps of the linear warm-up phase.
  """
  def __init__(self, d_model, warmup_steps):
    super(Scheduler, self).__init__()
    # Keep the raw arguments so get_config() can return them unchanged.
    self._init_args = {"d_model": d_model, "warmup_steps": warmup_steps}
    self.d_model = tf.cast(d_model, tf.float64)
    self.warmup_steps = tf.cast(warmup_steps, dtype=tf.float64)

  def __call__(self, step):
    """Return the learning rate for optimizer step `step` as float64."""
    step = tf.cast(step, dtype=tf.float64)
    return (self.d_model**(-0.5))*tf.math.minimum(step**(-0.5), step * (self.warmup_steps ** -1.5))

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return dict(self._init_args)


### Explanation of the Scheduler Class:

1. **Initialization**:
   - The `Scheduler` class inherits from `LearningRateSchedule`.
   - `d_model`: The model's dimensionality, typically the size of the hidden layers in the Transformer.
   - `warmup_steps`: The number of warmup steps during which the learning rate increases linearly.

2. **`__call__` Method**:
   - The `__call__` method is invoked to compute the learning rate at a given training step (`step`).
   - `step` is cast to `tf.float64` for precision in calculations.
   - The learning rate formula used here is derived from the Transformer's learning rate schedule:
     \[
     \text{lr} = \text{d\_model}^{-\frac{1}{2}} \cdot \min\left(\text{step}^{-\frac{1}{2}},\ \text{step} \cdot \text{warmup\_steps}^{-\frac{3}{2}}\right)
     \]
   - This formula combines two components:
     - The `step^(-0.5)` decay term, which decays as the training progresses.
     - The `step * (warmup_steps^(-1.5))` warmup term, which grows linearly with `step` and is the smaller of the two terms until `step` reaches `warmup_steps`.
   - The `tf.math.minimum` function is used to select the smaller value between these two components.


In [None]:
# Length of the linear warm-up phase, in optimizer steps.
WARM_UP_STEPS = 4000
lr_scheduled = Scheduler(d_model=EMBEDDING_DIM, warmup_steps=WARM_UP_STEPS)

In [None]:
# Targets are integer token ids and the model outputs softmax probabilities,
# hence the sparse categorical cross-entropy on probabilities.
transformer.compile(
    optimizer=Adam(
        learning_rate=lr_scheduled,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9,
    ),
    loss=SparseCategoricalCrossentropy(),
)
# Optional: pass metrics=[BLEU()] to track the custom metric (add
# run_eagerly=True if the metric relies on Python-level loops).

In [None]:
# Train for NUM_EPOCHS epochs (defined with the other model hyperparameters)
# instead of repeating the literal value here.
history=transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS)

In [None]:
# Persist the trained weights to Google Drive (Drive must already be mounted).
# NOTE(review): Keras 3 (installed by upgrading TensorFlow) appears to require
# weight files to end in `.weights.h5` - confirm, and rename here and in any
# matching load_weights call if so.
transformer.save_weights('/content/drive/MyDrive/NLP Repository/Projects/Neural_machine_translation/transformers.h5')

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Report the compiled loss (and any compiled metrics) on the validation set.
transformer.evaluate(val_dataset)

#Testing

In [None]:
# Reverse lookup table: integer token id -> French vocabulary word.
# get_vocabulary() is called once (it builds the full word list each time) and
# enumerate() pairs every word with its position, which is its token id.
index_to_word=dict(enumerate(french_vectorize_layer.get_vocabulary()))

This code constructs a dictionary `index_to_word` that maps indices to corresponding words in a vocabulary. This type of dictionary is commonly used in natural language processing tasks, such as machine translation, where you need to map token indices back to their original words for human-readable output or evaluation.

### Explanation of the Code:

1. **`french_vectorize_layer`**:
   - `french_vectorize_layer` is the Keras `TextVectorization` layer defined earlier in the notebook; it converts French text into integer token indices.
   - It holds a vocabulary of at most `VOCAB_SIZE` tokens built from the French text it was fitted on.

2. **`get_vocabulary()` Method**:
   - The `get_vocabulary()` method retrieves the vocabulary list from the `french_vectorize_layer`.
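   - The first two entries of this list are reserved: index 0 is the padding token `''` and index 1 is the out-of-vocabulary token `'[UNK]'`. The remaining words are sorted by frequency (most frequent to least frequent) based on the training data used to build the vectorization layer.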

3. **Dictionary Comprehension** (`{x: y for x, y in ...}`):
   - The dictionary comprehension iterates over pairs of `index` (from `range(len(vocabulary))`) and `word` (from `vocabulary`).
   - `range(len(vocabulary))` generates indices corresponding to the vocabulary items.
   - `french_vectorize_layer.get_vocabulary()` returns the list of words in the vocabulary.

4. **`index_to_word` Dictionary**:
   - The resulting `index_to_word` dictionary maps each index (`x`) to its corresponding word (`y`) in the vocabulary.
   - This dictionary allows you to easily look up the original word given a token index.

### Example Usage:
After constructing `index_to_word`, you can use it to decode token sequences back into human-readable text. Here's an example of how you might use `index_to_word`:

```python
# Assuming 'index_to_word' has been constructed as described...

# Example token sequence (list of integers representing tokens)
token_sequence = [1, 4, 7, 0, 2, 3]

# Convert token sequence to human-readable text using 'index_to_word'
decoded_text = ' '.join(index_to_word[token] for token in token_sequence if token in index_to_word)

print("Decoded Text:", decoded_text)
```

In this example:
- We assume `index_to_word` is already populated with mappings from token indices to words.
- We have a `token_sequence` (list of token indices).
- We pass a generator expression to `join` to map each token index in `token_sequence` to its corresponding word using `index_to_word`.
- The resulting `decoded_text` represents the original human-readable text decoded from the token sequence.

In [None]:
def translator(english_sentence):
  """Greedily translate an English sentence into French.

  The decoder input starts as 'starttoken' and grows by one word per step:
  at step i the model is run on the English tokens plus the French words
  produced so far, and the most probable token at output position i is
  taken as the next word.

  Args:
    english_sentence: The English text to translate.

  Returns:
    The predicted French words joined by single spaces, without the start
    and end markers. Empty if the first prediction ends the sentence.
  """
  start_token='starttoken'
  end_token='endtoken'

  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  translated_words=[]

  for i in range(FRENCH_SEQUENCE_LENGTH):
    shifted_target=' '.join([start_token]+translated_words)
    tokenized_shifted_target=french_vectorize_layer([shifted_target])
    # Call the model directly instead of predict(): predict() builds a data
    # pipeline and prints a progress bar on every call, which is wasteful for
    # a single small batch inside a Python loop.
    output=transformer([tokenized_english_sentence,tokenized_shifted_target],
                       training=False)
    french_word_index=int(tf.argmax(output[0][i],axis=-1).numpy())
    current_word=index_to_word[french_word_index]
    # Stop on the end marker. Also stop on the padding token (''): it would
    # disappear when the target is re-tokenized, so position i+1 would no
    # longer line up with the words generated so far.
    if current_word in (end_token,''):
      break
    translated_words.append(current_word)
  return ' '.join(translated_words)

The `translator` function above performs greedy translation with the trained Transformer model (`transformer`). It takes an English sentence, tokenizes it using `english_vectorize_layer`, and generates the corresponding French translation by iteratively predicting one word at a time with the model.

### Code Explanation:

1. **Tokenization of Input Sentence**:
   - `tokenized_english_sentence`: Tokenizes the input `english_sentence` using `english_vectorize_layer`, which converts the text into a sequence of token indices suitable for model input.

2. **Initialization of `shifted_target`**:
   - `shifted_target`: Starts with the token `'starttoken'` to indicate the beginning of the target (French) sentence generation.

3. **Translation Loop** (`for i in range(FRENCH_SEQUENCE_LENGTH)`):
   - Iterates over a fixed number of steps (`FRENCH_SEQUENCE_LENGTH`) to generate the translated sentence.
   - For each iteration:
     - Tokenizes the `shifted_target` to obtain `tokenized_shifted_target`.
     - Uses the pre-trained Transformer model (`transformer.predict`) to predict the next word in the French translation based on the tokenized English sentence (`tokenized_english_sentence`) and the current tokenized French sequence (`tokenized_shifted_target`).
     - Selects the most probable word (by taking the `argmax` of the output probabilities) and retrieves its corresponding word using `index_to_word`.
     - Appends the predicted word to `shifted_target` to form the next input for the next iteration.
     - Terminates the loop if the predicted word is `'endtoken'`, indicating the end of the translation.

4. **Final Translation**:
   - Returns the translated French sentence by removing the initial `'starttoken'` prefix (`shifted_target[11:]`).

### Example Usage:
To use the `translator` function for translating English sentences to French using your pre-trained Transformer model (`transformer`), you would call the function with an English sentence as input. Here's an example:

```python
english_sentence = "How are you?"
translated_french_sentence = translator(english_sentence)
print("Translated French Sentence:", translated_french_sentence)
```

In [None]:
# Spot-check the translator on a handful of English sentences (one call per cell).
translator('What makes you think that it is not true?')

In [None]:
translator('Have you ever watched soccer under the rain?')

In [None]:
translator("what is your name?")

In [None]:
translator('Great trees do not grow with ease, the stronger the winds, the stronger the trees')

In [None]:
translator('My hotel told me to call you. ')

In [None]:
translator('His French is improving little by little')

In [None]:
translator('I love to write')

In [None]:
translator('Perhaps she will come tomorrow')

In [None]:
translator('Tom has never heard Mary sing.')

In [None]:
translator('She handed him the money')

#Visualization

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def visualize(english_sentence,french_sentence='je lai fait très bien'):
  """Run the attention-score model on one English/French sentence pair.

  Args:
    english_sentence: English source text fed to the encoder side.
    french_sentence: French text fed to the decoder side; 'starttoken ' is
      prepended to it. It should be the translation of `english_sentence`,
      otherwise the returned attention maps describe a mismatched pair. The
      default matches the example sentence 'I did it very well'.

  Returns:
    Whatever `attention_score_model.predict` returns for the pair; the cells
    below index it by layer name (e.g. 'decoder_layer1_block2').
  """
  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  shifted_target='starttoken '+french_sentence

  tokenized_shifted_target=french_vectorize_layer([shifted_target])
  attention_weights=attention_score_model.predict([tokenized_english_sentence,
                                                   tokenized_shifted_target])

  return attention_weights

out=visualize('I did it very well')


In [None]:
# Show the shape of the first element of the 'decoder_layer1_block2' attention
# output (presumably the single batch item -- confirm against the model).
print(out['decoder_layer1_block2'][0].shape)

In [None]:
plt.figure(figsize = (12,12))

for i in range(NUM_HEADS):
  ax = plt.subplot(2,4, i+1)

  plt.imshow(out['decoder_layer1_block2'][0][i][0:10,0:10])
  plt.title("Attention Scores for head:->"+str(i+1))