#Import Statements

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input,GlobalMaxPooling1D,Embedding,TextVectorization,LayerNormalization,MultiHeadAttention)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

In [None]:
# Number of examples per batch; used when batching the train, validation and test datasets.
BATCH_SIZE=64

#Data Preparation

##Load the data

In [None]:
# Load IMDB reviews as (review, label) pairs: the full 'train' split for training,
# the first half of 'test' for validation and the second half for final testing.
train_ds,val_ds,test_ds=tfds.load('imdb_reviews', split=['train', 'test[:50%]', 'test[50%:]'],as_supervised=True)

In [None]:
# Display the dataset object to inspect its element spec.
train_ds

In [None]:
# Print two raw (review, label) pairs from the validation split.
for review,label in val_ds.take(2):
  print(review)
  print(label)

##Process the Data

In [None]:
def standardization(input_data):
    '''
    Normalize raw review text before it is tokenized.

    Steps, in order: lowercase, replace HTML tags with a space, remove the
    ASCII punctuation characters listed in `string.punctuation`.

    Input: raw reviews (string tensor)
    output: standardized reviews (string tensor)
    '''
    lowercase=tf.strings.lower(input_data)
    # Tags are replaced by a space rather than removed outright: text such as
    # "movie.<br /><br />The" would otherwise collapse into the single token
    # "moviethe" once the punctuation is stripped on the next line.
    no_tag=tf.strings.regex_replace(lowercase,"<[^>]+>"," ")
    output=tf.strings.regex_replace(no_tag,"[%s]"%re.escape(string.punctuation),"")

    return output

In [None]:
# Sanity-check `standardization` on a string containing HTML tags, punctuation and non-ASCII characters.
standardization(tf.constant("<u>In the movie?, </u>man called Tévèz, went to a friend’s pl**ce and they had a tensed discussion. I don’t love this movie! would you?<br> <br /><br />T"))

In [None]:
# Vocabulary cap passed to TextVectorization as `max_tokens`.
VOCAB_SIZE=10000
# Every review is padded/truncated to this many token ids (`output_sequence_length`).
SEQUENCE_LENGTH=250
# NOTE(review): EMBEDDING_DIM is reassigned to 128 in the "Transformer Model" section
# before the model is built, so this value of 300 is not the one the model uses.
EMBEDDING_DIM=300

In [None]:
# Text -> integer token ids, using the custom `standardization` function,
# a vocabulary capped at VOCAB_SIZE and a fixed output length of SEQUENCE_LENGTH.
vectorize_layer=TextVectorization(
    standardize=standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH
)

In [None]:
# Fit the vocabulary on the training reviews only (labels are dropped).
# `adapt` is documented to take a *batched* tf.data.Dataset, so the text stream
# is batched first; the resulting vocabulary does not depend on the batch size.
training_data=train_ds.map(lambda x,y:x).batch(BATCH_SIZE)
vectorize_layer.adapt(training_data)

In [None]:
# Number of entries in the vocabulary learned by `adapt`.
len(vectorize_layer.get_vocabulary())

In [None]:
def vectorizer(review,label):
    """Turn a raw review into token ids with `vectorize_layer`; the label is passed through unchanged."""
    token_ids=vectorize_layer(review)
    return token_ids,label

In [None]:
# Apply the vectorizer to every (review, label) pair of the train and validation splits.
train_dataset=train_ds.map(vectorizer)
val_dataset=val_ds.map(vectorizer)

In [None]:
# Look up the token stored at index 411 of the learned vocabulary.
vectorize_layer.get_vocabulary()[411]

In [None]:
# Print one vectorized (token ids, label) pair from the training set.
for review,label in train_dataset.take(1):
  print(review)
  print(label)

In [None]:
# Group examples into batches of BATCH_SIZE and prefetch so input preparation overlaps with training.
# NOTE(review): this cell rebinds `train_dataset`/`val_dataset` to their batched versions,
# so running it a second time without re-running the cell above batches the data twice.
# NOTE(review): no `.shuffle(...)` is applied to the training data here.
train_dataset=train_dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset=val_dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

#Modeling

##Transformers

###Embeddings

In [None]:
def positional_encoding(model_size,SEQUENCE_LENGTH):
  """Build the fixed sinusoidal positional-encoding table.

  For position `pos` and column `i`:
    even i: sin(pos / 10000**(i / model_size))
    odd  i: cos(pos / 10000**((i-1) / model_size))

  Args:
    model_size: number of columns (embedding dimension) of the table.
    SEQUENCE_LENGTH: number of positions (rows) of the table.

  Returns:
    A float32 tensor of shape (1, SEQUENCE_LENGTH, model_size); the leading
    axis of size 1 lets the table broadcast over a batch of embeddings.
  """
  # Computed in one vectorized NumPy pass instead of a Python loop over every
  # (position, column) pair with one TensorFlow op per position.
  positions=np.arange(SEQUENCE_LENGTH,dtype=np.float64)[:,np.newaxis]
  columns=np.arange(model_size)[np.newaxis,:]
  # An odd column shares the frequency of the even column just before it.
  paired_columns=columns-columns%2
  angles=positions/np.power(10000.0,paired_columns/model_size)
  table=np.where(columns%2==0,np.sin(angles),np.cos(angles))
  return tf.cast(table[np.newaxis,...],dtype=tf.float32)

The `positional_encoding` function provided is used to generate positional encodings for Transformer models. Positional encodings are added to the input embeddings to provide positional information to the model, enabling it to understand the order or sequence of tokens within an input sequence.

### Function Explanation:

1. **Inputs**:
   - `model_size`: The dimensionality of the input embeddings or the hidden size of the Transformer model.
   - `SEQUENCE_LENGTH`: The maximum sequence length for which positional encodings are generated.

2. **Initialization**:
   - Initialize an empty list `output` to collect positional encodings for each position in the sequence.

3. **Positional Encoding Calculation**:
   - Iterate over each position `pos` in the sequence range `[0, SEQUENCE_LENGTH)`:
     - Create a numpy array `PE` (Positional Encoding) initialized with zeros and of size `model_size`.
     - Iterate over each dimension `i` in the `model_size`:
       - If `i` is even (`i % 2 == 0`), compute the sine-based positional encoding using the formula:
         $$
         \text{PE}[i] = \sin\left(\frac{\text{pos}}{10000^{i / \text{model\_size}}}\right)
         $$
       - If `i` is odd (`i % 2 != 0`), compute the cosine-based positional encoding using the formula:
         $$
         \text{PE}[i] = \cos\left(\frac{\text{pos}}{10000^{(i-1) / \text{model\_size}}}\right)
         $$
     - Append the positional encoding `PE` as a new axis (`tf.expand_dims(PE, axis=0)`) to the `output` list.

4. **Concatenation and Reshaping**:
   - Concatenate all positional encodings in the `output` list along the `axis=0` (sequence axis) to create a tensor `out`.
   - Expand the dimensions of `out` to include a batch dimension (`tf.expand_dims(out, axis=0)`).

5. **Data Type Casting**:
   - Cast the resulting tensor `out` to `tf.float32` data type using `tf.cast`.

6. **Return**:
   - Return the final positional encoding tensor `out`.

### Key Points:

- **Positional Encoding Formula**:
  - The positional encoding formula incorporates sine and cosine functions with varying frequencies based on the position (`pos`) and dimension (`i`) within the embedding space.
  - Using sine and cosine functions ensures that the positional encodings have unique patterns that can convey positional information to the model.

- **Dimensional Interpretation**:
  - Each dimension (`i`) of the positional encoding tensor corresponds to a different frequency pattern, enabling the model to differentiate between positions effectively.

- **Sequence Length Consideration**:
  - The positional encoding tensor generated (`out`) will have a shape of `(1, SEQUENCE_LENGTH, model_size)`, suitable for adding to input embeddings of sequences with maximum length `SEQUENCE_LENGTH`.



In [None]:
class Embeddings(Layer):
  """Token embeddings plus fixed sinusoidal positional encodings.

  Args:
    sequence_length: number of positions in the positional-encoding table.
    vocab_size: number of rows of the token embedding matrix.
    embed_dim: size of each embedding vector.
    **kwargs: standard Keras layer arguments (`name`, `dtype`, `trainable`, ...).
      Accepting them is what allows `Embeddings.from_config(layer.get_config())`
      to work, because the base config contains those keys.
  """
  def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
    super(Embeddings, self).__init__(**kwargs)
    self.token_embeddings=Embedding(
        input_dim=vocab_size, output_dim=embed_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    # The table depends only on the constructor arguments, so it is built once
    # here instead of being recomputed on every forward pass.
    self.embedded_positions=positional_encoding(
        self.embed_dim,self.sequence_length)

  def call(self, inputs):
    """Return token embeddings of `inputs` with the positional table added (broadcast over the batch)."""
    embedded_tokens = self.token_embeddings(inputs)
    return embedded_tokens + self.embedded_positions

  def compute_mask(self, inputs, mask=None):
    """Boolean mask that is True wherever the token id is not 0 (0 is treated as padding)."""
    return tf.math.not_equal(inputs, 0)

  def get_config(self):
      """Return the base layer config extended with this layer's constructor arguments."""
      config = super().get_config()
      config.update({
        "sequence_length": self.sequence_length,
        "vocab_size": self.vocab_size,
        "embed_dim": self.embed_dim,
      })
      return config


The `Embeddings` class we provided is a custom layer in TensorFlow/Keras that combines token embeddings with positional encodings, typically used in Transformer architectures for natural language processing tasks. Here's a breakdown of how this layer works:

### Class Initialization (`__init__`):
- **Parameters**:
  - `sequence_length`: Maximum length of input sequences.
  - `vocab_size`: Size of the vocabulary (number of unique tokens).
  - `embed_dim`: Dimensionality of the token embeddings.

- **Attributes**:
  - `token_embeddings`: An `Embedding` layer that maps token indices to dense embeddings of size `(vocab_size, embed_dim)`.
  - `sequence_length`, `vocab_size`, `embed_dim`: Store input parameters as attributes of the layer.

### `call` Method:
- **Input**:
  - `inputs`: Tensor representing input sequences (token indices).

- **Output**:
  - Computes token embeddings (`embedded_tokens`) using the `Embedding` layer (`token_embeddings`).
  - Generates positional encodings (`embedded_positions`) using a custom `positional_encoding` function.
  - Returns the sum of token embeddings and positional encodings as the output.

### `compute_mask` Method:
- **Input**:
  - `inputs`: Tensor representing input sequences.

- **Output**:
  - Computes a mask that marks non-padding tokens (tokens not equal to 0).

### `get_config` Method:
- **Output**:
  - Generates a dictionary (`config`) containing the layer's configuration, including `sequence_length`, `vocab_size`, and `embed_dim`.
  - Updates the base configuration using `super().get_config()` and adds custom attributes.

### Usage Example:
You can use the `Embeddings` layer within a Transformer model or any sequence processing model in TensorFlow/Keras. Here's how you might use it:

```python
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Define input sequence tensor
input_sequence = Input(shape=(sequence_length,), dtype='int32', name='input_sequence')

# Create an instance of the Embeddings layer
embeddings_layer = Embeddings(sequence_length=sequence_length, vocab_size=vocab_size, embed_dim=embed_dim)

# Apply the embeddings layer to the input sequence
embedded_output = embeddings_layer(input_sequence)

# Example: Add additional layers to the model
dense_layer = Dense(units=128, activation='relu')(embedded_output)
output_layer = Dense(units=num_classes, activation='softmax')(dense_layer)

# Create the model
model = Model(inputs=input_sequence, outputs=output_layer)

# Compile the model with appropriate loss and optimizer
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()
```

In this example:
- `input_sequence` represents the input tensor with shape `(None, sequence_length)`, where `None` indicates a variable batch size.
- The `Embeddings` layer (`embeddings_layer`) is initialized with specified parameters (`sequence_length`, `vocab_size`, `embed_dim`).
- The `embedded_output` is obtained by applying the `embeddings_layer` to `input_sequence`.
- Additional layers (e.g., `Dense`) can be added on top of the embeddings for further processing.
- Finally, a full model (`model`) is created, compiled, and ready for training or inference.

In [None]:
# One sequence of 8 token ids; the trailing zeros are padding.
test_input=tf.constant([[  2, 112,   10,   12,  5,   0,   0,   0,]])

# Embeddings(sequence_length=8, vocab_size=20000, embed_dim=256)
emb=Embeddings(8,20000,256)
emb_out=emb(test_input)
print(emb_out.shape)



### Given Input and Embeddings Layer Configuration:

- **Input**:
  ```python
  test_input = tf.constant([[2, 112, 10, 12, 5, 0, 0, 0]])
  ```
  This input represents a batch of sequences with a single sequence `[2, 112, 10, 12, 5, 0, 0, 0]`.

- **Embeddings Layer Configuration** (`Embeddings(8, 20000, 256)`):
  - `sequence_length`: `8`
  - `vocab_size`: `20000`
  - `embed_dim`: `256`

### Steps to Compute `emb_out`:

1. **Token Embeddings** (`embedded_tokens`):
   - The `Embeddings` layer (`emb`) first computes token embeddings (`embedded_tokens`) for the input sequence using the `token_embeddings` layer (an instance of `tf.keras.layers.Embedding`).
   - Each token in the input sequence is replaced with its corresponding embedding vector.
   - The shape of `embedded_tokens` will be `(1, sequence_length, embed_dim)`, where `1` represents the batch size (single sequence), `sequence_length` is `8`, and `embed_dim` is `256`.

2. **Positional Encodings** (`embedded_positions`):
   - The `Embeddings` layer also generates positional encodings (`embedded_positions`) using a function like `positional_encoding(embed_dim, sequence_length)`.
   - The shape of `embedded_positions` will be `(1, sequence_length, embed_dim)`, matching the shape of `embedded_tokens`.

3. **Combining Embeddings and Positions** (`emb_out`):
   - The final output (`emb_out`) is obtained by adding the token embeddings (`embedded_tokens`) and the positional encodings (`embedded_positions`) element-wise.
   - This operation is performed using TensorFlow's broadcasting rules, where each element in `embedded_tokens` is added to its corresponding element in `embedded_positions`.
   - The resulting shape of `emb_out` will also be `(1, sequence_length, embed_dim)`.

### Output Shape of `emb_out`:
- After combining token embeddings and positional encodings, the shape of `emb_out` will be `(1, 8, 256)`.


###Encoder

In [None]:
class TransformerEncoder(Layer):
    """One Transformer encoder block: self-attention and a feed-forward network,
    each followed by a residual connection and layer normalization.

    Args:
      embed_dim: size of the input/output vectors; also used as the attention `key_dim`.
      dense_dim: width of the hidden layer of the feed-forward network.
      num_heads: number of attention heads.
      **kwargs: standard Keras layer arguments (`name`, `dtype`, `trainable`, ...),
        accepted so that `from_config(get_config())` can rebuild the layer.
    """
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim,
        )
        self.dense_proj=tf.keras.Sequential(
            [Dense(dense_dim, activation="relu"),Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
      """Apply the encoder block.

      Args:
        inputs: tensor of embeddings; attended to itself (query = key = value).
        mask: optional boolean padding mask with one entry per (batch, position).
          When omitted, attention is computed without any mask.

      Returns:
        A tensor with the same shape as `inputs`.
      """
      # Must be bound even when no mask is supplied; the attention layer accepts None.
      padding_mask = None
      if mask is not None:
        # Pairwise mask: entry (b, i, j) is 1 only if positions i and j are both real tokens.
        mask1 = mask[:, :, tf.newaxis]
        mask2 = mask[:,tf.newaxis, :]
        padding_mask = tf.cast(mask1&mask2, dtype="int32")

      attention_output = self.attention(
          query=inputs, key=inputs,value=inputs,attention_mask=padding_mask
      )

      proj_input = self.layernorm_1(inputs + attention_output)
      proj_output = self.dense_proj(proj_input)
      return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
      """Return the base layer config extended with this layer's constructor arguments."""
      config = super().get_config()
      config.update({
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
        "dense_dim": self.dense_dim,
      })
      return config

The `TransformerEncoder` class defined is a custom layer representing the encoder block in a Transformer model. Let's break down the key components and functionality of this layer:

### Class Initialization (`__init__`):
- **Parameters**:
  - `embed_dim`: Dimensionality of the input embeddings.
  - `dense_dim`: Dimensionality of the intermediate dense layer within the feed-forward network.
  - `num_heads`: Number of attention heads in the multi-head attention mechanism.

- **Attributes**:
  - `embed_dim`, `dense_dim`, `num_heads`: Store input parameters as attributes of the layer.
  - `attention`: Instance of `MultiHeadAttention` layer, configured with the specified number of heads (`num_heads`) and key dimension (`embed_dim`).
  - `dense_proj`: Sequential model consisting of two `Dense` layers: the first with `dense_dim` units and ReLU activation, and the second with `embed_dim` units (to match the input embedding dimension).
  - `layernorm_1`, `layernorm_2`: Layer normalization layers applied before and after the feed-forward network.
  - `supports_masking`: Indicates that the layer supports masking of input sequences.

### `call` Method:
- **Inputs**:
  - `inputs`: Input tensor representing a sequence of embeddings (shape: `[batch_size, sequence_length, embed_dim]`).
  - `mask`: Optional input mask tensor to handle padding masks (shape: `[batch_size, sequence_length]`).

- **Masking**:
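  - If a `mask` tensor is provided (`mask is not None`), it is expanded into a pairwise padding mask (`padding_mask`) of shape `[batch_size, sequence_length, sequence_length]`, whose entry `(i, j)` is 1 only when positions `i` and `j` are both real (non-padding) tokens.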
  - The padding mask (`padding_mask`) is used to mask out padded elements in the input sequences during attention computation.

- **Attention Computation**:
  - The input `inputs` tensor is passed through the `attention` layer (multi-head attention mechanism), where the same input is used for `query`, `key`, and `value`.
  - The `attention_mask` (if provided) masks out specific elements in the attention computation based on the `padding_mask`.

- **Layer Normalization and Feed-Forward Network**:
  - The output of the attention mechanism (`attention_output`) is added to the input (`inputs`) and normalized using `layernorm_1`.
  - The result (`proj_input`) is passed through the feed-forward network (`dense_proj`), consisting of two `Dense` layers.
  - The output of the feed-forward network (`proj_output`) is added back to `proj_input` and normalized using `layernorm_2`.

- **Output**:
  - Returns the final output tensor (`proj_input + proj_output`) after layer normalization.

### `get_config` Method:
- **Output**:
  - Generates a dictionary (`config`) containing the layer's configuration, including `embed_dim`, `num_heads`, and `dense_dim`.
  - Updates the base configuration using `super().get_config()` and adds custom attributes.

In [None]:
# Run the demo embeddings through one encoder block (embed_dim=256, dense_dim=2048, num_heads=2)
# and check that the output shape matches the input shape.
encoder_outputs = TransformerEncoder(256,2048,2)(emb_out)
print(encoder_outputs.shape)



### Given Configuration:

- **Input Shape** (`emb_out`):
  - `emb_out` has a shape of `(batch_size, sequence_length, embed_dim)`, where:
    - `batch_size` is inferred from the input data.
    - `sequence_length` is the length of the input sequence.
    - `embed_dim` is the dimensionality of the input embeddings.

- **TransformerEncoder Configuration**:
  - `embed_dim`: `256`
  - `dense_dim`: `2048`
  - `num_heads`: `2`

### Steps to Compute `encoder_outputs`:

1. **Instantiation of `TransformerEncoder`**:
   - Create an instance of `TransformerEncoder` with the specified parameters (`embed_dim=256`, `dense_dim=2048`, `num_heads=2`).

2. **Applying `TransformerEncoder` to `emb_out`**:
   - Pass the input tensor `emb_out` through the `TransformerEncoder` layer.
   - The `TransformerEncoder` layer will perform the following operations:
     - Apply multi-head self-attention using the `MultiHeadAttention` mechanism with `num_heads=2` and `key_dim=256`.
     - Normalize the attention output using layer normalization (`layernorm_1`).
     - Process the normalized output through a feed-forward neural network (`dense_proj`) with a hidden dimension of `2048`.
     - Apply another layer normalization (`layernorm_2`) to the output of the feed-forward network.
   
3. **Output Shape** (`encoder_outputs`):
   - The shape of `encoder_outputs` will match the shape of the input `emb_out`, which is `(batch_size, sequence_length, embed_dim)`.
   - Therefore, `encoder_outputs.shape` will be `(batch_size, sequence_length, embed_dim)`.


###Transformer Model

In [None]:
# Hyperparameters of the Transformer classifier built in the next cell.
# NOTE(review): this overrides the EMBEDDING_DIM=300 set in the data-preparation section.
EMBEDDING_DIM=128
D_FF=1024
NUM_HEADS=8
NUM_LAYERS=1
# NOTE(review): the `transformer.fit(...)` call in the Training section passes the
# literal `epochs=10` and does not read this constant.
NUM_EPOCHS=20


- **EMBEDDING_DIM**: This parameter defines the dimensionality of the token embeddings. In Transformer models, each token in the input sequence is initially represented as a dense vector of `EMBEDDING_DIM` dimensions.

- **D_FF (Feed-Forward Dimension)**: This parameter specifies the dimensionality of the intermediate layer in the feed-forward network used within each Transformer block. Typically, this intermediate dimension is larger than the `EMBEDDING_DIM` to allow for more complex transformations.

- **NUM_HEADS**: The number of attention heads used in the multi-head attention mechanism. Each attention head allows the model to focus on different parts of the input sequence, enhancing its ability to capture dependencies and relationships within the data.

- **NUM_LAYERS**: The number of stacked Transformer encoder or decoder layers in the model. Increasing the number of layers can improve the model's capacity to learn complex patterns but may also increase computational cost and risk overfitting.

- **NUM_EPOCHS**: The number of epochs (full passes through the training data) used during model training. Each epoch involves one forward pass (computing predictions), calculating the loss, and updating the model's parameters (weights) through backpropagation.
    
   



In [None]:
# `vectorize_layer` always emits exactly SEQUENCE_LENGTH token ids, and the
# Flatten -> Dense head needs a fixed sequence length. Declaring it on the Input
# (instead of None) makes that contract explicit and rejects wrongly-sized input
# up front, rather than relying on broadcasting against the positional table.
encoder_input=Input(shape=(SEQUENCE_LENGTH,), dtype="int64", name="input")
x = Embeddings(SEQUENCE_LENGTH,VOCAB_SIZE,EMBEDDING_DIM)(encoder_input)

# Stack NUM_LAYERS encoder blocks.
for _ in range(NUM_LAYERS):
  x=TransformerEncoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x)

# Binary sentiment head: flatten all positions and predict one probability.
x = Flatten()(x)
output=Dense(1, activation="sigmoid")(x)

transformer = tf.keras.Model(
    encoder_input, output, name="transformer"
)
transformer.summary()


### Components of the Model:

1. **Input Layer** (`encoder_input`):
   - This layer defines the input shape of your model, where sequences of integers (token indices) are expected. The shape is `(batch_size, sequence_length)`.

2. **Embeddings Layer** (`Embeddings`):
   - This layer converts input token indices into dense vectors (`EMBEDDING_DIM` dimensions) using trainable embedding weights.
   - Parameters:
     - `SEQUENCE_LENGTH`: Length of input sequences.
     - `VOCAB_SIZE`: Size of the vocabulary (total number of unique tokens).
     - `EMBEDDING_DIM`: Dimensionality of the token embeddings.

3. **TransformerEncoder Blocks** (`TransformerEncoder`):
   - A loop (`for _ in range(NUM_LAYERS)`) that stacks `NUM_LAYERS` of `TransformerEncoder` layers.
   - Each `TransformerEncoder` layer processes the input sequence using self-attention and feed-forward networks, capturing dependencies within the sequence.
   - Parameters:
     - `EMBEDDING_DIM`: Dimensionality of the input embeddings.
     - `D_FF`: Dimensionality of the intermediate dense layer in the feed-forward network within each Transformer block.
     - `NUM_HEADS`: Number of attention heads in the multi-head attention mechanism.

4. **Flatten Layer** (`Flatten`):
   - This layer reshapes the output tensor from the last `TransformerEncoder` block into a 1D tensor.
   - Necessary for connecting the output to a `Dense` layer for classification.

5. **Output Layer** (`Dense` with `sigmoid` activation):
   - This `Dense` layer with a single unit and sigmoid activation function is commonly used for binary classification tasks.
   - It produces a scalar output (probability) representing the likelihood of the input sequence belonging to the positive class.

6. **Model Compilation and Summary**:
   - The model is defined using `tf.keras.Model`, specifying the input (`encoder_input`) and output (`output`) layers.
   - The model is not compiled in this cell; it is compiled later, in the Training section, with the Adam optimizer and a binary cross-entropy loss.
   - Finally, the model summary is displayed, showing the architecture and parameter counts.

###LSH *Attention*

In [None]:
def look_one_back(x):
  """Append to every chunk (axis 1) the contents of the chunk before it.

  The chunks are rotated by one along axis 1, so the first chunk is paired with
  the last one, and the rotated copy is concatenated to `x` along axis 2.
  """
  last_chunk=x[:,-1:,...]
  leading_chunks=x[:,:-1,...]
  previous_chunks=tf.concat([last_chunk,leading_chunks],axis=1)
  return tf.concat([x,previous_chunks],axis=2)

def sticker_look_one_back(x):
  """Append to every chunk of position indices (axis 1) the indices of the chunk before it.

  Mirrors `look_one_back`: the chunks are rotated by one along axis 1, so the
  first chunk is paired with the last, and the rotated copy is concatenated to
  `x` along the last axis.
  """
  # `x[:,-1:]` selects the last chunk. The previous `x[:-1:]` sliced the batch
  # axis instead, which made the concatenation below fail on mismatched shapes.
  x_extra=tf.concat([x[:,-1:],x[:,:-1]],axis=1)
  return tf.concat([x,x_extra],axis=-1)

def causal_masker(a,b):
  """Pairwise "is later than" mask between two sets of positions.

  Args:
    a: positions indexed as [i, p, k].
    b: positions indexed as [i, p, j].

  Returns:
    float32 tensor indexed as [i, p, j, k], equal to 1.0 where a[i, p, k] is
    strictly greater than b[i, p, j] and 0.0 elsewhere. To flag future keys in
    a [query, key] score matrix, pass the key positions as `a` and the query
    positions as `b`.
  """
  a,b=tf.cast(a,dtype=tf.float32),tf.cast(b,dtype=tf.float32)
  # Direct comparison instead of truncating b*(1/a) to an integer: that product
  # can round to just below 1 when the two positions are equal, which wrongly
  # flagged a position as being later than itself.
  later=tf.expand_dims(a,axis=-2)>tf.expand_dims(b,axis=-1)
  return tf.cast(later,dtype=tf.float32)

class LSHAttention(tf.keras.layers.Layer):
    """Chunked attention over hash-sorted positions (LSH attention).

    Queries are hashed with a random projection, positions are sorted by hash
    bucket, the sorted sequence is split into `bucket_size` equal chunks, and
    every chunk attends to itself and to the chunk before it.

    Args:
        bucket_size: used both as the number of hash buckets and as the number
            of chunks the sequence is split into; the sequence length has to be
            divisible by it.
        n_hashes: stored on the layer but not read by `call`.
    """
    def __init__(self,bucket_size=8,n_hashes=1):
        super(LSHAttention,self).__init__()
        self.n_hashes=n_hashes
        self.bucket_size=bucket_size

    def call(self,query,key,value,causal_masking=False):
        """Compute attention output in the original position order.

        Args:
            query: tensor indexed as (batch, position, feature).
            key: accepted but not read; the sorted queries are also used as keys.
            value: tensor with the same batch/position layout as `query`.
            causal_masking: when true, keys located after the query in the
                original sequence are masked out before the softmax.

        Returns:
            Tensor with the same batch and position dimensions as `query`.
        """
        # Hash every position: project onto bucket_size//2 random directions
        # and take the argmax over [xR, -xR].
        R=tf.random.normal((tf.shape(query)[0],tf.shape(query)[-1],self.bucket_size//2))
        xR=tf.matmul(query,R)
        concat_xR=tf.concat([xR,-xR],axis=-1)
        buckets=tf.math.argmax(concat_xR,axis=-1)

        # `sticker` lists original positions in bucket order; `undo_sort` inverts it.
        sticker=tf.argsort(buckets)
        undo_sort=tf.argsort(sticker)
        sorted_query=tf.gather(query,sticker,axis=1,batch_dims=1)
        sorted_value=tf.gather(value,sticker,axis=1,batch_dims=1)

        chunked_query=tf.stack(tf.split(sorted_query,self.bucket_size,1),1)
        chunked_value=tf.stack(tf.split(sorted_value,self.bucket_size,1),1)

        lb_chunked_query=look_one_back(chunked_query)
        lb_chunked_value=look_one_back(chunked_value)

        # score[b, chunk, i, j]: query i of the chunk against key j of (chunk + previous chunk).
        score=tf.einsum('bhie,bhje->bhij',chunked_query,lb_chunked_query)
        score/=tf.math.sqrt(tf.cast(query.shape[-1],tf.float32))

        if causal_masking:
            # Original positions of the queries and keys, in the same chunk layout as `score`.
            chunked_sticker=tf.stack(tf.split(sticker,self.bucket_size,1),1)
            new_sticker=sticker_look_one_back(chunked_sticker)
            # Key positions first, query positions second: the mask is then laid
            # out as [query, key] and is 1 where the key lies after the query.
            causal_mask=causal_masker(new_sticker,chunked_sticker)
            # Large negative penalty so masked keys get ~0 weight after the softmax.
            score+=causal_mask*-1e10
        score=tf.nn.softmax(score)
        output=tf.einsum('buij,buje->buie',score,lb_chunked_value)

        # Merge the chunks back into one sequence axis, then undo the bucket sort.
        sorted_output=tf.reshape(output,(tf.shape(output)[0],tf.shape(query)[1],output.shape[3]))
        output=tf.gather(sorted_output,undo_sort,axis=1,batch_dims=1)
        return output

This code defines a custom layer `LSHAttention` which implements Locality Sensitive Hashing (LSH) for attention computation. Let's break down the components and operations within this layer:

### Components Explained:

1. **Initialization**:
   - The `LSHAttention` layer is initialized with parameters `bucket_size` and `n_hashes`.
     - `bucket_size`: Number of buckets for hashing.
     - `n_hashes`: Number of hashing functions to use.

2. **Random Projections and Hashing**:
   - Random projections (`R`) are generated to transform the `query` tensor (`query`) into a lower-dimensional space (`(batch_size, sequence_length, bucket_size/2)`).
   - The `query` tensor is multiplied with `R` and concatenated with its negative counterpart to create `concat_xR`.
   - Bucket assignments (`buckets`) are obtained by finding the index of the maximum value along the last dimension of `concat_xR`.

3. **Sorting and Chunking**:
   - The bucket assignments (`buckets`) are sorted to rearrange the `query` and `value` tensors accordingly (`sorted_query` and `sorted_value`).
   - The sorted tensors are chunked into smaller segments (`chunked_query` and `chunked_value`) based on `bucket_size`.

4. **Applying Look-One-Back**:
   - The `sticker_look_one_back` function is applied to reorder the bucket assignments (`sticker`) for causal masking purposes.

5. **Calculating Scores and Softmax**:
   - Dot products (`score`) are computed between the chunked `query` tensors and their look-one-back counterparts.
   - Scores are divided by the square root of the query dimension for scaling (`tf.math.sqrt(tf.cast(query.shape[-1], tf.float32))`).
   - Softmax is applied along the last dimension of `score` to compute attention weights (`score=tf.nn.softmax(score)`).

6. **Masking (Optional)**:
   - If `causal_masking` is enabled (`True`), a causal mask is computed using the `causal_masker` function and applied to the attention scores (`score += causal_mask * -1e-10`).

7. **Weighted Sum**:
   - Weighted sum (`output`) is computed by performing matrix multiplication (`tf.einsum`) between the attention weights (`score`) and the chunked `value` tensors (`lb_chunked_value`).

8. **Reordering and Output**:
   - Reordering and reshaping are applied to the `output` tensor to restore the original order of elements (`sorted_output`).
   - The sorted `output` tensor is restored to its original order using `gather` operations (`output=tf.gather(sorted_output, undo_sort, axis=1, batch_dims=1)`).

### Usage Example:

To use the `LSHAttention` layer in a Transformer model, you can instantiate an instance of `LSHAttention` and call it with appropriate inputs (`query`, `key`, `value`) along with optional `causal_masking`.

```python
import tensorflow as tf

# Define input tensors (example shapes)
batch_size = 32
sequence_length = 16  # must be divisible by bucket_size: the layer splits the sequence into bucket_size equal chunks
embedding_dim = 128

query = tf.random.normal((batch_size, sequence_length, embedding_dim))
key = tf.random.normal((batch_size, sequence_length, embedding_dim))
value = tf.random.normal((batch_size, sequence_length, embedding_dim))

# Instantiate LSHAttention layer
lsh_attention = LSHAttention(bucket_size=8, n_hashes=1)

# Apply LSHAttention layer
attention_output = lsh_attention(query, key, value, causal_masking=True)

# Display the shape of the output
print(attention_output.shape)
```

In this example:
- We generate random input tensors (`query`, `key`, `value`) with specified shapes.
- We instantiate an `LSHAttention` layer (`lsh_attention`) with `bucket_size=8` and `n_hashes=1`.
- The `LSHAttention` layer is applied to the input tensors (`query`, `key`, `value`) with optional `causal_masking=True`.
- The resulting `attention_output` tensor represents the output of the attention mechanism applied by the `LSHAttention` layer.

#Training

In [None]:
# The checkpoint lives on Google Drive, so Drive must be mounted before anything
# is written there; on a fresh kernel the path below does not exist otherwise.
drive.mount('/content/drive')

checkpoint_filepath = '/content/drive/MyDrive/NLP Repository/Projects/Sentiments_analysis/transformer.h5'
# Create the target folder if it is missing so the first save does not fail.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

# Keep only the model from the epoch with the highest validation accuracy.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output; Adam with learning rate 1e-4.
transformer.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
# Train, validating after every epoch; the checkpoint callback saves the best epoch.
# NOTE(review): `epochs` is the literal 10 here; the NUM_EPOCHS constant (20) is not used.
history=transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[model_checkpoint_callback])

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

#Evaluation

In [None]:
# Restore the weights stored at the checkpoint path before evaluating.
transformer.load_weights(checkpoint_filepath)

In [None]:
# Vectorize and batch the held-out test split, then report loss and accuracy on it.
test_dataset=test_ds.map(vectorizer)
test_dataset=test_dataset.batch(BATCH_SIZE)
transformer.evaluate(test_dataset)

#Testing

In [None]:
# Two hand-written reviews for a qualitative check; each dataset element is a
# one-element list holding a single review string.
test_data=tf.data.Dataset.from_tensor_slices([["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! "],
                                              ["very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, so in all i didn't like this movie "],])


In [None]:
def vectorizer_test(review):
    """Label-free counterpart of `vectorizer`: convert a raw review to token ids."""
    return vectorize_layer(review)
# NOTE(review): this rebinds `test_dataset`, which held the labelled test split
# in the Evaluation section.
test_dataset=test_data.map(vectorizer_test)

In [None]:
# Sigmoid outputs for the two hand-written reviews.
transformer.predict(test_dataset)