#Installation

In [None]:
!pip install transformers datasets

#Import statements

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer)

In [None]:
BATCH_SIZE=8

#Data Preparation

In [None]:
dataset_id='imdb'
dataset = load_dataset(dataset_id)

In [None]:
dataset

In [None]:
dataset['train'][0]

##Bert Model

The "uncased" and "cased" BERT variants differ in how they handle the casing of text during pre-training and inference. Here's a breakdown of the differences between the `bert-base-uncased` and `bert-base-cased` models from Hugging Face:

### BERT-Uncased (`bert-base-uncased`):

- **Lowercase Text**:
  - The `bert-base-uncased` model is trained on uncased (lowercase) text during pre-training.
  - All input text is converted to lowercase (and accent markers are stripped) by the tokenizer before it reaches the model; the vocabulary itself contains only lowercase entries.
  - Example: "Hello, World!" → "hello, world!"

- **Vocabulary**:
  - The vocabulary of the `bert-base-uncased` model consists of lowercase words only.
  - This simplifies tokenization and reduces the number of unique tokens by treating uppercase and lowercase versions of the same word as identical.

- **Usage**:
  - Use `bert-base-uncased` for tasks where the distinction between uppercase and lowercase letters is not crucial, such as general-purpose text processing or sentiment analysis.

### BERT-Cased (`bert-base-cased`):

- **Preserves Case**:
  - The `bert-base-cased` model preserves the case (uppercase and lowercase) of text during pre-training and inference.
  - Text is tokenized and processed without converting to lowercase, retaining the original casing of words.
  - Example: "Hello, World!" → "Hello, World!"

- **Vocabulary**:
  - The vocabulary of the `bert-base-cased` model includes both lowercase and uppercase versions of words.
  - This allows the model to differentiate between words based on their casing, capturing nuances related to proper nouns and acronyms.

- **Usage**:
  - Use `bert-base-cased` for tasks where the distinction between uppercase and lowercase letters is important, such as named entity recognition (NER) or tasks involving proper nouns and specific text formatting.

### Choosing Between BERT-Uncased and BERT-Cased:

- **Task Requirements**:
  - Consider the nature of your NLP task and whether it requires sensitivity to letter casing.
  - Tasks involving proper nouns, named entities, or specific text formatting may benefit from `bert-base-cased`.
  - For general-purpose tasks or scenarios where case sensitivity is less critical, `bert-base-uncased` can be a suitable choice.

- **Model Performance**:
  - Evaluate both variants on your specific task to determine which model performs better based on accuracy, precision, and other metrics.
  - Sometimes, the choice between cased and uncased models can impact model performance depending on the characteristics of the dataset.

- **Pre-trained Model Selection**:
  - Hugging Face offers both `bert-base-uncased` and `bert-base-cased` models pre-trained on large corpora.
  - Choose the pre-trained model that aligns with your task requirements and data characteristics.

In summary, the choice between `bert-base-uncased` and `bert-base-cased` depends on whether your NLP task benefits from case sensitivity and how text casing impacts the performance of your models.

In [None]:
model_id="bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)

In [None]:
tokenizer.is_fast

In [None]:
# Two sample sentences (mixed casing, an accented character, an unusual
# string) used to inspect what the tokenizer produces.
test_input_1 = 'The Weather of Today is Gréat! zwp'
test_input_2 = 'How are you doing?'
inputs = [test_input_1, test_input_2]

# `tokenize` is documented for a single string, so each sentence is tokenized
# on its own; handing it the whole list is outside that contract and does not
# give one token list per sentence.
[tokenizer.tokenize(sentence) for sentence in inputs]

In [None]:
output=tokenizer(inputs,padding=True,truncation=True,max_length=128)
print(output)

In [None]:
tokenizer.decode(output['input_ids'][0])

In [None]:
tokenizer.decode(output['input_ids'][1])

In [None]:
def preprocess_function(examples):
  """Tokenize the ``"text"`` field of ``examples`` with the global ``tokenizer``.

  Padding and truncation are both switched on; whatever the tokenizer
  returns is handed back unchanged.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, padding=True, truncation=True)
  return encoded

The `preprocess_function` is a Python function designed to preprocess text examples using a tokenizer, typically for input into a natural language processing (NLP) model such as BERT. This function applies tokenization, padding, and truncation to the text data. Let's break down the components and explain how this function works:

### Explanation of `preprocess_function`:

```python
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True
    )
```

- **Input**:
  - `examples`: This parameter represents a batch of input examples, where each example contains a `"text"` field that needs preprocessing.

- **Tokenization**:
  - `tokenizer(examples["text"])`: The function uses a tokenizer (assumed to be defined elsewhere) to tokenize the text data.
  - The tokenizer breaks down each input text into tokens according to its vocabulary and tokenization rules.

- **Padding**:
  - `padding=True`: Pads every tokenized sequence to the length of the longest sequence in the batch passed to the tokenizer, so all examples in that batch end up with the same length.
  - Padding is often necessary for batching sequences and is achieved by appending special tokens (like `[PAD]`) to shorter sequences.

- **Truncation**:
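  - `truncation=True`: Truncates tokenized sequences that exceed the maximum length — `max_length` when it is given, otherwise the maximum input length the model accepts (as here, since no `max_length` is passed).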
  - Truncation is used to limit the length of tokenized sequences, ensuring they fit within the model's input size constraints.

### Usage Scenario:

- **Batch Processing**:
  - The `preprocess_function` is typically used in conjunction with a dataset or data pipeline (e.g., using Hugging Face Datasets library or TensorFlow `Dataset.map()` method).
  - It processes batches of input examples, where each example contains text data to be tokenized, padded, and possibly truncated.

- **Integration with Tokenizer**:
  - The function assumes the existence of a `tokenizer` object initialized elsewhere.
  - The `tokenizer` performs the actual tokenization process, converting input text into tokens recognizable by the NLP model.

- **Batch Output**:
  - The output of `preprocess_function` is a dictionary or structure suitable for input to an NLP model.
  - Each example in the batch is transformed into tokenized sequences with consistent length (due to padding) and possibly truncated to fit model input requirements.

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
tokenized_dataset

In [None]:
# Convert the tokenized "train" split into a shuffled TensorFlow dataset that
# yields batches of BATCH_SIZE examples containing the listed columns.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)


### Code Explanation:

```python
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE
)
```

- **`to_tf_dataset()` Method**:
  - The `to_tf_dataset()` method is used to convert a Hugging Face `Dataset` object (`tokenized_dataset["train"]`) into a TensorFlow dataset (`tf.data.Dataset`).

- **Dataset Columns**:
  - `columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']`: Specifies the dataset columns to include in the TensorFlow dataset.
    - `input_ids`: Contains the token IDs representing input text sequences.
    - `token_type_ids`: Represents the segment IDs (e.g., for sentence pair classification tasks).
    - `attention_mask`: Indicates which tokens should be attended to (1 for tokens, 0 for padding).
    - `label`: Contains the target labels associated with the input sequences (e.g., sentiment labels).

- **Shuffling and Batch Size**:
  - `shuffle=True`: Enables shuffling of the dataset to randomize the order of examples during training.
  - `batch_size=BATCH_SIZE`: Sets the batch size for the TensorFlow dataset, determining the number of examples processed together in each training step.




In [None]:
# Same conversion for the "test" split, which is used as validation data when
# the model is fitted. Note that this split is shuffled as well.
tf_val_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)

In [None]:
def swap_positions(dataset):
  """Split a feature/label mapping into a ``(features, label)`` pair.

  ``features`` holds the ``input_ids``, ``token_type_ids`` and
  ``attention_mask`` entries of ``dataset``; ``label`` is its ``label`` entry.
  """
  feature_keys = ('input_ids', 'token_type_ids', 'attention_mask')
  features = {key: dataset[key] for key in feature_keys}
  return features, dataset['label']

The `swap_positions` function transforms a dataset dictionary into the format expected for training or inference with a Keras model. It splits the dictionary into a `(features, label)` tuple: the model inputs stay together in a dictionary, and the label is returned separately as the target.
### Code Explanation:

```python
def swap_positions(dataset):
    return {
        'input_ids': dataset['input_ids'],
        'token_type_ids': dataset['token_type_ids'],
        'attention_mask': dataset['attention_mask'],
    }, dataset['label']
```

- **Input**:
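  - `dataset`: One element of the dataset the function is mapped over, in dictionary format containing input features (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`) and corresponding labels (`'label'`). In this notebook the function is mapped over datasets that were already batched by `to_tf_dataset`, so each element holds a whole batch; it works the same way on a single example.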

- **Output**:
  - The function returns a tuple containing two elements:
    1. A dictionary (`features_dict`) containing input features (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`).
    2. The corresponding label (`'label'`).

- **Dictionary Structure**:
  - The `features_dict` contains specific keys (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`) extracted from the input `dataset`.
  - Each key corresponds to a specific input feature required for the machine learning model.

- **Label Extraction**:
  - The label (`'label'`) is extracted directly from the input `dataset` and returned as the second element of the tuple.

### Usage Scenario:

- **Dataset Processing**:
  - Use `swap_positions` to process individual examples from a dataset, preparing them for model training or inference.
  - The function extracts necessary input features (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`) and organizes them into a dictionary (`features_dict`).

- **Model Input Format**:
  - This function is useful for adapting dataset structures to match the expected input format of machine learning models.
  - For example, the output tuple can directly serve as input to a TensorFlow model with the appropriate keys (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`) and label (`'label'`).

### Example Usage:

```python
# Assuming `dataset` is a single example from a tokenized dataset
example = {
    'input_ids': [101, 2023, 2003, 1037, 2143, 1029],
    'token_type_ids': [0, 0, 0, 0, 0, 0],
    'attention_mask': [1, 1, 1, 1, 1, 1],
    'label': 1
}

# Apply `swap_positions` function to process the example
processed_example = swap_positions(example)

# Processed example output
print(processed_example)
# Output: ({'input_ids': [101, 2023, 2003, 1037, 2143, 1029], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}, 1)
```

In this example, `swap_positions` processes a single example (`example`) by extracting input features (`'input_ids'`, `'token_type_ids'`, `'attention_mask'`) and the corresponding label (`'label'`). The processed example (`processed_example`) is then ready for use as input to a machine learning model.

In [None]:
tf_train_dataset=tf_train_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)
tf_val_dataset=tf_val_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)

In [None]:
for i in tf_train_dataset.take(1):
  print(i)

In [None]:
tf_val_dataset

##Data Preparation for Roberta Model

In [None]:
model_id="roberta-base"
tokenizer=RobertaTokenizerFast.from_pretrained(model_id)

In [None]:
def preprocess_function(examples):
  """Run the global ``tokenizer`` over ``examples["text"]``.

  The call enables both padding and truncation and returns the tokenizer's
  result as-is.
  """
  return tokenizer(
      examples["text"],
      padding=True,
      truncation=True,
  )

In [None]:
# Tokenize in batches, as the BERT and XtremeDistil preparation cells do.
# Without `batched=True` the tokenizer sees one text per call, so
# `padding=True` (pad to the longest sequence of the call) pads nothing and
# the examples keep different lengths.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
tokenized_dataset['train'][0]

In [None]:
tokenized_dataset

In [None]:
# Shuffled, batched TensorFlow dataset built from the tokenized "train" split.
# Only `input_ids`, `attention_mask` and `label` are selected here (no
# `token_type_ids` column, unlike the BERT cells).
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids','attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)

In [None]:
# Same conversion for the "test" split, used as validation data; it is
# shuffled as well.
tf_val_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids','attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)

In [None]:
def swap_positions(dataset):
  """Return ``(features, label)`` built from a feature/label mapping.

  ``features`` carries the ``input_ids`` and ``attention_mask`` entries of
  ``dataset``; ``label`` is its ``label`` entry.
  """
  features = {key: dataset[key] for key in ('input_ids', 'attention_mask')}
  target = dataset['label']
  return features, target

In [None]:
tf_train_dataset=tf_train_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)
tf_val_dataset=tf_val_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)

In [None]:
for i in tf_train_dataset.take(1):
  print(i)

##Data Preparation for XtremeDistill Model

In [None]:
model_id="microsoft/xtremedistil-l6-h256-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)

In [None]:
tokenizer.is_fast

In [None]:
def preprocess_function(examples):
  """Tokenize ``examples["text"]`` with the global ``tokenizer``.

  Sequences are capped at 512 tokens, with padding and truncation enabled;
  the tokenizer's result is returned unchanged.
  """
  tokenizer_options = dict(max_length=512, padding=True, truncation=True)
  return tokenizer(examples["text"], **tokenizer_options)

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
tokenized_dataset

In [None]:
# Shuffled, batched TensorFlow dataset built from the tokenized "train" split,
# keeping the three model inputs plus the label column.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)

In [None]:
# Same conversion for the "test" split, used as validation data and later as
# the source of the evaluation samples; it is shuffled as well.
tf_val_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator  (disabled; no `data_collator` is created in the cells shown)
)

In [None]:
def swap_positions(dataset):
  """Separate the model inputs of ``dataset`` from its label.

  Returns a 2-tuple: a dict with the ``input_ids``, ``token_type_ids`` and
  ``attention_mask`` entries, followed by the ``label`` entry.
  """
  model_inputs = {}
  for name in ('input_ids', 'token_type_ids', 'attention_mask'):
    model_inputs[name] = dataset[name]
  return model_inputs, dataset['label']

In [None]:
tf_train_dataset=tf_train_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)
tf_val_dataset=tf_val_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)

In [None]:
for i in tf_val_dataset.take(1):
  print(i)

In [None]:
tf_val_dataset

#Modeling


##Model Building With TFBertForSequenceClassification

In [None]:
# One logit per sentiment class, matching the other classification cells
# (num_labels=2) and the argmax-based evaluation later in the notebook.
# num_labels=1 would build a single-output head instead.
model=TFBertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)
model.summary()

##Model Building With XtremeDistillForSequenceClassification

In [None]:
model=TFBertForSequenceClassification.from_pretrained(model_id,num_labels=2)
model.summary()

##TFBertModel

In [None]:
model=TFBertModel.from_pretrained("bert-base-uncased")
model.summary()

In [None]:
# Symbolic Keras inputs for fixed-length sequences of 512 token positions.
input_ids = Input(shape=(512,), dtype=tf.int64, name='input_ids')
token_type_ids = Input(shape=(512,), dtype=tf.int64, name='token_type_ids')
attention_mask = Input(shape=(512,), dtype=tf.int64, name='attention_mask')

# Feed the encoder a name-keyed dict rather than a positional list: a list is
# interpreted in the model's documented argument order, which is not the
# order used here, so naming every tensor removes any chance of a mix-up
# between `token_type_ids` and `attention_mask`.
x = model({'input_ids': input_ids,
           'token_type_ids': token_type_ids,
           'attention_mask': attention_mask})
print(x)
# Take position 0 of every sequence from the first encoder output and put a
# small dense head with a single sigmoid unit on top of it.
x = Dense(128, activation='relu')(x[0][:, 0, :])
output = Dense(1, activation='sigmoid', name='label')(x)

custom_bert = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=output)

In [None]:
custom_bert.summary()

## Modeling with TFRobertaForSequenceClassification

In [None]:
# Load a RoBERTa sequence-classification model with 2 labels from the
# checkpoint named by `model_id`, then print its layer summary.
# NOTE(review): `model_id` is set to "roberta-base" in the RoBERTa data
# preparation cell but is reassigned to the XtremeDistil checkpoint further
# down — confirm it holds the RoBERTa id when this cell is run.
model=TFRobertaForSequenceClassification.from_pretrained(model_id,num_labels=2)
model.summary()

#Training

In [None]:
# Number of optimizer steps used to size the learning-rate schedule:
# full batches per pass over the tokenized "train" split, times the epochs.
num_epochs = 3
batches_per_epoch = len(tokenized_dataset["train"]) // BATCH_SIZE
# NOTE(review): the fit cell trains on `tf_train_dataset.take(1000)`, so fewer
# steps may actually run than are counted here — confirm this is intended.
total_train_steps = int(batches_per_epoch * num_epochs)

In [None]:
optimizer, schedule = create_optimizer(init_lr=2e-5,num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
# The sequence-classification models return raw, unnormalised logits (one per
# class) and the dataset labels are integer class ids, so the matching loss is
# sparse categorical cross-entropy computed from logits. Binary cross-entropy
# with its default `from_logits=False` would treat the logits as probabilities.
# This expects a head with one logit per class (num_labels=2).
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    metrics=['accuracy'],
    # run_eagerly=True,  # enable to debug the training step
)

In [None]:
# Train on the first 1000 training batches and validate on the validation
# dataset after every epoch. The epoch count comes from `num_epochs`, the same
# value that sizes the learning-rate schedule, so the two cannot drift apart.
history=model.fit(
    tf_train_dataset.take(1000),
    validation_data=tf_val_dataset,
    epochs=num_epochs,)

In [None]:
# Plot the training and validation loss recorded per epoch by `model.fit`.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# Legend entries follow the order of the two plot calls above.
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Plot the training and validation accuracy recorded per epoch by `model.fit`.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# Legend entries follow the order of the two plot calls above.
plt.legend(['train', 'val'], loc='upper left')
plt.show()

#Testing

In [None]:
# Tokenize two hand-written reviews as one padded batch of TensorFlow tensors.
inputs = tokenizer(["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! ",
                    "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so over "], padding=True,return_tensors="tf")

# Run the model on the batch and show the `logits` field of its output.
logits = model(**inputs).logits
print(logits)


#Conversion to Onnx Format

##Installation

In [None]:
!pip install -U tf2onnx
!pip install onnxruntime

In [None]:
import onnxruntime as rt
import tf2onnx
rt.get_device()

##From Keras Model

In [None]:
output_path = "/content/drive/MyDrive/NLP Repository/Projects/Sentiments_analysis/xtremedistill.onnx"

In [None]:
# Input signature for the exported graph: three int64 tensors with a free
# first dimension and a fixed second dimension of 512.
# NOTE(review): this lists `token_type_ids`, which the RoBERTa data cells do
# not produce — presumably this export targets the BERT/XtremeDistil model; confirm.
spec = [tf.TensorSpec((None,512),tf.int64, name="input_ids"),
        tf.TensorSpec((None,512),tf.int64, name="token_type_ids"),
        tf.TensorSpec((None,512),tf.int64, name="attention_mask")]

# Convert the Keras model to ONNX (opset 17) and write it to `output_path`.
model_proto, _ = tf2onnx.convert.from_keras(
    model, input_signature=spec,
    opset=17, output_path=output_path,)
# Names of the graph outputs; the inference cells request the output "logits".
output_names = [n.name for n in model_proto.graph.output]

In [None]:
print(output_names)

##Inference

###Benchmarking Onnx

In [None]:
# Single review used as the benchmark input.
text=["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes!"]

# Alternative three-review batch, kept disabled:
# text = ["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! ",
#                     "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so over ",
#                     "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so overall i will give this movie a pass "]


# Pad/truncate to exactly 512 tokens (the fixed length in the ONNX input
# signature) and return NumPy arrays.
inputs = tokenizer(text,padding='max_length',max_length=512,truncation=True,return_tensors="np")

# Number of timed repetitions used by the benchmark cells.
N_PREDICTIONS = 1
print(inputs)

In [None]:
# Load the exported model on the CPU execution provider.
providers = ['CPUExecutionProvider']
m = rt.InferenceSession(output_path, providers=providers)

# Build the input feed once, outside the timed loop, so only inference is
# measured.
onnx_feed = {'input_ids': inputs['input_ids'],
             'token_type_ids': inputs['token_type_ids'],
             'attention_mask': inputs['attention_mask']}

# perf_counter is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
for _ in range(N_PREDICTIONS):
  onnx_pred = m.run(["logits"], onnx_feed)
print("Time for a single Prediction", (time.perf_counter() - t1)/N_PREDICTIONS)

In [None]:
print(onnx_pred)

###Benchmarking TF

In [None]:
# Time N_PREDICTIONS forward passes of the TensorFlow model on the same
# inputs as the ONNX benchmark.
t1 = time.perf_counter()
for _ in range(N_PREDICTIONS):
  logits = model(**inputs).logits
# Stop the clock before printing so console output is not counted as
# inference time.
elapsed = time.perf_counter() - t1
print(logits)
print("Time for a single Prediction", elapsed/N_PREDICTIONS)

#Quantization with Onnx

In [None]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

In [None]:
model_fp32 = '/content/drive/MyDrive/NLP Repository/Projects/Sentiments_analysis/xtremedistill.onnx'
model_quant = '/content/drive/MyDrive/NLP Repository/Projects/Sentiments_analysis/xtremedistill_quantized.onnx'

In [None]:
quantized_model = quantize_dynamic(model_fp32, model_quant, weight_type = QuantType.QUInt8)

##Accuracy Drop due to Quantization

In [None]:
unbatched_val_dataset=tf_val_dataset.unbatch()

In [None]:
N_SAMPLES=1024

In [None]:
def accuracy(model):
  """Return the top-1 accuracy, in percent, of an ONNX Runtime session.

  Iterates up to ``N_SAMPLES`` ``(features, label)`` pairs from the global
  ``unbatched_val_dataset``, runs each one through ``model`` as a batch of
  one, and compares the argmax of the ``"logits"`` output with the label.

  Args:
    model: object exposing ``run(output_names, input_feed)``; it must accept
      the inputs ``input_ids``, ``token_type_ids`` and ``attention_mask`` and
      provide an output named ``"logits"``.

  Returns:
    Percentage of correctly classified samples among those actually
    evaluated; ``0.0`` when the dataset yields no samples.
  """
  correct = 0
  evaluated = 0
  for text, label in unbatched_val_dataset.take(N_SAMPLES):
    # Wrapping each array in a list adds the leading batch axis of size 1.
    feed = {'input_ids': [text['input_ids'].numpy()],
            'token_type_ids': [text['token_type_ids'].numpy()],
            'attention_mask': [text['attention_mask'].numpy()]}
    logits = model.run(["logits"], feed)[0]
    evaluated += 1
    if np.argmax(logits, axis=-1)[0] == label.numpy():
      correct += 1
  # Divide by the number of samples seen, not by N_SAMPLES: the dataset may
  # hold fewer than N_SAMPLES elements.
  if evaluated == 0:
    return 0.0
  return (correct/evaluated)*100

In [None]:
# Load the float32 and the quantized ONNX models on the CPU provider and
# print the accuracy of each (quantized model first, then float32).
# NOTE(review): the validation dataset was built with shuffle=True, so the two
# accuracy() calls may be evaluated on different samples — confirm whether a
# fixed sample set is wanted for this comparison.
providers=['CPUExecutionProvider']
m = rt.InferenceSession(model_fp32, providers=providers)
m_q = rt.InferenceSession(model_quant, providers=providers)
print(accuracy(m_q))
print(accuracy(m))

#Temperature in Distillation

In [None]:
import numpy as np

In [None]:
def softmax(logits,T):
  """Return the temperature-scaled softmax of ``logits``.

  Each probability is ``exp(logit / T)`` divided by the sum of those
  exponentials over all logits.

  Args:
    logits: sequence of numbers.
    T: temperature; must be non-zero. Larger values give a flatter
      distribution.

  Returns:
    A list with one probability per logit (empty for empty input).

  Raises:
    ZeroDivisionError: if ``T`` is zero and ``logits`` is not empty.
  """
  scaled = np.asarray(logits, dtype=np.float64)
  if scaled.size == 0:
    return []
  if T == 0:
    raise ZeroDivisionError("temperature T must be non-zero")
  scaled = scaled / T
  # Subtracting the maximum leaves the result unchanged mathematically but
  # keeps exp() from overflowing on large logits.
  weights = np.exp(scaled - np.max(scaled))
  return list(weights / np.sum(weights))

In [None]:
logits=[10,13,17,5]


In [None]:
# Print the softmax of the same logits at increasing temperatures T. Because
# every logit is divided by T before exponentiation, a larger T shrinks the
# differences between logits and gives a flatter distribution.
print("For T=1 ------>",softmax(logits,1))
print("For T=2 ------>",softmax(logits,2))
print("For T=3 ------>",softmax(logits,3))
print("For T=5 ------>",softmax(logits,5))
print("For T=10 ----->",softmax(logits,10))
print("For T=10000 -->",softmax(logits,10000))