# Installation

In [None]:
!pip install transformers datasets

# Import Statements

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import (BinaryAccuracy, FalsePositives, FalseNegatives, TruePositives,
                                       TrueNegatives, Precision, Recall, AUC, binary_accuracy,Accuracy,
                                       TopKCategoricalAccuracy, CategoricalAccuracy,SparseCategoricalAccuracy)
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer,TFDebertaForSequenceClassification,DebertaTokenizerFast)

In [None]:
# Number of examples per batch; read when building the tf.data pipeline and when sizing the schedule.
BATCH_SIZE = 16

# Data Preparation

In [None]:
# Install the Kaggle CLI and register the API token (kaggle.json must already be in the working directory).
!pip install -q kaggle
# -p keeps the cell re-runnable: plain mkdir errors out when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Use ~ here as well, so the permission change targets the file that was just copied
# regardless of which user's home directory that is.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d bitext/training-dataset-for-chatbotsvirtual-assistants
# -o overwrites already-extracted files instead of prompting interactively on a re-run.
!unzip -o "/content/training-dataset-for-chatbotsvirtual-assistants.zip" -d "/content/dataset/"

In [None]:
# Read the unzipped Bitext sample CSV into a Hugging Face dataset object.
dataset = load_dataset(
    "csv",
    data_files="/content/dataset/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv",
)

In [None]:
dataset

In [None]:
dataset['train'][0]

In [None]:
# Build the intent-name -> integer-id mapping used as classification labels.
# sorted() pins the order: iterating a bare set of strings depends on per-process hash
# randomization, so the ids would otherwise change between kernel restarts and stop
# matching any model trained in an earlier session.
intents = sorted(set(dataset['train']['intent']))
dict_intents = {intent: idx for idx, intent in enumerate(intents)}
print(dict_intents)
print(len(intents))

In [None]:
def preprocess(dataset):
  """Replace one example's intent name with its integer id.

  Args:
    dataset: a mapping with 'utterance' and 'intent' keys (one example).

  Returns:
    A dict holding the unchanged 'utterance' and, under 'intent', the id
    looked up in the module-level `dict_intents`.

  Raises:
    KeyError: if the intent name is not a key of `dict_intents`.
  """
  utterance = dataset['utterance']
  intent_id = dict_intents[dataset['intent']]
  return {'utterance': utterance, 'intent': intent_id}

In [None]:
# Run preprocess over every example so that 'intent' holds integer ids.
prep_dataset = dataset.map(preprocess)

In [None]:
prep_dataset['train'][0]

In [None]:
# Checkpoint name; the tokenizer is loaded from it here.
model_id = "microsoft/deberta-base"
tokenizer = DebertaTokenizerFast.from_pretrained(model_id)

In [None]:
def tokenizer_function(dataset):
  """Tokenize the 'utterance' field of the given example.

  The module-level `tokenizer` is called with the utterance only; no padding
  or truncation arguments are passed here.

  Args:
    dataset: a mapping with an 'utterance' key.

  Returns:
    Whatever `tokenizer` returns for that utterance.
  """
  utterance = dataset["utterance"]
  return tokenizer(utterance)

In [None]:
# Tokenize every example of the preprocessed dataset.
tokenized_dataset = prep_dataset.map(tokenizer_function)

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['train'][0]

In [None]:
# Collator that pads each batch with the tokenizer and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

The `DataCollatorWithPadding` class from the Hugging Face `transformers` library is designed to facilitate batch creation and padding for tokenized datasets. This class is particularly useful for preparing batches of tokenized inputs suitable for model training. Let's break down the provided usage of `DataCollatorWithPadding`:

### Code Explanation:

```python
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
```

- **Initialization**:
  - `DataCollatorWithPadding` is initialized with the following parameters:
    - `tokenizer`: This parameter is a tokenizer object from Hugging Face Transformers (`tokenizer=tokenizer`).
      - The tokenizer is used to handle tokenization and padding of input sequences.
    - `return_tensors="tf"`: Specifies the format of the returned tensors.
      - `return_tensors="tf"` indicates that the returned batch should be in TensorFlow `tf.Tensor` format.


In [None]:
# Turn the tokenized 'train' split into a shuffled, batched tf.data pipeline.
# Only the listed columns are kept; the collator pads each batch.
tf_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "intent"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
def swap_positions(dataset):
  """Split a batch mapping into a (features, label) pair.

  Args:
    dataset: a mapping with 'input_ids', 'attention_mask' and 'intent' keys.

  Returns:
    A 2-tuple: a dict with only 'input_ids' and 'attention_mask', followed
    by the value stored under 'intent'.
  """
  features = {key: dataset[key] for key in ('input_ids', 'attention_mask')}
  return features, dataset['intent']

In [None]:
# Reshape every batch into the (features, label) pair produced by swap_positions.
tf_dataset = tf_dataset.map(swap_positions)

In [None]:
# Split at the example level BEFORE batching and shuffling. take()/skip() on a pipeline
# built with shuffle=True is unsafe: the pipeline is re-shuffled on every pass, so the
# "validation" batches would differ each epoch and overlap with batches already trained on.
# The fixed seed makes the 90/10 split reproducible.
split = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)

train_dataset = split["train"].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'intent'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
).map(swap_positions)

# Validation is not shuffled: a fixed order keeps evaluation results reproducible.
val_dataset = split["test"].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'intent'],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
).map(swap_positions)

In [None]:
# Print a single validation batch to inspect its (features, label) structure.
for sample_batch in val_dataset.take(1):
  print(sample_batch)

# Modeling

## With TFDebertaForSequenceClassification

In [None]:
# Load the checkpoint with a classification head that has one output per intent.
model = TFDebertaForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(intents),
)
model.summary()

# Training

In [None]:
num_epochs = 2
# Count the batches that fit() actually consumes. `train_dataset` is only the training
# share of the data, so deriving the step count from the full tokenized dataset would
# make the learning-rate schedule longer than the real training run.
batches_per_epoch = len(train_dataset)
total_train_steps = batches_per_epoch * num_epochs

In [None]:
# Build the optimizer and its learning-rate schedule over total_train_steps, with no warmup.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
# No loss argument is given to compile().
# NOTE(review): presumably this relies on the model's built-in loss; confirm.
model.compile(optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Pass num_epochs rather than a literal so the training length stays in sync with the
# step count the learning-rate schedule was built for.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs,
)

In [None]:
# Training and validation loss per epoch; curves are drawn in the same order as the legend entries.
for curve_key in ('loss', 'val_loss'):
  plt.plot(history.history[curve_key])
plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training and validation accuracy per epoch; curves are drawn in the same order as the legend entries.
for curve_key in ('accuracy', 'val_accuracy'):
  plt.plot(history.history[curve_key])
plt.title('model_accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# Evaluation

In [None]:
# Per-batch logits and label arrays gathered over the validation set.
predicted = []
labels = []

# The loop variable is `batch_inputs`, not `input`: naming it `input` would shadow the
# Python builtin for every later cell in the notebook.
for batch_inputs, batch_labels in val_dataset:
  predicted.append(model(**batch_inputs).logits)
  labels.append(batch_labels.numpy())

This cell runs the trained model (`model`) over the validation dataset (`val_dataset`) and collects its predictions alongside the ground-truth labels so that the two can be compared.

### Code Explanation:

1. **Iteration over Validation Dataset**:
   - Iterate over the validation dataset (`val_dataset`) to obtain batches of inputs (`input`) and corresponding labels (`label`).

```python
predicted = []
labels = []

for inputs, labels_batch in val_dataset:
    # Process each batch of inputs and labels
    logits = model(**inputs).logits  # the model returns an output object; its .logits field holds the class scores
    predicted.append(logits)
    labels.append(labels_batch.numpy())  # Convert labels to numpy array for easier handling
```

2. **Model Prediction**:
   - Use the trained model (`model`) to obtain predictions (`logits`) for each batch of inputs (`inputs`).
   - Append the predictions (`logits`) to the `predicted` list.
   - Extract the labels (`labels_batch`) from the dataset batch and convert them to a numpy array using `.numpy()` for comparison.

3. **Data Handling**:
   - Store the model predictions (`predicted`) and ground truth labels (`labels`) for further evaluation.



In [None]:
# Dump the per-batch logits and label arrays collected by the evaluation loop.
print(predicted)
print(labels)

In [None]:
# Predicted class ids and labels for every batch except the last.
# NOTE(review): the last batch is presumably left out because it can be smaller than the
# others and so cannot be stacked with them; confirm.
print(tf.argmax(predicted[:-1],axis=-1).numpy())
print(labels[:-1])

In [None]:
# Flattened view over all batches: first line is the labels, second is the arg-max predictions.
# The last batch is flattened separately and appended to the rest.
print(np.concatenate([np.array(labels[:-1]).flatten(),np.array(labels[-1]).flatten()]))
print(np.concatenate([np.argmax(predicted[:-1], axis = -1).flatten(), np.argmax(predicted[-1], axis = -1).flatten()]))



### Code Explanation:

1. **Concatenation and Flattening**:
   - Use `np.concatenate` to combine the lists of arrays (`labels[:-1]`) and the last array (`labels[-1]`) into a single numpy array.
   - Similarly, concatenate the lists of arrays of predicted labels (`predicted[:-1]`) and the last array of predicted labels (`predicted[-1]`).
   - Flatten the resulting concatenated arrays to obtain a 1D array of labels and predicted labels.

### Revised Code:

```python
import numpy as np

# Concatenate and flatten the ground truth labels (labels) for the entire validation dataset
true_labels_concatenated = np.concatenate([np.array(labels[:-1]).flatten(), np.array(labels[-1]).flatten()])

# Concatenate and flatten the predicted labels (predicted) for the entire validation dataset
predicted_labels_concatenated = np.concatenate([np.argmax(predicted[:-1], axis=-1).flatten(), np.argmax(predicted[-1], axis=-1).flatten()])

# Print the concatenated and flattened arrays for comparison
print("True Labels:")
print(true_labels_concatenated)

print("Predicted Labels:")
print(predicted_labels_concatenated)
```

### Notes:

- **Concatenation**:
  - Use `np.concatenate` to combine the lists of arrays (`labels[:-1]`) and the last array (`labels[-1]`) into a single contiguous array (`true_labels_concatenated`).
  - Similarly, concatenate the lists of arrays of predicted labels (`predicted[:-1]`) and the last array of predicted labels (`predicted[-1]`) into `predicted_labels_concatenated`.

- **Flattening**:
  - Ensure that each array within `labels` and `predicted` is flattened using `.flatten()` before concatenating to obtain a 1D array.

- **Comparison**:
  - Print the concatenated and flattened arrays (`true_labels_concatenated` and `predicted_labels_concatenated`) to compare the ground truth labels with the predicted labels for the entire validation dataset.


In [None]:
# Ground-truth ids go in `lab` and arg-max predictions in `pred`, so that
# confusion_matrix(lab, pred) receives (y_true, y_pred) in the order scikit-learn expects
# and the heatmap rows are the actual classes.
# Flattening batch by batch (instead of stacking all-but-the-last batch into one array)
# works for any mix of batch sizes, including a single batch.
lab = np.concatenate([np.asarray(batch_labels).ravel() for batch_labels in labels])
pred = np.concatenate([np.argmax(batch_logits, axis=-1).ravel() for batch_logits in predicted])


Here's a breakdown of what `labels[:-1]` and `labels[-1]` represent:

- `labels[:-1]`: This syntax retrieves all elements in the `labels` list except for the last element. It effectively slices the list from the beginning up to (but not including) the last element.
  
- `labels[-1]`: This syntax retrieves the last element in the `labels` list.

### Usage Example:

Let's assume `labels` is a list containing arrays of labels (e.g., numpy arrays of ground truth labels) corresponding to batches of data. We can demonstrate the usage of `labels[:-1]` and `labels[-1]` with a simple example:

```python
import numpy as np

# Example list of numpy arrays (simulating ground truth labels for batches)
labels = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9])]

# Access all elements except the last one (slice from beginning to last element - 1)
all_but_last = labels[:-1]
print("All but last element:", all_but_last)

# Access the last element in the list
last_element = labels[-1]
print("Last element:", last_element)
```

In this example:

- `all_but_last` contains all elements of `labels` except for the last one. It would be equivalent to `[np.array([1, 2, 3]), np.array([4, 5])]`.
  
- `last_element` contains the last element of `labels`, which is `np.array([6, 7, 8, 9])`.

### Correcting the Previous Code:

Treating `labels[:-1]` and `labels[-1]` separately is only necessary when the batches are first stacked with `np.array(...)`, which requires every stacked batch to have the same length. When each batch is a 1-D array, the whole list can be concatenated directly; the slice-based form below gives the same result as `np.concatenate(labels)`:

```python
# Example usage of labels[:-1] and labels[-1] in concatenation or processing
concatenated_labels = np.concatenate([np.concatenate(labels[:-1]), np.concatenate([labels[-1]])])
print("Concatenated Labels:", concatenated_labels)
```

In [None]:
# Confusion matrix of `lab` against `pred`: printed as an array, then drawn as an annotated heatmap.
cm = confusion_matrix(lab, pred)
print(cm)

plt.figure(figsize=(16, 16))
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Testing

In [None]:
# Hand-written utterances used to spot-check the model's predictions.
sample_utterances = [
    "Please how do i go about the account creation? ",
    "After setting up my account, i feel like i need to change it. How do i go about that?",
    "how do i know how much i need to pay?",
    "purchased a product, which i now want to change",
]

# Pad to a common length so the four utterances form one batch of TensorFlow tensors.
inputs = tokenizer(sample_utterances, padding=True, return_tensors="tf")

logits = model(**inputs).logits
# Highest-scoring class id for each utterance.
outputs = tf.argmax(logits, axis=-1).numpy()

In [None]:
print(outputs)

In [None]:
# Inverse lookup: integer id -> intent name, following the order of `intents`.
reverse_dict_intents = dict(enumerate(intents))
print(reverse_dict_intents)

In [None]:
# Translate each predicted class id back to its intent name.
for predicted_id in outputs:
  print(reverse_dict_intents[predicted_id])