## NLP Assignment-3 Part-1
### Name : Ishaan Sinha
### Roll Number : 21CS30064

In [1]:
# Mount Google Drive inside the Colab VM; the TSV files read by the
# data-loading cell live under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Make the Drive root the working directory. The data-loading cell still
# uses absolute paths, so this only affects relative paths used elsewhere.
os.chdir('/content/drive/My Drive')

In [3]:
# Install necessary libraries
!pip install accelerate transformers pandas torch



In [4]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [5]:
# Read the three tab-separated splits from Drive (train, validation, test order).
train, val, test = [
    pd.read_csv(split_path, sep='\t')
    for split_path in (
        '/content/drive/My Drive/NLP_ass_train.tsv',
        '/content/drive/My Drive/NLP_ass_valid.tsv',
        '/content/drive/My Drive/NLP_ass_test.tsv',
    )
]

# Give every split the same two column names used by the rest of the notebook.
for split_frame in (train, val, test):
    split_frame.columns = ['text', 'label']

### Flan T5 Small Model

In [6]:
# Pick the first GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Load the Flan-T5 small tokenizer and seq2seq model, then move the model to the chosen device.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [7]:
# Class name -> integer id, used for gold labels and for parsed predictions.
label_mapping = {
    class_name: class_id
    for class_id, class_name in enumerate(('normal', 'hatespeech', 'offensive'))
}

# Integer id -> class name (inverse of label_mapping).
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

# Multiple-choice letter -> integer id; A/B/C carry the same ids 0/1/2 as the classes above.
option_mapping = {
    letter: position
    for position, letter in enumerate(('A', 'B', 'C'))
}

In [8]:
from torch.utils.data import Dataset, DataLoader

# Custom dataset class for hate speech detection
class HateSpeechClassificationDataset(Dataset):
    """Dataset that turns each sentence into a tokenized classification prompt.

    Indexing yields ``(encoding, label)``: the tokenizer output for the prompt
    rendered from ``template`` (already moved to ``device``) and the integer
    class id looked up in the notebook-level ``label_mapping``.

    Args:
        dataset: Object whose ``'text'`` and ``'label'`` entries are iterable
            (the notebook passes a DataFrame).
        tokenizer: Called as ``tokenizer(text, return_tensors='pt')``; the
            result must provide ``.to(device)``.
        prompt_format: Name of the prompting strategy. Stored for reference
            only; it does not drive any branching.
        fewshot_examples: Text substituted for ``$fewshot`` in the template,
            or ``None`` when no demonstrations are used.
        intro: Text substituted for ``$intro``.
        answer_type: Stored so that consumers can read ``data.answer_type``.
        choices: Text substituted for ``$choices``.
        template: Object with a ``substitute(**fields)`` method
            (the notebook passes a ``string.Template``).
        device: Device the tokenized prompt is moved to.
    """

    def __init__(self, dataset, tokenizer, prompt_format, fewshot_examples, intro, answer_type, choices, template, device):
        self.texts = list(dataset['text'])
        # Gold labels are stored as integer ids; an unknown label name raises KeyError here.
        self.labels = [label_mapping[label] for label in dataset['label']]
        self.prompt_format = prompt_format
        self.fewshot_examples = fewshot_examples
        self.answer_type = answer_type
        self.intro = intro
        self.choices = choices
        self.template = template
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Build, tokenize and return the prompt for ``index`` with its label id.

        Raises:
            KeyError: If the template references ``$fewshot`` but no few-shot
                examples were supplied.
        """
        sentence = self.texts[index]
        label = self.labels[index]

        fields = {'intro': self.intro, 'prompt': sentence, 'choices': self.choices}
        # Decide on the presence of examples, not on the truthiness of the
        # prompt_format name: both 'zero_shot' and 'few_shot' are truthy, which
        # previously let a missing example block be rendered as the text "None".
        if self.fewshot_examples is not None:
            fields['fewshot'] = self.fewshot_examples
        input_text = self.template.substitute(**fields)

        # Tokenize the input text
        encoding = self.tokenizer(input_text, return_tensors='pt').to(self.device)

        return encoding, label

In [9]:
def process_prediction(prediction, answer_type):
    """Convert the model's decoded text into an integer class id.

    Args:
        prediction: Text generated by the model for one example.
        answer_type: ``'label'`` when the model is expected to answer with a
            class name, ``'option'`` when it answers with a choice letter.

    Returns:
        int: The class id. Unrecognised text prints a warning and falls back
        to the 'normal' class (``'label'``) or option 'A' (``'option'``).

    Raises:
        ValueError: If ``answer_type`` is neither ``'label'`` nor ``'option'``.
    """
    if answer_type == 'label':
        prediction = prediction.lower()
        # Substring checks, tried in this order, so free-form answers such as
        # "hate speech" still resolve to a class.
        if 'normal' in prediction or 'acceptable' in prediction:
            return label_mapping['normal']
        elif 'hate' in prediction:
            return label_mapping['hatespeech']
        elif 'offensive' in prediction:
            return label_mapping['offensive']
        else:
            print(f"Warning: Unexpected prediction: {prediction}")
            return label_mapping['normal']  # Assign a default label if unexpected

    if answer_type == 'option':
        # Tolerate surrounding whitespace, wrapping punctuation and lower case
        # ("b", " B", "(B)", "B.") instead of sending them to the default option.
        option = prediction.strip(" \t\r\n().:").upper()
        if option in option_mapping:
            return option_mapping[option]
        print(f"Warning: Unexpected prediction: {prediction}")
        return option_mapping['A']  # Assign a default option if unexpected

    # Previously an unknown answer_type returned None implicitly.
    raise ValueError(f"Unsupported answer_type: {answer_type!r}")

#### Zero-Shot

In [10]:
from string import Template

# Zero-shot prompt settings: the model answers with a class name and sees no examples.
prompt_format = 'zero_shot'
answer_type = 'label'
intro = 'Classify the sentence as either normal, hatespeech, or offensive.'
choices = ''
template = '$intro\nSentence: $prompt\n$choices\nAnswer:'

# Wrap the test split so every item is rendered through the template above.
test_data = HateSpeechClassificationDataset(
    dataset=test,
    tokenizer=tokenizer,
    prompt_format=prompt_format,
    fewshot_examples=None,
    intro=intro,
    answer_type=answer_type,
    choices=choices,
    template=Template(template),
    device=device,
)

In [11]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tqdm import tqdm

# Inference function to evaluate the model on test data
def evaluate_model(data):
    """Generate one prediction per example in ``data`` and score them.

    Every ``data[i]`` must unpack into ``(inputs, true_label)``. The inputs are
    passed to the notebook-level ``model.generate``, the first decoded
    sequence is mapped to a class id by ``process_prediction`` using
    ``data.answer_type``, and the classification report and confusion matrix
    are printed.

    Returns:
        tuple: ``(accuracy, macro_f1)`` over all examples.
    """
    actual_labels = []
    predicted_labels = []

    for position in tqdm(range(len(data))):
        encoded, gold = data[position]
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Only the first decoded sequence is used; map it to a class id.
        guess = process_prediction(decoded[0], data.answer_type)
        actual_labels.append(gold)
        predicted_labels.append(guess)

    assert len(actual_labels) == len(predicted_labels)

    # Accuracy is the share of positions where gold and predicted ids agree.
    matches = sum(gold == guess for gold, guess in zip(actual_labels, predicted_labels))
    accuracy = matches / len(actual_labels)
    f1_macro = f1_score(actual_labels, predicted_labels, average='macro')

    print(classification_report(actual_labels, predicted_labels))
    print(confusion_matrix(actual_labels, predicted_labels))

    return accuracy, f1_macro

# Score the zero-shot prompts on the test split and print both summary metrics.
(test_accuracy, macro_f1_score) = evaluate_model(test_data)
print(
    f"\nTest Accuracy: {test_accuracy:.4f}",
    f"Macro F1 Score: {macro_f1_score:.4f}",
    sep="\n",
)

100%|██████████| 1923/1923 [05:10<00:00,  6.19it/s]


              precision    recall  f1-score   support

           0       0.49      0.32      0.39       781
           1       0.22      0.01      0.02       594
           2       0.29      0.74      0.42       548

    accuracy                           0.34      1923
   macro avg       0.33      0.35      0.27      1923
weighted avg       0.35      0.34      0.28      1923

[[249  13 519]
 [118   5 471]
 [139   5 404]]

Test Accuracy: 0.3422
Macro F1 Score: 0.2731


#### Few-Shot

In [12]:
# Few-shot demonstrations: two examples per class, listed in the order
# normal, hatespeech, offensive. Each entry uses the "Sentence: ...\nAnswer:<label>"
# layout, with no space between "Answer:" and the label.
fewshot_prompts = [
    'Sentence: Many refugees, including women and children, went missing in the Mediterranean, but Europe is mostly seeing Middle Eastern men arrive. No explanation.\nAnswer:normal',
    'Sentence: What advice would you give to a conservative white male with strong opinions who’s looking for a job?\nAnswer:normal',
    'Sentence: Regardless, those people will likely be eliminated, which is a positive outcome.\nAnswer:hatespeech',
    'Sentence: Hey, keep searching; your father’s not here. Try checking the house on the corner.\nAnswer:hatespeech',
    'Sentence: Seems like a typical senile response from a certain political party.\nAnswer:offensive',
    'Sentence: A typical outcome when followers are mostly people with mental challenges.\nAnswer:offensive',
]

# Separate the demonstrations with a blank line to form the single block
# that is substituted for $fewshot in the few-shot template.
fewshot = '\n\n'.join(fewshot_prompts)

In [15]:
# Few-shot prompt settings: the demonstrations are placed between the instruction and the query.
prompt_format = 'few_shot'
answer_type = 'label'
intro = 'Classify the sentence as either normal, hatespeech, or offensive.'
choices = ''
template = '$intro\n$fewshot\n\nSentence: $prompt$choices\nAnswer:'

# Wrap the test split; `fewshot` is the joined demonstration block built earlier.
test_dataset = HateSpeechClassificationDataset(
    dataset=test,
    tokenizer=tokenizer,
    prompt_format=prompt_format,
    fewshot_examples=fewshot,
    intro=intro,
    answer_type=answer_type,
    choices=choices,
    template=Template(template),
    device=device,
)

In [16]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tqdm import tqdm

# Inference function to evaluate model predictions
def evaluate_model(data):
    """Run the current ``model`` over ``data`` and report classification metrics.

    Every ``data[i]`` must unpack into ``(inputs, true_label)``. The inputs are
    passed to the notebook-level ``model.generate``, the first decoded
    sequence is mapped to a class id by ``process_prediction`` using
    ``data.answer_type``, and the classification report and confusion matrix
    are printed.

    Returns:
        tuple: ``(accuracy, macro_f1)`` over all examples.
    """
    actual_labels = []
    predicted_labels = []

    for position in tqdm(range(len(data))):
        encoded, gold = data[position]
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Only the first decoded sequence is used; map it to a class id.
        guess = process_prediction(decoded[0], data.answer_type)
        actual_labels.append(gold)
        predicted_labels.append(guess)

    assert len(actual_labels) == len(predicted_labels)

    # Accuracy is the share of positions where gold and predicted ids agree.
    matches = sum(gold == guess for gold, guess in zip(actual_labels, predicted_labels))
    accuracy = matches / len(actual_labels)
    f1_macro = f1_score(actual_labels, predicted_labels, average='macro')

    print(classification_report(actual_labels, predicted_labels))
    print(confusion_matrix(actual_labels, predicted_labels))

    return accuracy, f1_macro

# Score the few-shot prompts on the test split and print both summary metrics.
(test_accuracy, macro_f1_score) = evaluate_model(test_dataset)
print(
    f"\nTest Accuracy: {test_accuracy:.4f}",
    f"Macro F1 Score: {macro_f1_score:.4f}",
    sep="\n",
)

  2%|▏         | 34/1923 [00:14<14:57,  2.10it/s]



 31%|███       | 590/1923 [03:37<06:49,  3.26it/s]



100%|██████████| 1923/1923 [12:34<00:00,  2.55it/s]

              precision    recall  f1-score   support

           0       0.57      0.04      0.07       781
           1       0.27      0.06      0.10       594
           2       0.29      0.91      0.44       548

    accuracy                           0.29      1923
   macro avg       0.37      0.34      0.20      1923
weighted avg       0.40      0.29      0.18      1923

[[ 29  68 684]
 [  6  36 552]
 [ 16  31 501]]

Test Accuracy: 0.2943
Macro F1 Score: 0.2023





### Flan T5 Base Model

In [17]:
# Pick the first GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Load the Flan-T5 base tokenizer and model; this rebinds the `tokenizer` and
# `model` names, so the evaluation cells below run against the base checkpoint.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

#### Zero-Shot

In [18]:
# Zero-shot prompt settings: the model answers with a class name and sees no examples.
prompt_format = 'zero_shot'
answer_type = 'label'
intro = 'Classify the sentence as either normal, hatespeech, or offensive.'
choices = ''
template = '$intro\nSentence: $prompt\n$choices\nAnswer:'

# Rebuild the dataset so it uses the tokenizer currently bound to `tokenizer`.
test_data = HateSpeechClassificationDataset(
    dataset=test,
    tokenizer=tokenizer,
    prompt_format=prompt_format,
    fewshot_examples=None,
    intro=intro,
    answer_type=answer_type,
    choices=choices,
    template=Template(template),
    device=device,
)

In [19]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tqdm import tqdm

# Inference function to evaluate the model on test data
def evaluate_model(data):
    """Predict a class for each example in ``data`` and compute summary metrics.

    Every ``data[i]`` must unpack into ``(inputs, true_label)``. The inputs are
    passed to the notebook-level ``model.generate``, the first decoded
    sequence is mapped to a class id by ``process_prediction`` using
    ``data.answer_type``, and the classification report and confusion matrix
    are printed.

    Returns:
        tuple: ``(accuracy, macro_f1)`` over all examples.
    """
    actual_labels = []
    predicted_labels = []

    for position in tqdm(range(len(data))):
        encoded, gold = data[position]
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Only the first decoded sequence is used; map it to a class id.
        guess = process_prediction(decoded[0], data.answer_type)
        actual_labels.append(gold)
        predicted_labels.append(guess)

    assert len(actual_labels) == len(predicted_labels)

    # Accuracy is the share of positions where gold and predicted ids agree.
    matches = sum(gold == guess for gold, guess in zip(actual_labels, predicted_labels))
    accuracy = matches / len(actual_labels)
    f1_macro = f1_score(actual_labels, predicted_labels, average='macro')

    print(classification_report(actual_labels, predicted_labels))
    print(confusion_matrix(actual_labels, predicted_labels))

    return accuracy, f1_macro

# Score the zero-shot prompts on the test split and print both summary metrics.
(test_accuracy, macro_f1_score) = evaluate_model(test_data)
print(
    f"\nTest Accuracy: {test_accuracy:.4f}",
    f"Macro F1 Score: {macro_f1_score:.4f}",
    sep="\n",
)

100%|██████████| 1923/1923 [21:38<00:00,  1.48it/s]

              precision    recall  f1-score   support

           0       0.67      0.39      0.49       781
           1       0.38      0.91      0.54       594
           2       0.48      0.05      0.09       548

    accuracy                           0.45      1923
   macro avg       0.51      0.45      0.37      1923
weighted avg       0.53      0.45      0.39      1923

[[303 455  23]
 [ 46 543   5]
 [104 418  26]]

Test Accuracy: 0.4535
Macro F1 Score: 0.3726





#### Few-Shot

In [20]:
# Few-shot prompt settings: the demonstrations are placed between the instruction and the query.
prompt_format = 'few_shot'
answer_type = 'label'
intro = 'Classify the sentence as either normal, hatespeech, or offensive.'
choices = ''
template = '$intro\n$fewshot\n\nSentence: $prompt$choices\nAnswer:'

# Rebuild the dataset so it uses the tokenizer currently bound to `tokenizer`.
test_dataset = HateSpeechClassificationDataset(
    dataset=test,
    tokenizer=tokenizer,
    prompt_format=prompt_format,
    fewshot_examples=fewshot,
    intro=intro,
    answer_type=answer_type,
    choices=choices,
    template=Template(template),
    device=device,
)

In [21]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tqdm import tqdm

# Inference function to evaluate model predictions
def evaluate_model(data):
    """Evaluate the current ``model`` on ``data`` and return accuracy and macro F1.

    Every ``data[i]`` must unpack into ``(inputs, true_label)``. The inputs are
    passed to the notebook-level ``model.generate``, the first decoded
    sequence is mapped to a class id by ``process_prediction`` using
    ``data.answer_type``, and the classification report and confusion matrix
    are printed.

    Returns:
        tuple: ``(accuracy, macro_f1)`` over all examples.
    """
    actual_labels = []
    predicted_labels = []

    for position in tqdm(range(len(data))):
        encoded, gold = data[position]
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Only the first decoded sequence is used; map it to a class id.
        guess = process_prediction(decoded[0], data.answer_type)
        actual_labels.append(gold)
        predicted_labels.append(guess)

    assert len(actual_labels) == len(predicted_labels)

    # Accuracy is the share of positions where gold and predicted ids agree.
    matches = sum(gold == guess for gold, guess in zip(actual_labels, predicted_labels))
    accuracy = matches / len(actual_labels)
    f1_macro = f1_score(actual_labels, predicted_labels, average='macro')

    print(classification_report(actual_labels, predicted_labels))
    print(confusion_matrix(actual_labels, predicted_labels))

    return accuracy, f1_macro

# Score the few-shot prompts on the test split and print both summary metrics.
(test_accuracy, macro_f1_score) = evaluate_model(test_dataset)
print(
    f"\nTest Accuracy: {test_accuracy:.4f}",
    f"Macro F1 Score: {macro_f1_score:.4f}",
    sep="\n",
)

100%|██████████| 1923/1923 [47:55<00:00,  1.50s/it]

              precision    recall  f1-score   support

           0       0.61      0.50      0.55       781
           1       0.43      0.85      0.57       594
           2       0.44      0.08      0.14       548

    accuracy                           0.49      1923
   macro avg       0.49      0.48      0.42      1923
weighted avg       0.51      0.49      0.44      1923

[[392 342  47]
 [ 82 504   8]
 [165 339  44]]

Test Accuracy: 0.4888
Macro F1 Score: 0.4182



