# Data Loading

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('NLP_ass_train.tsv', sep='\t')
val = pd.read_csv('NLP_ass_valid.tsv', sep='\t')
test = pd.read_csv('NLP_ass_test.tsv', sep='\t')

train.columns = ['text', 'label']
val.columns = ['text', 'label']
test.columns = ['text', 'label']

In [3]:
train.head()

Unnamed: 0,text,label
0,nawt yall niggers ignoring me,normal
1,<user> i am bit confused coz chinese ppl can n...,hatespeech
2,and this is why i end up with nigger trainee d...,hatespeech
3,nogs jews and dykes how enriching,offensive
4,guilty until proven innocent unless you are a ...,hatespeech


In [4]:
test.head()

Unnamed: 0,text,label
0,laura loomer raped me while screaming at me in...,hatespeech
1,<user> what did the old lady do was she a nazi...,normal
2,as much as i appreciate a format of open discu...,normal
3,sex be so good a bitch be slow stroking and cr...,offensive
4,<user> <user> <user> with a room full of white...,normal


# Model Creation

In [5]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproje

# Model PEFT-QLoRa setup

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [7]:
model_size = 'base'

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [240]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"google/flan-t5-{model_size}", quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(f"google/flan-t5-{model_size}")
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# model.to(device)    # no need when device map used

In [241]:
model.device

device(type='cuda', index=0)

In [242]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [243]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [244]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear4bit(in_features=768, out_features=768, bias=False)
              (k): Linear4bit(in_features=768, out_features=768, bias=False)
              (v): Linear4bit(in_features=768, out_features=768, bias=False)
              (o): Linear4bit(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear4bit(in_features=768, out_features=2048, bias=Fa

In [245]:
import re

def find_attention_layers(model):
    pattern = re.compile(r'((self\.)?attention\.)|((self\.)?attn\.)', re.IGNORECASE)

    attention_layers = []
    for name, module in model.named_modules():
        if re.search(pattern, name):
            attention_layers.append(name)

    return attention_layers

attention_layers = find_attention_layers(model)
target_modules = attention_layers

In [246]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1770176 || all params: 169131968 || trainable%: 1.0466241367214506


In [247]:
model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear4bit(
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (base_layer): Linear4bit(in

# Dataset Preparation

In [224]:
label2id = {
    'normal': 0,
    'hatespeech': 1,
    'offensive': 2
}

In [225]:
import torch.nn.functional as F

def convert_one_hot(preds, num_classes):
  pred_ids = [label2id[label] for label in preds]
  one_hot_tensors = F.one_hot(torch.tensor(pred_ids), num_classes=num_classes)
  logit_tensors = torch.log10(one_hot_tensors.float()+1e-6)  # Adding a small value to avoid log(0)
  return logit_tensors

In [228]:
from torch.utils.data import Dataset, DataLoader

class HateSpeechDataset(Dataset):
  def __init__(self, data, tokenizer, num_classes, max_length, ans_length, preamble, options, template):
    self.text = list(data['text'])
    self.labels = list(data['label'])  # .apply(lambda x: label2id[x])
    self.tokenizer = tokenizer
    self.num_classes = num_classes
    self.max_length = max_length
    self.ans_length = ans_length
    self.preamble = preamble
    self.options = options
    self.template = template

  def __len__(self):
    return len(self.text)

  def __getitem__(self, idx):
    text = self.text[idx]
    label = self.labels[idx]

    input_text = self.template.substitute(
          preamble=self.preamble, prompt=text, options=self.options
      )

    encoding = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    label_encoding = self.tokenizer(
            label,
            add_special_tokens=True,
            max_length=self.ans_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    labels = torch.tensor([label2id[label]])

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label_ids': torch.tensor(label_encoding['input_ids'], dtype=torch.long),
        'labels': labels
    }

In [248]:
from string import Template

max_length = 512
ans_length = 2
preamble = 'Is the sentence normal, hatespeech or offensive? Classify the sentence into one of these types - normal, hatespeech, offensive.'
options = ''
template = Template('$preamble\nSentence: $prompt$options\nAnswer:')
batch_size = 32
num_classes = 3

train_dataset = HateSpeechDataset(train, tokenizer, num_classes, max_length, ans_length, preamble, options, template)
val_dataset = HateSpeechDataset(val, tokenizer, num_classes, max_length, ans_length, preamble, options, template)
test_dataset = HateSpeechDataset(test, tokenizer, num_classes, max_length, ans_length, preamble, options, template)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Model Preparation

In [249]:
import torch.nn as nn
import torch.nn.functional as F

class FlanT5Model(nn.Module):
  def __init__(self, model, num_classes):
    super(FlanT5Model, self).__init__()
    self.model = model
    self.num_classes = num_classes

  def forward(self, input_ids, attention_mask, labels):
    outputs = self.model(
                  input_ids=input_ids,
                  attention_mask=attention_mask,
                  labels=labels
                )

    logits = outputs['logits']
    preds__ = torch.argmax(logits, dim=2)
    bsize = preds__.shape[0]
    preds_ = [tokenizer.decode(preds__[i], skip_special_tokens=True) for i in range(bsize)]
    preds = postprocess(preds_)
    preds_logits = convert_one_hot(preds, num_classes)
    return torch.tensor(preds_logits, dtype=torch.float)

In [250]:
peft_model = FlanT5Model(model, num_classes)

In [251]:
lr = 2e-5
epochs = 10

In [252]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# scaler = torch.cuda.amp.GradScaler()
optimizer = optim.Adam(peft_model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

In [253]:
def postprocess(preds):
  filtered_preds = []
  for pred in preds:
    if 'hate' in pred:
      filtered_preds.append('hatespeech')
    elif 'norm' in pred:
      filtered_preds.append('normal')
    elif 'off' in pred:
      filtered_preds.append('offensive')
    else:  # for ambiguous category, set to normal
      filtered_preds.append('normal')

  assert len(filtered_preds) == len(preds)
  return filtered_preds

# Model Training

In [None]:
from tqdm import tqdm

prev_val_acc = -1
patience = 3
k = 0

for epoch in range(epochs):
    train_loss = 0.0
    print(f"Epoch {epoch+1}/{epochs}")
    peft_model.train()

    for batch in tqdm(train_dataloader, desc="Training batches"):
      input_ids = batch["input_ids"].squeeze(1).to(device)
      attention_mask = batch["attention_mask"].squeeze(1).to(device)
      label_ids = batch["label_ids"].squeeze(1).to(device)
      labels = batch["labels"].squeeze(1).to(device)

      optimizer.zero_grad()
      with torch.cuda.amp.autocast():
        output_logits = peft_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=label_ids
                  ).to(device)

        loss = loss_fn(output_logits, labels)

      # print(loss)
      loss.requires_grad = True
      loss.backward()
      optimizer.step()
      # scaler.update()

      train_loss += loss.item()

    train_loss /= len(train_dataloader)
    model.eval()

    val_loss = 0.0
    val_samples = 0
    val_correct = 0

    with torch.no_grad():
      for batch_idx, batch in tqdm(enumerate(val_dataloader)):
          input_ids = batch["input_ids"].squeeze(1).to(device)
          attention_mask = batch["attention_mask"].squeeze(1).to(device)
          label_ids = batch["label_ids"].squeeze(1).to(device)
          labels = batch["labels"].squeeze(1).to(device)

          with torch.cuda.amp.autocast():
            output_logits = peft_model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=label_ids
                      ).to(device)

            loss = loss_fn(output_logits, labels)

          # labels = labels.type_as(outputs)

          val_loss += loss.item()

          predictions = torch.argmax(output_logits, axis=1)

          num_correct = sum(1 for a, b in zip(predictions, labels) if a == b)
          num_samples = predictions.shape[0]

          val_samples += num_samples
          val_correct += num_correct

    val_acc = val_correct / val_samples
    val_loss /= len(val_dataloader)

    if val_acc < prev_val_acc:
      k += 1

    if k == patience:
      break

    prev_val_acc = val_acc

    print(f"Train loss : {train_loss}, Val loss : {val_loss}, Val acc : {val_acc}")

Epoch 1/10


  'label_ids': torch.tensor(label_encoding['input_ids'], dtype=torch.long),
  return torch.tensor(preds_logits, dtype=torch.float)
Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.5671963706581606, Val loss : 3.6043297384606032, Val acc : 0.40655908381051536
Epoch 2/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.5664876295474364, Val loss : 3.5090428126792683, Val acc : 0.40655908381051536
Epoch 3/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.567019183026034, Val loss : 3.6043297032840917, Val acc : 0.40655908381051536
Epoch 4/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.5675507583142316, Val loss : 3.6043297423691047, Val acc : 0.40655908381051536
Epoch 5/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.567019206322652, Val loss : 3.509042808770767, Val acc : 0.40655908381051536
Epoch 6/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.56701920235727, Val loss : 3.5090428322217746, Val acc : 0.40655908381051536
Epoch 7/10


Training batches: 100%|██████████| 481/481 [08:29<00:00,  1.06s/it]
61it [00:58,  1.04it/s]


Train loss : 3.56684200134198, Val loss : 3.509042824404772, Val acc : 0.40655908381051536
Epoch 8/10


Training batches:  44%|████▍     | 212/481 [03:44<04:43,  1.05s/it]

# Model Inference

In [None]:
from sklearn.metrics import f1_score

test_samples = 0
test_correct = 0

total_labels = []
total_predictions = []

with torch.no_grad():
  for batch_idx, batch in tqdm(enumerate(test_dataloader), desc='Performing inference'):
      input_ids = batch['input_ids'].squeeze(1).to(device)
      attention_mask = batch['attention_mask'].squeeze(1).to(device)
      label_ids = batch['label_ids'].squeeze(1).to(device)
      labels = batch['labels'].squeeze(1).to(device)

      output_logits = peft_model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=label_ids
      )

      predictions = torch.argmax(output_logits, axis=1)

      total_predictions.extend(predictions)
      total_labels.extend(labels.cpu())

      num_correct = sum(1 for a, b in zip(predictions, labels) if a == b)
      num_samples = predictions.shape[0]

      test_samples += num_samples
      test_correct += num_correct

macro_f1 = f1_score(total_labels, total_predictions, average='macro')
test_acc = test_correct/test_samples

print(f"Test acc : {test_acc:.4f}")
print(f"Test macro-F1 : {macro_f1:.4f}")

In [None]:
# use this to clear CUDA memory in case of OOM error
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
def find_common_strings(sentences1, sentences2):
  sent1 = set(sentences1)
  sent2 = set(sentences2)

  return len(sent1.intersection(sent2))

In [None]:
train_text = list(train['text'])
val_text = list(val['text'])
test_text = list(test['text'])

print("Number of common sentences (train, test): ", find_common_strings(train_text, test_text))
print("Number of common sentences (val, test): ", find_common_strings(val_text, test_text))