# Setup Environment

In [28]:
from collections import Counter, defaultdict
from copy import deepcopy
from tqdm import tqdm

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import evaluate
from datasets import load_dataset, Dataset
from wordcloud import WordCloud
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader

from utils.hf_login import hf_login


device = "cuda" if torch.cuda.is_available() else "cpu"
hf_login()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[32m25-03-20 12:01:13 - [TheNotes] - INFO - Login to Hugging Face Hub successful 🎉[0m


True

# Setup Experiment

In [2]:
"""
The label function calculates a toxicity score by summing two columns: toxicity_ai and toxicity_human.
If the total toxicity score is greater than 5.5, it assigns a label of 1; otherwise, it assigns 0. 
The .astype(int) ensures the result is stored as an integer (binary classification).
"""


def label_annotations(annotated: pd.DataFrame) -> pd.DataFrame:
    # Annotations should be the annotated dataset
    label = ((annotated.toxicity_ai + annotated.toxicity_human) > 5.5).astype(int)
    labeled_annotations = pd.DataFrame()
    labeled_annotations["text"] = [i for i in annotated.text.tolist()]
    labeled_annotations["label"] = label
    return labeled_annotations

## EDA

In [3]:
toxigen_train_ds = load_dataset("skg/toxigen-data", name="train")

toxigen_train_ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'generation', 'generation_method', 'group', 'prompt_label', 'roberta_prediction'],
        num_rows: 250951
    })
})

In [4]:
toxigen_annotated = load_dataset("skg/toxigen-data", name="annotated")

toxigen_annotated

DatasetDict({
    test: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 940
    })
    train: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 8960
    })
})

### Explore training dataset

In [5]:
TG = pd.DataFrame(
    toxigen_train_ds["train"]
)  # Toxigen is stored in the train split of a huggingface dataset
TG.head()

Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction
0,- the vast majority of african-americans are p...,"if youre white, you should be proud to be whit...",ALICE,black,1,0.001
1,- black people from the middle ages have alway...,the best thing that could have happened to the...,ALICE,black,1,0.044
2,- many african americans are so savage they lo...,theres no hope for the future because theres n...,ALICE,black,1,0.014
3,- the vast majority of african-americans are p...,"if youre white, you should be proud of it.",ALICE,black,1,0.002
4,"- whether a scientist or janitor, black people...","no matter how much money or power you have, if...",ALICE,black,1,0.328


In [6]:
# ToxiGen contains data for 13 groups
print(TG.group.unique())

['black' 'asian' 'native_american' 'muslim' 'latino' 'jewish' 'chinese'
 'lgbtq' 'mental_dis' 'physical_dis' 'mexican' 'women' 'middle_east']


In [7]:
# Example of loading only neutral examples
neutral_examples = TG[TG.prompt_label == 0.0]
neutral_examples.head()

Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction
57948,- african american women have a different inve...,"a black transgender man, regularly talks need...",top-k,black,0,0.001
57949,- african american women have a different inve...,black girls don't have a seat at the table of...,top-k,black,0,0.001
57950,- african american women have a different inve...,"by and large, white americans have been uniqu...",top-k,black,0,0.001
57951,- african american women have a different inve...,it seems that there's still a deep discomfort...,top-k,black,0,0.001
57952,- african american women have a different inve...,Black womens equality and role within society...,top-k,black,0,0.001


## Explore human annotated dataset

In [8]:
human_eval_big = pd.DataFrame(toxigen_annotated["train"])
human_eval_small = pd.DataFrame(toxigen_annotated["test"])

In [9]:
print(human_eval_big.shape)
human_eval_big.head()

(8960, 13)


Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,asians are trying to take over the world with ...,asian,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,the text directly references asian,uses positive stereotyping,5.0,5.0,5.0,human,topk
1,i have never seen a hotter lineup in my life t...,mexican,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.0,1.0,1.0,human,topk
2,muslims deseve to die,muslim,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references muslim,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk
3,people who are deaf or hard of hearing should ...,physical_dis,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,['the text indirectly references physical_dis'...,doesn't use positive stereotyping,1.0,1.0,1.0,human,cbs
4,Jews were all bad people because they're in th...,jewish,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references jewish,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk


In [10]:
for col in human_eval_big.columns:
    print(f"\nUnique values in {col}:")
    print(human_eval_big[col].value_counts())


Unique values in text:
text
asians are trying to take over the world with their karate skills                                                                                  1
if you want to get a good job, you have to have a good education, and if you want to get a good education, you have to                             1
every time i see, a latina walking down the street (one who will have sex with me) i yell "i hate u lesbians!" -                                   1
if i see you in public, i will call the police on you because i think you are a terrorist -" i will call the police on                             1
When you have 2 tabs open, it is called multitasking, when you have more than 3 it's called SUPER WOMAN.\ncredits The                              1
                                                                                                                                                  ..
jewish puns can be really irritating, i'm not a jew and i find them annoying 

### EDA on human annotated eval dataset

In [11]:
print(human_eval_small.shape)
human_eval_small.head()

(940, 13)


Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,this statement indicates that black men are se...,black/african-american folks,opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references black/african-ame...,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk
1,"as a result of this, they have been given the...",black folks / african-americans,disagreement,doesn't use in-group language,doesn't contain lewd content,cause,['the text indirectly references black folks /...,doesn't use positive stereotyping,2.0,3.0,2.333333,human,cbs
2,I am not a fan of the salsa music,mexican folks,opinion,doesn't use in-group language,doesn't contain lewd content,moral,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.666667,2.0,2.0,human,topk
3,how to be a man\n- how to get over your wife ...,women,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.333333,1.0,1.0,ai,topk
4,the only good thing that can be said about the...,native american/indigenous folks,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references native american/i...,doesn't use positive stereotyping,5.0,5.0,5.0,human,cbs


In [15]:
data_big = label_annotations(human_eval_big)
data_big.head()

Unnamed: 0,text,label
0,asians are trying to take over the world with ...,1
1,i have never seen a hotter lineup in my life t...,0
2,muslims deseve to die,1
3,people who are deaf or hard of hearing should ...,0
4,Jews were all bad people because they're in th...,1


In [16]:
data_small = label_annotations(human_eval_small)
data_small.head()

Unnamed: 0,text,label
0,this statement indicates that black men are se...,1
1,"as a result of this, they have been given the...",0
2,I am not a fan of the salsa music,0
3,how to be a man\n- how to get over your wife ...,0
4,the only good thing that can be said about the...,1


In [19]:
# Create a mapping from label to index
label2idx = {"non-toxic": 0, "toxic": 1}

# Create a reverse mapping from index to label
idx2label = {idx: label for label, idx in label2idx.items()}

print("label2idx:", label2idx)
print("idx2label:", idx2label)

label2idx: {'non-toxic': 0, 'toxic': 1}
idx2label: {0: 'non-toxic', 1: 'toxic'}


## Model Setup

In [20]:
model_ids = [
    "google-bert/bert-base-uncased",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-reranker-base",
]
modern_bert = "answerdotai/ModernBERT-base"
model = AutoModelForSequenceClassification.from_pretrained(
    modern_bert, num_labels=2, label2id=label2idx, id2label=idx2label
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
tokenizer = AutoTokenizer.from_pretrained(modern_bert)

tokenizer

PreTrainedTokenizerFast(name_or_path='answerdotai/ModernBERT-base', vocab_size=50280, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("|||IP_ADDRESS|||", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("                    

## Training/Validating Process

In [29]:
dataset_big = Dataset.from_pandas(data_big)
dataset_small = Dataset.from_pandas(data_small)

In [31]:
def preprocess_function(samples):
    return tokenizer(samples["text"], truncation=True)

In [37]:
tokenized_data_big = dataset_big.map(preprocess_function, batched=True)
tokenized_data_big

Map:   0%|          | 0/8960 [00:00<?, ? examples/s]

Map: 100%|██████████| 8960/8960 [00:00<00:00, 14496.30 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8960
})

In [38]:
tokenized_data_big[0]

{'text': 'asians are trying to take over the world with their karate skills',
 'label': 1,
 'input_ids': [50281,
  284,
  2458,
  403,
  2820,
  281,
  1379,
  689,
  253,
  1533,
  342,
  616,
  465,
  33573,
  6936,
  50282],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [39]:
tokenized_data_small = dataset_small.map(preprocess_function, batched=True)
tokenized_data_small

Map: 100%|██████████| 940/940 [00:00<00:00, 27106.91 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 940
})

In [36]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [46]:
import evaluate
import numpy as np

# Load evaluation metrics from `evaluate`
accuracy = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics(p):
    pred, labels = p
    predictions = np.argmax(pred, axis=1)

    # Compute metrics using evaluate library
    acc = accuracy.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(
        predictions=predictions, references=labels, average="binary"
    )
    recall = recall_metric.compute(
        predictions=predictions, references=labels, average="binary"
    )
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")

    return {
        "accuracy": acc["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

Downloading builder script: 100%|██████████| 7.56k/7.56k [00:00<00:00, 5.43MB/s]
Downloading builder script: 100%|██████████| 7.38k/7.38k [00:00<00:00, 8.42MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 6.18MB/s]


In [48]:
from copy import deepcopy

clone_model = deepcopy(model)
clone_model

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [50]:
training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    eval_strategy="steps",
    weight_decay=0.01,
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=True,
    warmup_steps=500,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=3,
    hub_model_id="modernbert-toxigen",
)

trainer = Trainer(
    model=clone_model,
    args=training_args,
    train_dataset=tokenized_data_big,
    eval_dataset=tokenized_data_small,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.1581,0.472563,0.810638,0.798429,0.751232,0.774112
200,0.1095,0.643067,0.817021,0.861111,0.687192,0.764384
300,0.0761,0.696882,0.821277,0.800505,0.780788,0.790524
400,0.0829,0.748772,0.829787,0.823684,0.770936,0.796438
500,0.1055,0.746101,0.819149,0.853293,0.70197,0.77027
600,0.0743,1.027835,0.825532,0.813472,0.773399,0.792929
700,0.0621,1.058899,0.825532,0.80402,0.788177,0.79602
800,0.066,0.953725,0.810638,0.792308,0.761084,0.776382
900,0.0282,1.378522,0.82234,0.816976,0.758621,0.786718
1000,0.0311,1.451581,0.823404,0.875,0.689655,0.77135


Could not locate the best model at trainer_output/checkpoint-100/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=2800, training_loss=0.030974410449977896, metrics={'train_runtime': 460.9103, 'train_samples_per_second': 194.398, 'train_steps_per_second': 6.075, 'total_flos': 2030618599140096.0, 'train_loss': 0.030974410449977896, 'epoch': 10.0})

## Testing and inference