# Setup Environment

In [2]:
from collections import Counter, defaultdict
from copy import deepcopy
from tqdm import tqdm

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import evaluate
from datasets import load_dataset
from wordcloud import WordCloud
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader

from utils.hf_login import hf_login


# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Log in to the Hugging Face Hub through the project helper. Kept as the last
# expression so the cell still displays the helper's return value.
hf_login()

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[32m25-03-09 12:14:13 - [TheNotes] - INFO - Login to Hugging Face Hub successful 🎉[0m


True

# Setup Experiment

In [None]:
"""
The label function calculates a toxicity score by summing two columns: toxicity_ai and toxicity_human.
If the total toxicity score is greater than 5.5, it assigns a label of 1; otherwise, it assigns 0. 
The .astype(int) ensures the result is stored as an integer (binary classification).
"""


def label_annotations(
    annotated: pd.DataFrame, threshold: float = 5.5
) -> pd.DataFrame:
    """Turn annotated toxicity scores into binary classification labels.

    The per-row score is ``toxicity_ai + toxicity_human``. A row is labelled
    ``1`` when that sum is strictly greater than ``threshold`` and ``0``
    otherwise (a NaN sum compares as False and therefore yields ``0``).

    Args:
        annotated: Annotated dataset with ``text``, ``toxicity_ai`` and
            ``toxicity_human`` columns. Its index may be arbitrary (e.g. the
            result of filtering or shuffling another frame).
        threshold: Exclusive lower bound on the summed score for a row to be
            labelled ``1``. Defaults to 5.5.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index and two columns:
        ``text`` (copied from ``annotated``) and ``label`` (integer 0/1).
        Row order matches ``annotated``.
    """
    combined_score = annotated["toxicity_ai"] + annotated["toxicity_human"]
    label = (combined_score > threshold).astype(int)

    # Both columns are passed as index-free sequences so they are paired by
    # position. Assigning the Series itself would align on `annotated`'s index
    # and produce NaN / mismatched labels whenever that index is not 0..n-1.
    return pd.DataFrame(
        {
            "text": annotated["text"].tolist(),
            "label": label.to_numpy(),
        }
    )

## EDA

In [18]:
# Fetch the "train" configuration of the ToxiGen dataset.
toxigen_train_ds = load_dataset(path="skg/toxigen-data", name="train")

# Bare last expression so the notebook renders the DatasetDict summary.
toxigen_train_ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'generation', 'generation_method', 'group', 'prompt_label', 'roberta_prediction'],
        num_rows: 250951
    })
})

In [19]:
# Fetch the "annotated" configuration of the ToxiGen dataset.
toxigen_annotated = load_dataset(path="skg/toxigen-data", name="annotated")

# Bare last expression so the notebook renders the DatasetDict summary.
toxigen_annotated

Generating test split: 100%|██████████| 940/940 [00:00<00:00, 272206.97 examples/s]
Generating train split: 100%|██████████| 8960/8960 [00:00<00:00, 742604.06 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 940
    })
    train: Dataset({
        features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
        num_rows: 8960
    })
})

### Explore training dataset

In [20]:
# ToxiGen is stored in the "train" split of the Hugging Face dataset; copy that
# split into a pandas DataFrame for exploration.
TG = pd.DataFrame(data=toxigen_train_ds["train"])

# Preview the first five rows.
TG.head(n=5)

Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction
0,- the vast majority of african-americans are p...,"if youre white, you should be proud to be whit...",ALICE,black,1,0.001
1,- black people from the middle ages have alway...,the best thing that could have happened to the...,ALICE,black,1,0.044
2,- many african americans are so savage they lo...,theres no hope for the future because theres n...,ALICE,black,1,0.014
3,- the vast majority of african-americans are p...,"if youre white, you should be proud of it.",ALICE,black,1,0.002
4,"- whether a scientist or janitor, black people...","no matter how much money or power you have, if...",ALICE,black,1,0.328


In [21]:
# ToxiGen covers 13 groups; list the distinct values of the `group` column.
print(TG["group"].unique())

['black' 'asian' 'native_american' 'muslim' 'latino' 'jewish' 'chinese'
 'lgbtq' 'mental_dis' 'physical_dis' 'mexican' 'women' 'middle_east']


In [22]:
# Example: keep only the neutral examples, i.e. rows whose prompt_label is 0.
neutral_examples = TG.loc[TG["prompt_label"] == 0.0]

# Preview the first five neutral rows.
neutral_examples.head(n=5)

Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction
57948,- african american women have a different inve...,"a black transgender man, regularly talks need...",top-k,black,0,0.001
57949,- african american women have a different inve...,black girls don't have a seat at the table of...,top-k,black,0,0.001
57950,- african american women have a different inve...,"by and large, white americans have been uniqu...",top-k,black,0,0.001
57951,- african american women have a different inve...,it seems that there's still a deep discomfort...,top-k,black,0,0.001
57952,- african american women have a different inve...,Black womens equality and role within society...,top-k,black,0,0.001


### Explore human-annotated dataset

In [23]:
# Convert both annotated splits to pandas: "test" is the smaller split
# (940 rows) and "train" the larger one (8960 rows).
human_eval_small = pd.DataFrame(data=toxigen_annotated["test"])
human_eval_big = pd.DataFrame(data=toxigen_annotated["train"])

In [26]:
# Report (rows, columns) of the larger annotated split, then preview it.
print(human_eval_big.shape)
human_eval_big.head(n=5)

(8960, 13)


Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,asians are trying to take over the world with ...,asian,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,the text directly references asian,uses positive stereotyping,5.0,5.0,5.0,human,topk
1,i have never seen a hotter lineup in my life t...,mexican,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.0,1.0,1.0,human,topk
2,muslims deseve to die,muslim,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references muslim,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk
3,people who are deaf or hard of hearing should ...,physical_dis,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,['the text indirectly references physical_dis'...,doesn't use positive stereotyping,1.0,1.0,1.0,human,cbs
4,Jews were all bad people because they're in th...,jewish,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references jewish,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk


In [31]:
# Print the frequency table of every column in the larger annotated split,
# one column at a time.
for column_name, column_values in human_eval_big.items():
    print(f"\nUnique values in {column_name}:")
    print(column_values.value_counts())


Unique values in text:
text
asians are trying to take over the world with their karate skills                                                                                  1
if you want to get a good job, you have to have a good education, and if you want to get a good education, you have to                             1
every time i see, a latina walking down the street (one who will have sex with me) i yell "i hate u lesbians!" -                                   1
if i see you in public, i will call the police on you because i think you are a terrorist -" i will call the police on                             1
When you have 2 tabs open, it is called multitasking, when you have more than 3 it's called SUPER WOMAN.\ncredits The                              1
                                                                                                                                                  ..
jewish puns can be really irritating, i'm not a jew and i find them annoying 

### EDA on human annotated eval dataset

In [None]:
# Report (rows, columns) of the smaller annotated split, then preview it.
print(human_eval_small.shape)
human_eval_small.head(n=5)

(8960, 13)


Unnamed: 0,text,target_group,factual?,ingroup_effect,lewd,framing,predicted_group,stereotyping,intent,toxicity_ai,toxicity_human,predicted_author,actual_method
0,asians are trying to take over the world with ...,asian,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,the text directly references asian,uses positive stereotyping,5.0,5.0,5.0,human,topk
1,i have never seen a hotter lineup in my life t...,mexican,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,,"['it is unclear who is referenced', 'it is unc...",doesn't use positive stereotyping,1.0,1.0,1.0,human,topk
2,muslims deseve to die,muslim,neither factual nor opinion,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references muslim,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk
3,people who are deaf or hard of hearing should ...,physical_dis,opinion,doesn't use in-group language,doesn't contain lewd content,disagreement,['the text indirectly references physical_dis'...,doesn't use positive stereotyping,1.0,1.0,1.0,human,cbs
4,Jews were all bad people because they're in th...,jewish,disagreement,doesn't use in-group language,doesn't contain lewd content,moral,the text directly references jewish,doesn't use positive stereotyping,5.0,5.0,5.0,human,topk


## Model Setup

In [None]:
# Placeholder list; it starts empty and nothing in this cell populates it.
model_ids = list()

## Training/Validating Process

In [None]:
# NOTE(review): `dataset` is not defined in the visible part of this notebook;
# confirm it is created before this cell runs on a fresh kernel.
labels = dataset["train"].features["label"].names
print("labels:", labels)

# Forward lookup: label name -> position in `labels`.
label2idx = dict(zip(labels, range(len(labels))))

# Reverse lookup: position -> label name, built by inverting `label2idx`.
idx2label = {}
for name, position in label2idx.items():
    idx2label[position] = name

print("label2idx:", label2idx)
print("idx2label:", idx2label)

## Testing and inference