In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from data.dataset import HateSpeechDataset
from data.toxigen import ToxiGenDataModule, custom_label_strategy
from models.model import HateSpeechClassifier
from models.classification_heads import SimpleLinearHead, MLPHead
from training.trainer import Trainer
from utils.checkpoints import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# TODO: Use Config file for this step

config = {
        "run_name": "mlp_head",
        "model_name": "microsoft/deberta-v3-base",
        "num_classes": 2,
        "batch_size": 32,
        "learning_rate": 2e-5,
        "num_epochs": 3,
        "max_length": 128,
        "num_workers": 4,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
    }

In [3]:
data_module = ToxiGenDataModule(
        tokenizer_name=config["model_name"],
        batch_size=config["batch_size"],
        max_length=config["max_length"],
        label_strategy=custom_label_strategy,
        num_workers=config["num_workers"],
    )

    # Setup datasets and get dataloaders
data_module.setup()
dataloaders = data_module.get_dataloaders()
train_loader = dataloaders["train"]
val_loader = dataloaders["test"]



In [4]:
classification_head = MLPHead(
        768, 1536, 384, config["num_classes"]
    )  # 768 is BERT's hidden size
model = HateSpeechClassifier(
        config["model_name"], classification_head, freeze_bert=True
    ).to(config["device"])

optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
criterion = torch.nn.CrossEntropyLoss()

In [5]:
# Initialize trainer
trainer = Trainer(
        model=model, optimizer=optimizer, criterion=criterion, device=config["device"]
    )

In [6]:
for epoch in range(config["num_epochs"]):
        train_loss = trainer.train_epoch(train_loader)
        val_metrics = trainer.evaluate(val_loader)

        print(f"Epoch {epoch+1}/{config['num_epochs']}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_metrics['loss']:.4f}")

Training:   0%|          | 0/280 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch 1/3
Train Loss: 0.6555
Val Loss: 0.6517


Training:   0%|          | 0/280 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch 2/3
Train Loss: 0.6515
Val Loss: 0.5984


Training:   0%|          | 0/280 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Epoch 3/3
Train Loss: 0.6483
Val Loss: 0.5843





In [7]:
ModelCheckpoint.save_checkpoint(
        model, optimizer, epoch, val_metrics["loss"],{}, "checkpoints",config, config["run_name"]
    )


'checkpoints/mlp_head.pt'

In [8]:
checkpoint = ModelCheckpoint.load_checkpoint("checkpoints/checkpoint.pt", model, optimizer)

RuntimeError: Error(s) in loading state_dict for HateSpeechClassifier:
	Missing key(s) in state_dict: "classification_head.classifier.0.weight", "classification_head.classifier.0.bias", "classification_head.classifier.2.weight", "classification_head.classifier.2.bias", "classification_head.classifier.4.weight", "classification_head.classifier.4.bias". 
	Unexpected key(s) in state_dict: "classification_head.classifier.weight", "classification_head.classifier.bias". 

In [9]:
from utils.predictor import HateSpeechPredictor

predictor = HateSpeechPredictor(model, config["model_name"])



In [11]:
texts = ["Where should i begin? I am so happy right now", "I love you", "I hate gays", "I hate you", "Black people are the worst"] 
prediction = predictor.predict(texts)
print(prediction)

[{'text': 'Where should i begin? I am so happy right now', 'prediction': 1, 'toxic_probability': 0.6291897296905518, 'non_toxic_probability': 0.37081027030944824}, {'text': 'I love you', 'prediction': 1, 'toxic_probability': 0.5588224530220032, 'non_toxic_probability': 0.4411775469779968}, {'text': 'I hate gays', 'prediction': 1, 'toxic_probability': 0.5964447855949402, 'non_toxic_probability': 0.4035552144050598}, {'text': 'I hate you', 'prediction': 1, 'toxic_probability': 0.6044718623161316, 'non_toxic_probability': 0.39552807807922363}, {'text': 'Black people are the worst', 'prediction': 1, 'toxic_probability': 0.6375484466552734, 'non_toxic_probability': 0.36245155334472656}]


In [14]:
from data.toxigen import ToxiGenDataset


ds=ToxiGenDataset("train",config["model_name"])
ds.dataset



Dataset({
    features: ['text', 'target_group', 'factual?', 'ingroup_effect', 'lewd', 'framing', 'predicted_group', 'stereotyping', 'intent', 'toxicity_ai', 'toxicity_human', 'predicted_author', 'actual_method'],
    num_rows: 8960
})