In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.cuda.amp as amp
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Ensure Transformers uses PyTorch instead of TensorFlow
# NOTE(review): `transformers` is already imported in the imports cell above, so this
# assignment probably comes too late to affect that import — consider moving it above it.
os.environ["TRANSFORMERS_NO_TF"] = "1"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch so DataLoader shuffling and the newly initialised classifier head are
# more repeatable across Restart & Run All (GPU kernels may still be non-deterministic).
torch.manual_seed(42)

# Load dataset.
# Labels are coerced to numbers; rows whose label cannot be parsed, or whose text is
# missing, are dropped so the tokenizer never receives a NaN float.
df = (
    pd.read_csv("/content/USElectionHateSpeech - Aug_chunk1.csv")
    .assign(Hate=lambda frame: pd.to_numeric(frame["Hate"], errors="coerce"))
    .dropna(subset=["text", "Hate"])
    .astype({"Hate": int, "text": str})
    .reset_index(drop=True)
)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

# Custom Dataset Class
class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: indexable sequence of strings.
        labels: indexable sequence of integer class ids, aligned with ``texts``.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
    length 128, produced by the module-level ``tokenizer``) and ``labels`` (a scalar
    ``torch.long`` tensor).
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop exactly that axis.
        return {"input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long)}

# Split dataset BEFORE balancing.
# Oversampling first would copy minority rows into both the training and the validation
# split, so validation would be scored on texts the model has already seen.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["Hate"]
)

# Balance only the training split; validation keeps the original class distribution.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["text"]], train_df["Hate"])

train_texts, train_labels = X_resampled["text"].tolist(), y_resampled.tolist()
val_texts, val_labels = val_df["text"].tolist(), val_df["Hate"].tolist()

train_dataset = HateSpeechDataset(train_texts, train_labels)
val_dataset = HateSpeechDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Load DistilBERT model and REMOVE one transformer layer
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Remove one transformer layer
model.distilbert.transformer.layer = model.distilbert.transformer.layer[:-1]  # Removing last layer
# Keep the config in sync with the truncated stack. Otherwise save_pretrained() writes the
# original layer count and from_pretrained() would rebuild an extra, randomly initialised layer.
model.config.n_layers = len(model.distilbert.transformer.layer)

model.to(device)

# Optimizer & Scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# The factor 6 must match `epochs` in the training loop below.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 6)

# Gradient scaling is only needed for CUDA mixed precision; on CPU the scaler is a no-op.
# torch.amp.GradScaler is the non-deprecated spelling of torch.cuda.amp.GradScaler.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

# Evaluation Function
def evaluate(model, dataloader):
    """Score ``model`` on every batch yielded by ``dataloader``.

    Puts the model in eval mode, runs it without gradient tracking, and takes the
    arg-max over the logits as the predicted class for each example.

    Returns:
        A tuple ``(accuracy, report)``: the value of ``accuracy_score`` and the text
        produced by ``classification_report`` for the collected labels/predictions.
    """
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            all_preds.extend(batch_logits.argmax(dim=1).cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_preds)
    report = classification_report(all_targets, all_preds)
    return accuracy, report

# Training Loop
epochs = 6
early_stopping_patience = 2
best_accuracy = 0
patience_counter = 0

# Autocast is only enabled on CUDA; on CPU the forward pass runs in full precision.
use_amp = device.type == "cuda"

for epoch in range(epochs):
    model.train()
    total_loss = 0
    # The description is constant within an epoch, so set it once instead of per batch.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")

    for batches_seen, batch in enumerate(loop, start=1):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)

        # torch.amp.autocast is the non-deprecated form of torch.cuda.amp.autocast.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler step the optimizer and
        # update its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        total_loss += loss.item()
        # Running mean over the batches processed so far. Dividing by len(train_loader)
        # would under-report the loss until the final batch of the epoch.
        loop.set_postfix(loss=total_loss / batches_seen)

    # Validation
    accuracy, report = evaluate(model, val_loader)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)

    # Early Stopping: checkpoint on improvement, stop after
    # `early_stopping_patience` consecutive epochs without one.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
        model.save_pretrained("best_hate_speech_model")
        tokenizer.save_pretrained("best_hate_speech_model")
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

print("Training complete!")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = amp.GradScaler()
  with amp.autocast():
Epoch 1: 100%|██████████| 34/34 [00:04<00:00,  7.20it/s, loss=0.654]


Validation Accuracy: 0.7175
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.61      0.67       125
           1       0.70      0.81      0.75       144

    accuracy                           0.72       269
   macro avg       0.72      0.71      0.71       269
weighted avg       0.72      0.72      0.71       269



  with amp.autocast():
Epoch 2: 100%|██████████| 34/34 [00:03<00:00,  9.80it/s, loss=0.481]


Validation Accuracy: 0.7472
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.80      0.75       125
           1       0.80      0.70      0.75       144

    accuracy                           0.75       269
   macro avg       0.75      0.75      0.75       269
weighted avg       0.75      0.75      0.75       269



  with amp.autocast():
Epoch 3: 100%|██████████| 34/34 [00:03<00:00,  9.74it/s, loss=0.321]


Validation Accuracy: 0.7881
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77       125
           1       0.80      0.81      0.80       144

    accuracy                           0.79       269
   macro avg       0.79      0.79      0.79       269
weighted avg       0.79      0.79      0.79       269



  with amp.autocast():
Epoch 4: 100%|██████████| 34/34 [00:03<00:00,  9.14it/s, loss=0.206]


Validation Accuracy: 0.7844
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76       125
           1       0.79      0.81      0.80       144

    accuracy                           0.78       269
   macro avg       0.78      0.78      0.78       269
weighted avg       0.78      0.78      0.78       269



  with amp.autocast():
Epoch 5: 100%|██████████| 34/34 [00:03<00:00,  9.65it/s, loss=0.152]


Validation Accuracy: 0.8141
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.80       125
           1       0.84      0.81      0.82       144

    accuracy                           0.81       269
   macro avg       0.81      0.81      0.81       269
weighted avg       0.82      0.81      0.81       269



  with amp.autocast():
Epoch 6: 100%|██████████| 34/34 [00:03<00:00,  8.99it/s, loss=0.121]


Validation Accuracy: 0.8253
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.79      0.81       125
           1       0.83      0.85      0.84       144

    accuracy                           0.83       269
   macro avg       0.83      0.82      0.82       269
weighted avg       0.83      0.83      0.82       269

Training complete!
