In [1]:
import os

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    get_linear_schedule_with_warmup,
)

In [2]:
# Step 1: Prepare Dataset


class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row at a time for classification.

    Reads the ``"text"`` and ``"gold_label"`` columns of ``df``. Each item is
    encoded with ``tokenizer.encode_plus``, padded and truncated to ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Keep plain arrays so items can be looked up by position.
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.targets = df["gold_label"].values
        self.texts = df["text"].values

    def __len__(self):
        """Number of rows available."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the raw text, its encoded tensors and the label for ``index``."""
        raw_text = str(self.texts[index])
        label = self.targets[index]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        item = {"text": raw_text}
        # The tokenizer returns a leading batch dimension; flatten it away.
        for field in ("input_ids", "attention_mask"):
            item[field] = encoded[field].flatten()
        item["targets"] = torch.tensor(label, dtype=torch.long)  # long dtype for the loss
        return item

In [3]:
# Step 2: Convert String Labels to Integers
# Each label's integer id is its position in the list below.
label_mapping = {
    label_name: class_id
    for class_id, label_name in enumerate(
        [
            "No Hate/Toxic",
            "Toxic 1",
            "Toxic 2",
            "Toxic 3",
            "Hate 1",
            "Hate 2",
            "Hate 3",
        ]
    )
}

In [4]:
# Read the labelled corpus; the relative path is resolved against the working directory
df = pd.read_csv(filepath_or_buffer="final_data.csv")

In [5]:
# Map string labels to integers.
# Guarded so the cell is safe to re-run: once the column already holds the
# integer ids, mapping it again would turn every value into NaN.
if not df["gold_label"].isin(list(label_mapping.values())).all():
    _encoded_labels = df["gold_label"].map(label_mapping)
    # Series.map yields NaN for any value missing from the dict; fail loudly
    # instead of letting NaN targets reach the split and the loss.
    _unknown_labels = df.loc[_encoded_labels.isna(), "gold_label"].unique()
    if len(_unknown_labels) > 0:
        raise ValueError(
            "gold_label contains values missing from label_mapping: "
            f"{list(_unknown_labels)}"
        )
    df["gold_label"] = _encoded_labels.astype("int64")

In [6]:
# Stratified split: hold out 30% of the rows first, then halve that hold-out
# into validation and test, stratifying on the label both times.
train_df, temp_df = train_test_split(
    df,
    stratify=df["gold_label"],
    test_size=0.3,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["gold_label"],
    test_size=0.5,
    random_state=42,
)

In [7]:
# Tokenizer matching the multilingual cased DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased"
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# Training hyperparameters
NUM_CLASSES = 7  # Number of target classes (multiclass classification)
LEARNING_RATE = 1e-5
EPOCHS = 3
BATCH_SIZE = 128
MAX_LEN = 128  # Handed to the tokenizer as max_length

In [9]:
# Wrap each split in a dataset that tokenizes on access
train_dataset = CustomDataset(df=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = CustomDataset(df=val_df, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = CustomDataset(df=test_df, tokenizer=tokenizer, max_len=MAX_LEN)

In [10]:
# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [11]:
# Load configuration
config = DistilBertConfig.from_pretrained("distilbert-base-multilingual-cased")

# Set dropout rates and number of labels.
# DistilBertConfig names these knobs `attention_dropout` and `dropout`; the
# BERT-style `attention_probs_dropout_prob` / `hidden_dropout_prob` attributes
# are not read by the DistilBERT layers, so assigning them leaves the dropout
# at its default.
config.attention_dropout = 0.2  # Increase attention dropout rate
config.dropout = 0.2  # Increase hidden/embedding dropout rate
config.num_labels = NUM_CLASSES  # Number of output classes

# Initialize the model with the modified configuration
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased", config=config
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Directory to save the model.
# Configurable through the DISTILBERT_SAVE_DIR environment variable so the
# notebook is not tied to one machine's absolute path; the default is a
# folder relative to the working directory.
save_path = os.environ.get(
    "DISTILBERT_SAVE_DIR", os.path.join("model-1", "distilbert")
)
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(save_path, exist_ok=True)

In [13]:
# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) x (epochs).
total_steps = EPOCHS * len(train_loader)

# Warm up over the first 10% of those steps
num_warmup_steps = int(0.1 * total_steps)

# Linear warmup followed by linear decay
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=num_warmup_steps,
)

In [None]:
# (Intentionally left without code.) This cell was a verbatim copy of the
# preceding scheduler-setup cell, which already defines `total_steps`,
# `num_warmup_steps` and `scheduler`; the duplicate was removed so the
# schedule is defined in exactly one place.

In [None]:
# Step 6: Training and Evaluation Functions with tqdm


def train_epoch(model, data_loader, optimizer, criterion, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch must be a mapping holding ``"input_ids"``, ``"attention_mask"``
    and ``"targets"`` tensors. For every batch the loss is back-propagated and
    both ``optimizer`` and ``scheduler`` are stepped once.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped after each optimizer step.

    Returns:
        tuple[float, float]: Mean per-batch loss and accuracy over all samples
        seen; ``(0.0, 0.0)`` when ``data_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0  # kept as a Python int so metrics stay plain floats
    total_samples = 0
    batches_seen = 0

    # Wrap data_loader with tqdm to show progress
    progress_bar = tqdm(data_loader, desc="Training", unit="batch")

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # one scheduler step per optimizer step

        total_loss += loss.item()
        batches_seen += 1

        _, preds = torch.max(logits.detach(), dim=1)
        correct_predictions += (preds == targets).sum().item()
        total_samples += targets.size(0)

        # Running metrics: average over batches actually processed (not an
        # estimate from a global batch size) and plain floats so the bar does
        # not print tensor reprs.
        progress_bar.set_postfix(
            {
                "loss": total_loss / batches_seen,
                "accuracy": correct_predictions / max(total_samples, 1),
            }
        )

    if batches_seen == 0:
        return 0.0, 0.0
    return total_loss / batches_seen, correct_predictions / max(total_samples, 1)

In [14]:
# Label strings ordered by their integer id, for use as report row names
class_names = sorted(label_mapping, key=label_mapping.get)


def eval_model(model, data_loader, criterion, device, desc="Validating"):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Each batch must be a mapping holding ``"input_ids"``, ``"attention_mask"``
    and ``"targets"`` tensors. A per-class classification report is printed
    as a side effect.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batch tensors are moved to.
        desc: Progress-bar label; defaults to ``"Validating"``.

    Returns:
        tuple[float, float, float]: Mean per-batch loss, accuracy over all
        samples, and weighted F1 score.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0  # kept as a Python int so metrics stay plain floats
    total_samples = 0
    batches_seen = 0
    all_preds = []
    all_labels = []

    # Wrap data_loader with tqdm to show progress during evaluation
    progress_bar = tqdm(data_loader, desc=desc, unit="batch")

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, targets)
            total_loss += loss.item()
            batches_seen += 1

            _, preds = torch.max(logits, dim=1)
            correct_predictions += (preds == targets).sum().item()
            total_samples += targets.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

            # Running metrics: average over batches actually processed and
            # plain floats so the bar does not print tensor reprs.
            progress_bar.set_postfix(
                {
                    "loss": total_loss / batches_seen,
                    "accuracy": correct_predictions / max(total_samples, 1),
                }
            )

    accuracy = correct_predictions / max(total_samples, 1)
    f1 = f1_score(all_labels, all_preds, average="weighted")

    # Print classification report to get precision, recall, and F1 per class.
    # `labels` is passed explicitly (class_names is ordered by integer id, so
    # ids are 0..len-1) because otherwise the report raises when a class is
    # absent from both the targets and the predictions of this split.
    class_report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    )
    print("Classification Report:\n", class_report)

    return total_loss / max(batches_seen, 1), accuracy, f1

In [15]:
# Track the best validation loss and a snapshot of the weights that achieved it
best_val_loss = float("inf")  # Start with a very high value
best_model = None  # Stays None if no epoch improves on the initial value
model_save_path = os.path.join(save_path, "best_distilbert_model.pth")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    # Pass the scheduler to train_epoch
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, criterion, device, scheduler
    )
    print(f"Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}")

    val_loss, val_acc, val_f1 = eval_model(model, val_loader, criterion, device)
    print(
        f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}, Validation F1: {val_f1:.4f}"
    )

    # Check if the current validation loss is better than the best we've seen
    if val_loss < best_val_loss:
        best_val_loss = val_loss  # Update the best validation loss
        # state_dict() hands back references to the live parameter tensors, so
        # later epochs would overwrite them in place; clone to take a real
        # snapshot (moved to CPU so it does not hold extra device memory).
        best_model = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        # Write the checkpoint right away so an interrupted run keeps it.
        torch.save(best_model, model_save_path)

if best_model is None:
    print("No best model saved: validation loss never improved on its initial value.")
else:
    print(
        f"Best model saved at {model_save_path} with validation loss: {best_val_loss:.4f}"
    )


Epoch 1/3


Training:   8%|▊         | 29/383 [06:19<1:17:14, 13.09s/batch, loss=1.78, accuracy=tensor(0.4321, dtype=torch.float64)]


KeyboardInterrupt: 

In [None]:
# Persist the weights the model holds at this point (end of training)
model_save_path = os.path.join(save_path, "distilbert_model.pth")
torch.save(obj=model.state_dict(), f=model_save_path)
print(f"Final model saved at {model_save_path}")

In [None]:
# Step 8: Evaluate the Model on Test Set
# Report how many held-out examples are about to be scored
test_data_size = len(test_loader.dataset)
print(f"Exact length of test data: {test_data_size}\n")

# Score the test split with the weights currently held by `model`
test_loss, test_acc, test_f1 = eval_model(
    model=model,
    data_loader=test_loader,
    criterion=criterion,
    device=device,
)
print(
    f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}"
)