### Setup

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments,
    Trainer,
    default_data_collator
)
from datasets import Dataset
import torch
import gc
import random
import numpy as np
from accelerate.utils import release_memory
import schedulefree
import pandas as pd
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Seed every random number generator used in this notebook (Python's `random`,
# NumPy and PyTorch) with the same value so repeated runs start from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

def flush():
    """Free memory that is no longer referenced.

    Always runs the Python garbage collector. When a CUDA device is
    available, additionally empties the CUDA caching allocator and resets
    the peak-memory statistics; on a CPU-only machine those CUDA calls are
    skipped so the function is safe to call unconditionally.
    """
    gc.collect()
    # The CUDA helpers are only meaningful (and only safe) with a GPU present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

### Set parameters

In [None]:
# ---- Run configuration -------------------------------------------------
project_name = "toxic-classifier"

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Pretrained checkpoints that the training loop fine-tunes one after another.
model_names = [
    "bert-base-uncased",
    "bert-large-uncased",
    "microsoft/deberta-v3-base",
    "microsoft/deberta-v3-large",
    "FacebookAI/roberta-base",
    "FacebookAI/roberta-large",
    "google/flan-t5-base",
]

# ---- Tokenization / optimisation hyperparameters -----------------------
max_length = 256
batch_size = 16
gradient_accumulation = 4
learning_rate = 4e-5
num_train_epochs = 3
num_warmup_steps = 100

# ---- Data locations and label vocabulary -------------------------------
data_files = {
    'train': '../data/train_2024.csv',
    'validation': '../data/dev_2024.csv',
}
id2label = {0: "normal", 1: "toxic"}
# Inverse mapping, derived from `id2label` so the two can never disagree.
label2id = {label: index for index, label in id2label.items()}

# ---- Training data -----------------------------------------------------
# quoting=3 is csv.QUOTE_NONE: quote characters in the file are kept as plain text.
df = pd.read_csv(data_files['train'], quoting=3)
# The CSV's 'target' column is exposed under the name 'label'.
df = df.rename(columns={'target': 'label'})
train_dataset = Dataset.from_pandas(df).shuffle(seed=SEED)

### Train All Models

In [None]:
# Fine-tune every checkpoint listed in `model_names`, one Trainer run per model.
for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, model_max_length=max_length)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2, id2label=id2label, label2id=label2id
    ).to(device)

    def tokenize_function(examples):
        """Tokenize a batch of rows, truncating each text to `max_length` tokens."""
        return tokenizer(examples["text"], truncation=True, max_length=max_length)

    # Tokenize into a per-model dataset and leave `train_dataset` untouched.
    # Re-assigning `train_dataset` here would feed the already-tokenized data
    # into the next model's `map`, so any column written by an earlier
    # tokenizer but not by the current one would linger in the dataset.
    tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
    optimizer = schedulefree.AdamWScheduleFree(model.parameters(), lr=learning_rate, warmup_steps=num_warmup_steps)

    training_args = TrainingArguments(
        output_dir=f"{project_name}/{model_name}",
        group_by_length=False,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation,
        num_train_epochs=num_train_epochs,
        logging_dir=f"{project_name}/{model_name}/logs",
        logging_strategy="steps",
        logging_steps=200,
        # One checkpoint directory (`checkpoint-<step>`) is written per epoch.
        save_strategy="epoch",
        bf16=True,
        tf32=True,
    )

    # NOTE(review): only the optimizer is supplied, so the Trainer builds its
    # default LR scheduler on top of the schedule-free optimizer — confirm
    # that this is intended.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        optimizers=(optimizer, None),
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    if device == "cuda":
        # Best-effort cache cleanup before the run starts.
        flush()
        release_memory(model)

    # Schedule-free optimizers must be switched to train mode before stepping;
    # the call is a no-op if the optimizer is already in train mode.
    optimizer.train()
    trainer.train()

    if device == "cuda":
        # Drop every reference to this model's objects before loading the next one.
        del model
        del tokenizer
        del trainer
        del optimizer
        del tokenized_dataset
        flush()

    print(f"Finished training {model_name}")

### Evaluate models

In [None]:
# Evaluation settings. Note that `batch_size` and `device` are rebound here,
# replacing the values set in the training-parameter cell.
batch_size = 32
# Step numbers of the saved checkpoints to evaluate; they are used to build
# the `checkpoint-<step>` directory names.
checkpoint_iterations = [1547, 3094, 4641]
max_input_length = 512
# Evaluate on the held-out dev split. Previously this pointed at the training
# CSV, so the reported "validation" scores were measured on training data.
# Assumes the dev CSV has the same 'text'/'target' columns as the training CSV
# — TODO confirm.
file_path = data_files['validation']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# quoting=3 is csv.QUOTE_NONE; the 'target' column is exposed as 'label'.
df = pd.read_csv(file_path, quoting=3).rename(columns={'target': 'label'})
validation_dataset = Dataset.from_pandas(df)

def evaluate(model, tokenizer, data_loader, device):
    """Run `model` over `data_loader` and collect gold and predicted class ids.

    Args:
        model: Callable model whose output exposes `.logits`; it is switched
            to eval mode before inference.
        tokenizer: Only its `model_input_names` attribute is read, to decide
            which batch entries are forwarded to the model.
        data_loader: Iterable of dict batches mapping names to tensors. Every
            batch must contain a 'labels' entry.
        device: Device the model inputs and labels are moved to.

    Returns:
        A tuple `(actual_labels, predictions)` of two Python lists, in the
        order the batches were seen.
    """
    model.eval()
    gold, predicted = [], []
    accepted_names = tokenizer.model_input_names

    # Inference only: gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Forward only the entries the tokenizer declares as model inputs.
            model_inputs = {}
            for name, tensor in batch.items():
                if name in accepted_names:
                    model_inputs[name] = tensor.to(device)
            gold_batch = batch['labels'].to(device)

            logits = model(**model_inputs).logits
            # The predicted class is the index of the largest logit.
            predicted_batch = logits.argmax(dim=-1)

            predicted.extend(predicted_batch.cpu().tolist())
            gold.extend(gold_batch.cpu().tolist())

    return gold, predicted

# Evaluate every model at every saved checkpoint and store the predictions as CSV.
for checkpoint_iteration in checkpoint_iterations:
    print(f"####### checkpoint_iteration: {checkpoint_iteration}   #####")
    for model_name in model_names:
        saved_model_name = f"./{project_name}/{model_name}/checkpoint-{checkpoint_iteration}"
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, model_max_length=max_input_length)
        model = AutoModelForSequenceClassification.from_pretrained(saved_model_name).to(device)

        def tokenize_function(examples):
            """Tokenize a batch of rows, padded/truncated to `max_input_length` tokens."""
            return tokenizer(examples["text"], return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_length)

        # Tokenize into a per-model dataset; `validation_dataset` itself stays
        # raw so each model starts from the same untokenized rows.
        encoded_dataset = validation_dataset.map(tokenize_function, batched=True)
        data_collator = default_data_collator
        validation_dataloader = DataLoader(encoded_dataset, batch_size=batch_size, collate_fn=data_collator)

        # Do evaluation
        actual_labels, predictions = evaluate(model, tokenizer, validation_dataloader, device)

        accuracy = accuracy_score(actual_labels, predictions)
        report = classification_report(actual_labels, predictions)

        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

        results_df = pd.DataFrame({
            'Actual_Label': actual_labels,
            'Predicted_Label': predictions
        })

        results_df.to_csv(f"./{project_name}/{model_name}_{checkpoint_iteration}_predictions.csv", index=False)

        # `device` may be a plain string or a `torch.device`; comparing a
        # `torch.device` with the string "cuda" is not a reliable test, so
        # normalise it and check the device type instead.
        if torch.device(device).type == "cuda":
            del model
            del tokenizer
            del validation_dataloader
            del encoded_dataset
            flush()

        print(f"Results for {model_name} saved.")

### Plot Confusion Matrices

In [None]:
# Draw one confusion-matrix heatmap per model on a two-column grid.
ncols = 2
# Ceiling division: enough rows for every model (4 rows for the 7 models above).
nrows = max(1, -(-len(model_names) // ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 4 * nrows), squeeze=False)
axes = axes.flatten()

# Step number of the last checkpoint, used in the prediction file names.
final_checkpoint = 4641

for ax, model_name in zip(axes, model_names):
    checkpoint_file = f"./{project_name}/{model_name}_{final_checkpoint}_predictions.csv"
    if model_name in ["FacebookAI/roberta-base","google/flan-t5-base"]:
        file_name = checkpoint_file
    else:
        file_name = f"./{project_name}/{model_name}_predictions.csv"

    try:
        results_df = pd.read_csv(file_name)
    except FileNotFoundError:
        # The evaluation loop only writes checkpoint-suffixed files, so fall
        # back to the final checkpoint when the un-suffixed file is absent.
        results_df = pd.read_csv(checkpoint_file)

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # tick labels, even if one class never occurs in the file.
    cm = confusion_matrix(results_df['Actual_Label'], results_df['Predicted_Label'], labels=[0, 1])
    accuracy = accuracy_score(results_df['Actual_Label'], results_df['Predicted_Label'])

    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', xticklabels=[0,1], yticklabels=[0,1], ax=ax)
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title(f'{model_name} | Accuracy: {accuracy:.3f}')

# Hide grid cells that have no model to show.
for ax in axes[len(model_names):]:
    ax.set_visible(False)

# `tight_layout` computes the subplot spacing itself, so no manual
# `subplots_adjust` call is needed before it.
plt.tight_layout()
plt.show()