In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration: names, checkpoint location and evaluation inputs.
project_name = "toxic-classifier"
model_name = "google/flan-t5-base"  # identifier the tokenizer is loaded from
saved_model_name = "./toxic-classifier/checkpoint-6164"  # checkpoint the classifier is loaded from
data_files = {'validation': '../data/dev.csv'}
batch_size = 32

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load tokenizer and model
# The tokenizer is resolved from `model_name`, while the classifier itself is
# restored from the checkpoint at `saved_model_name` and moved onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_name)
model = model.to(device)
# Compilation is intentionally left disabled; uncomment to experiment with it.
#model = torch.compile(model, backend="inductor")

In [4]:
# Load and preprocess the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    The tokenizer is asked to pad to ``max_length`` and to truncate, with
    ``max_length`` set to 512, and its result is returned unchanged.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# Read the 'validation' split from the CSV files and tokenize it in batches.
validation_dataset = load_dataset('csv', data_files=data_files)['validation'].map(
    tokenize_function,
    batched=True,
)
# Restrict the formatted output to the listed columns, returned as torch tensors.
validation_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# DataLoader
data_collator = default_data_collator
validation_dataloader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Map: 100%|██████████| 10980/10980 [00:01<00:00, 6443.31 examples/s]


In [5]:
# Manual F1 and Accuracy Calculation
def manual_metrics(predictions, references):
    """Compute binary F1 and accuracy from predicted and reference labels.

    Args:
        predictions: Tensor marking which samples were predicted positive.
            Boolean tensors are used as-is; other dtypes are converted with
            ``.bool()`` (non-zero means positive) so that ``~`` below is a
            logical NOT rather than a bitwise NOT on integers.
        references: Tensor of the same shape marking which samples are
            actually positive; converted the same way.

    Returns:
        dict with keys ``"f1"`` and ``"accuracy"``. Any ratio whose
        denominator is zero (including accuracy on empty input) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    predictions = predictions.bool()
    references = references.bool()

    # Confusion-matrix counts as plain Python ints.
    tp = (predictions & references).sum().item()
    tn = ((~predictions) & (~references)).sum().item()
    fp = (predictions & (~references)).sum().item()
    fn = ((~predictions) & references).sum().item()
    total = tp + tn + fp + fn

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    # Guard the empty case the same way the other ratios are guarded.
    accuracy = (tp + tn) / total if total > 0 else 0.0

    return {"f1": f1, "accuracy": accuracy}

# Evaluation
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and return binary F1 and accuracy.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    For every batch, the entries whose keys appear in
    ``tokenizer.model_input_names`` are moved to the notebook-level ``device``
    and passed to the model; the predicted class is the argmax of the logits.

    Args:
        model: Callable model whose output exposes ``.logits``.
        dataloader: Iterable of dict batches holding the model inputs plus a
            ``'labels'`` (or ``'label'``) tensor.

    Returns:
        The dict produced by ``manual_metrics`` (keys ``"f1"`` and
        ``"accuracy"``), treating class ``1`` as the positive class.
    """
    model.eval()
    # Loop-invariant lookup, hoisted out of the batch loop.
    input_names = tokenizer.model_input_names
    all_preds = []
    all_labels = []
    progress_bar = tqdm(dataloader, desc="Evaluating")
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k in input_names}
            # Accept either spelling of the label column.
            label_key = 'labels' if 'labels' in batch else 'label'
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            # The device-to-host copy in .cpu() already waits for the GPU work,
            # so no explicit torch.cuda.synchronize() is needed; that call also
            # raises on hosts without CUDA, which broke the CPU fallback.
            all_preds.append(preds.cpu())
            # Labels are only used on the host, so they are never sent to the device.
            all_labels.append(batch[label_key].cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    metrics = manual_metrics(all_preds == 1, all_labels == 1)  # Assuming binary classification
    return metrics

In [6]:
# Evaluate the model
# The forward passes run under inference mode; the metrics dict is printed afterwards.
with torch.inference_mode():
    results = evaluate_model(model, validation_dataloader)
print(results)

Evaluating: 100%|██████████| 344/344 [04:27<00:00,  1.29it/s]

{'f1': 0.9349981637899376, 'accuracy': 0.9516393442622951}



