# Baseline ModernBERT

In [1]:
import json
from datetime import datetime

import numpy as np
from datasets import load_dataset
from evaluate import load
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
# All three splits come from the same Hugging Face Hub dataset, so the
# identifier is defined once and reused.
DATASET_NAME = "SetFit/amazon_massive_intent_sv-SE"
train_dataset = load_dataset(DATASET_NAME, split="train")
val_dataset = load_dataset(DATASET_NAME, split="validation")
test_dataset = load_dataset(DATASET_NAME, split="test")

# Derive the label-name <-> label-id mappings from the training split.
unique_pairs = set(zip(train_dataset["label_text"], train_dataset["label"]))
# Iterating a set of strings has no stable order between kernel runs; sorting
# by (id, name) keeps the dict insertion order reproducible.
sorted_pairs = sorted(unique_pairs, key=lambda pair: (pair[1], pair[0]))
label2id = {text: label_id for text, label_id in sorted_pairs}
id2label = {label_id: text for text, label_id in sorted_pairs}

# Sanity check: every label name must map to exactly one id and vice versa,
# otherwise one of the dicts above silently dropped an entry.
assert len(label2id) == len(id2label) == len(unique_pairs), (
    "label_text <-> label mapping in the training split is not one-to-one"
)

# print() renders the multi-line Dataset summary as-is; display() on a str
# would show its repr, i.e. surrounding quotes and literal "\n" escapes.
print(f"Train: {train_dataset}")
print(f"Val:   {val_dataset}")
print(f"Test:  {test_dataset}")

"Train: Dataset({\n    features: ['id', 'label', 'text', 'label_text'],\n    num_rows: 11514\n})"

"Val:   Dataset({\n    features: ['id', 'label', 'text', 'label_text'],\n    num_rows: 2033\n})"

"Test:  Dataset({\n    features: ['id', 'label', 'text', 'label_text'],\n    num_rows: 2974\n})"

In [3]:
# Tokenizer that belongs to the ModernBERT-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: A batch from the dataset; only its ``"text"`` entry is read.

    Returns:
        Whatever the tokenizer returns for those texts with truncation
        enabled. No padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Run the tokenizer over every split in batched mode (train, val, test order)
tokenized_datasets_train, tokenized_datasets_val, tokenized_datasets_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


# Metric objects from the `evaluate` library, loaded once and reused below
accuracy = load("accuracy")
f1 = load("f1")


def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted F1.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    acc_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(
        predictions=predicted_ids,
        references=labels,
        average="weighted",
    )

    return dict(accuracy=acc_result["accuracy"], f1=f1_result["f1"])

In [None]:
# Load the pretrained ModernBERT checkpoint for sequence classification.
# The label mappings are passed so the model config carries the intent names;
# the classifier head is newly initialised and must be trained.
# No dropout override is passed here, so the checkpoint's defaults apply.
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    label2id=label2id,
    id2label=id2label,
    attn_implementation="sdpa",
)

# Hyperparameters that are also encoded in the run name
learning_rate = 3e-5
batch_size = 32
epochs = 1
model_name = "ModernBERT"

# A timestamped run name keeps each run's checkpoints and logs separate.
timestamp = datetime.now().strftime("%y%m%d_%H%M")
run_name = f"{timestamp}_{model_name}_ep{epochs}_lr{learning_rate}_bs{batch_size}"

# NOTE(review): the recorded run lasted 360 steps with eval_steps=200, so only
# one evaluation (step 200) took place. With load_best_model_at_end=True that
# is the checkpoint restored after training, and an early-stopping patience of
# 5 evaluations cannot trigger. Revisit eval/save steps or epochs - confirm.
training_args = TrainingArguments(
    # Output and logging locations
    output_dir=f"./results/{run_name}",
    logging_dir=f"./results/{run_name}/logs",
    report_to=["tensorboard"],
    run_name=run_name,

    # Optimisation budget and precision
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=64,
    num_train_epochs=epochs,
    bf16=True,

    # Log, evaluate and checkpoint on a step schedule
    logging_strategy="steps",
    eval_strategy="steps",
    save_strategy="steps",

    # save_steps matches eval_steps so every evaluated step has a checkpoint
    logging_steps=100,
    eval_steps=200,
    save_steps=200,

    seed=42,
    torch_compile=True,
    # Keep the checkpoint with the best validation F1
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    # Schedule and regularisation
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    label_smoothing_factor=0.1,
)

# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    ],
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [5]:
# Fine-tune the model with the Trainer configured above. The call is the last
# expression in the cell, so its return value is shown as the cell output.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Step,Training Loss,Validation Loss,Accuracy,F1
200,2.8013,2.485767,0.470733,0.461498


W0906 10:36:09.446000 38455 torch/fx/experimental/symbolic_shapes.py:6823] [0/1] _maybe_guard_rel() was called on non-relation expression Eq(s16, 1) | Eq(s27, s16)
W0906 10:36:34.188000 38455 torch/fx/experimental/symbolic_shapes.py:6823] [0/2] _maybe_guard_rel() was called on non-relation expression Eq(s52, s92) | Eq(s92, 1)
W0906 10:36:34.190000 38455 torch/fx/experimental/symbolic_shapes.py:6823] [0/2] _maybe_guard_rel() was called on non-relation expression Eq(s16, 1) | Eq(s27, s16)


TrainOutput(global_step=360, training_loss=2.832405302259657, metrics={'train_runtime': 32.0419, 'train_samples_per_second': 359.342, 'train_steps_per_second': 11.235, 'total_flos': 232101463608000.0, 'train_loss': 2.832405302259657, 'epoch': 1.0})

In [6]:
# Evaluate the model on the held-out test split. `json` is imported in the
# notebook's top import cell rather than here, so all dependencies are
# declared in one place.
eval_results = trainer.evaluate(eval_dataset=tokenized_datasets_test)
# Pretty-print the metrics dict as indented JSON.
print(json.dumps(eval_results, indent=4))

{
    "eval_loss": 2.459702730178833,
    "eval_accuracy": 0.4761264290517821,
    "eval_f1": 0.4625420107402667,
    "eval_runtime": 1.1251,
    "eval_samples_per_second": 2643.277,
    "eval_steps_per_second": 41.773,
    "epoch": 1.0
}
