In [None]:
# Upgrade PyTorch and the Hugging Face libraries used in this notebook.
# NOTE(review): no versions are pinned, so a re-run may install different
# releases than the ones that produced the recorded outputs.
%pip install --upgrade --quiet \
    torch torchvision torchaudio \
    transformers accelerate datasets

In [1]:
import os
import warnings
from pathlib import Path

import datasets
import numpy as np
import torch
from sklearn import metrics
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
from transformers import pipeline

# Explicitly opt in to parallel tokenization in the `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Select the "high" float32 matmul precision mode in PyTorch.
torch.set_float32_matmul_precision("high")

In [2]:
# --- Reproducibility ---
seed = 42

# --- Optimisation hyper-parameters ---
learning_rate = 3e-5
num_epochs = 8
batch_size = 4  # used for both the train and eval per-device batch size

# --- Checkpoint and data locations (relative to the notebook directory) ---
model_name_or_path = "answerdotai/ModernBERT-base"
dataset_path = "../bin/multirc_dataset.hf"
output_dir = "../bin/modernbert-multirc"

# Tokenizer matching the base checkpoint; shared by preprocessing and collation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [3]:
def model_init():
    """Build a fresh two-label sequence classifier from ``model_name_or_path``.

    The label vocabulary is ``0 -> "incorrect"`` and ``1 -> "correct"``; the
    reverse mapping and the label count are derived from it.

    Returns:
    --------
    The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    id2label = {0: "incorrect", 1: "correct"}
    label2id = {name: idx for idx, name in id2label.items()}
    return AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        num_labels=len(id2label),
        label2id=label2id,
        id2label=id2label,
    )

In [4]:
def preprocess_function(example):
    """Tokenize the ``text`` field of ``example`` with truncation enabled.

    Parameters:
    -----------
    example : mapping
        A dataset example (or batch of examples) exposing a ``"text"`` entry.

    Returns:
    --------
    The tokenizer output for ``example["text"]``.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Load the saved DatasetDict and tokenize every split in batches; the raw
# "text" column is dropped once it has been tokenized.
ds = (
    datasets.DatasetDict.load_from_disk(dataset_path)
    .map(preprocess_function, batched=True, remove_columns=["text"])
)

Map:   0%|          | 0/4080 [00:00<?, ? examples/s]

In [5]:
# Peek at the first ten labels of the test split.
ds["test"]["labels"][:10]

[1, 0, 0, 0, 1, 0, 1, 0, 0, 1]

In [6]:
# Display the splits, their columns and row counts after tokenization.
ds

DatasetDict({
    train: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 19170
    })
    valid: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4080
    })
    test: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3962
    })
})

In [7]:
def compute_metrics(eval_pred):
    """
    Turn Trainer predictions into classification metrics.

    Parameters:
    -----------
    eval_pred : tuple
        A (logits, labels) pair as provided by the Hugging Face Trainer.
        - logits: array of per-class scores, one row per sample
        - labels: array of gold class ids, one per sample

    Returns:
    --------
    dict
        - accuracy: fraction of samples whose arg-max class equals the label
        - precision: macro-averaged precision
        - recall: macro-averaged recall
        - f1: macro-averaged F1
        Undefined precision/recall/F1 values are reported as 0
        (``zero_division=0``).
    """
    logits, labels = eval_pred

    # Predicted class = index of the largest logit in each row.
    preds = np.argmax(logits, axis=1)

    # The fourth element returned by sklearn (support) is not reported.
    prf = metrics.precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    macro_scores = dict(zip(('precision', 'recall', 'f1'), prf[:3]))

    return {'accuracy': metrics.accuracy_score(labels, preds), **macro_scores}

In [8]:
# Collator that pads using the shared tokenizer with the "longest" strategy.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
)

In [9]:
def test_modernbert(ds, device="cuda"):
    """
    Make sure ModernBert model is not returning NaNs.

    Runs the first training example through a freshly initialised model
    (see ``model_init``) without tracking gradients. If the logits tensor has
    NaN values, there is a dependency issue; in that case a ``RuntimeWarning``
    is emitted so the problem is flagged even if the printed logits are not
    inspected.

    Parameters:
    -----------
    ds : mapping of splits
        Tokenized dataset; ``ds["train"][0]`` must provide ``input_ids`` and
        ``attention_mask``.
    device : str or torch.device, default "cuda"
        Device the model and the input tensors are placed on.

    Returns:
    --------
    The model output for the single-example batch; ``outputs.logits`` holds
    the class scores.
    """
    model = model_init().to(device)

    example = ds["train"][0]
    # Wrapping in a list adds the batch dimension (batch of one example).
    # Tensors are created directly on the target device.
    input_ids = torch.tensor([example["input_ids"]], device=device)
    attention_mask = torch.tensor([example["attention_mask"]], device=device)
    # NOTE(review): the tokenized dataset has no token_type_ids column, so the
    # Trainer never sends this input — confirm whether the model needs it here.
    token_type_ids = torch.zeros_like(input_ids)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    # Perform the check the docstring promises instead of relying on the
    # caller to spot "nan" in the printed tensor.
    if torch.isnan(outputs.logits).any():
        warnings.warn(
            "ModernBERT returned NaN logits; there is a dependency issue.",
            RuntimeWarning,
        )

    return outputs

# Run the smoke test on the tokenized dataset and print the logits so they can
# be inspected for NaN values before training starts.
outputs = test_modernbert(ds)
print("Logits:", outputs.logits)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[0.1258, 0.6800]], device='cuda:0')


In [10]:
# Fine-tuning configuration. All tunable values come from the configuration
# cell (output_dir, num_epochs, batch_size, learning_rate, seed).
training_args = TrainingArguments(
    # Locations / reporting
    output_dir=output_dir,
    logging_dir="../bin/logs/modernbert-multirc",
    report_to="none",  # Disable WandB reporting
    # Reproducibility
    seed=seed,
    # Optimisation
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim="adamw_torch_fused",
    bf16=True,  # bfloat16 training
    # Evaluation / checkpointing: evaluate once per epoch, save strategy "no".
    # NOTE(review): in the recorded run the validation loss rises every epoch
    # after the first while accuracy stays around 0.79-0.80; consider fewer
    # epochs or keeping the best checkpoint.
    eval_strategy="epoch",
    save_strategy="no",
    # Console output
    log_level="error",
    disable_tqdm=False,
)

# model_init (rather than a model instance) is handed to the Trainer, which
# builds the model itself.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5175,0.540866,0.785294,0.786131,0.781233,0.78253
2,0.4489,0.63928,0.801225,0.802046,0.797531,0.798832
3,0.3319,0.829146,0.795098,0.79654,0.790833,0.792295
4,0.2746,1.102795,0.798284,0.799092,0.794537,0.795834
5,0.1652,1.543763,0.789706,0.78858,0.788221,0.788388
6,0.0746,1.587519,0.795343,0.796088,0.791579,0.792857


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [11]:
# Persist the fine-tuned model; the quick test below loads it from this path.
trainer.save_model(output_dir="../results/modernbert_multirc")

# Quick Test

In [12]:
# Reload the fine-tuned model from disk through a text-classification
# pipeline. `pipeline` is imported in the notebook's top import cell so that
# all dependencies are declared in one place. The tokenizer is loaded from the
# base checkpoint name rather than from the saved model directory.
classifier = pipeline(
    task="text-classification",
    model="../results/modernbert_multirc",
    tokenizer=model_name_or_path,
    device=0,  # first CUDA device
)

sample = "Smoking is bad for your health."

# Last expression of the cell: displays the predicted label and its score.
classifier(sample)

[{'label': 'correct', 'score': 0.9971210360527039}]