In [1]:
import os
from pathlib import Path

import numpy as np
import torch
import datasets
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn import metrics

torch.set_float32_matmul_precision('high')
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [2]:
model_name_or_path = "answerdotai/ModernBERT-base"
dataset_path = "../../data/race_multirc_contrastive_pairs.hf"
output_dir = "../../results/modernbert-race-multirc"

batch_size = 4
num_epochs = 8
learning_rate = 3e-5
seed = 42

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [10]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        num_labels=2,
        label2id={"incorrect": 0, "correct": 1},
        id2label={0: "incorrect", 1: "correct"},
    )   
    return model

In [8]:
def preprocess_function(example):
    input_str = f'{example["passage"]}\n\n{example["question"]}\n\n{example["answer"]}'
    return tokenizer(input_str, truncation=True)

dd = datasets.DatasetDict.load_from_disk(dataset_path)
dd = dd.map(
    preprocess_function,
    batched=False,
    remove_columns=[
        "passage", "question", "answer",
        "passage_id", "question_id", "answer_id",
    ],
)

Map:   0%|          | 0/146675 [00:00<?, ? examples/s]

Map:   0%|          | 0/11672 [00:00<?, ? examples/s]

Map:   0%|          | 0/12532 [00:00<?, ? examples/s]

In [9]:
dd

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 146675
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 11672
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 12532
    })
})

In [11]:
def compute_metrics(eval_pred):
    """    
    Parameters:
    -----------
    eval_pred : tuple
        A tuple of (logits, labels) provided by the Hugging Face Trainer.
        - logits: numpy array of shape (n_samples, 2) for binary classification
        - labels: numpy array of shape (n_samples,)
    
    Returns:
    --------
    dict
        Dictionary containing various metrics:
        - accuracy: Accuracy score
        - precision: Precision score
        - recall: Recall score
        - f1: F1 score
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    # Calculate accuracy
    accuracy = metrics.accuracy_score(labels, preds)
    
    # Calculate precision, recall, f1
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        labels, 
        preds, 
        average='macro',
        zero_division=0,
    )
    
    # Return metrics dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [14]:
def test_modernbert(ds):
    """
    Make sure ModernBert model is not returning NaNs.

    If the logits tensor has NaN values, there is a dependency issue.
    """
    model = model_init().to("cuda")

    with torch.no_grad():
        batch = ds["train"][0]
        input_ids = torch.tensor([batch["input_ids"]])
        attention_mask=torch.tensor([batch["attention_mask"]])
        token_type_ids=torch.zeros_like(input_ids)
        
        outputs = model(
            input_ids=input_ids.to("cuda"),
            attention_mask=attention_mask.to("cuda"),
            token_type_ids=token_type_ids.to("cuda")
        )
        
        
    return outputs

outputs = test_modernbert(dd)
print("Logits:", outputs.logits)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[-0.3040,  0.0477]], device='cuda:0')


In [16]:
training_args = TrainingArguments(
    output_dir = output_dir,
    bf16 = True, # bfloat16 training 
    optim = "adamw_torch_fused",
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = learning_rate,
    logging_dir = f'../bin/logs/modernbert-multirc',
    eval_strategy = "epoch",
    save_strategy = "epoch",
    seed = seed,
    log_level = 'error',  
    disable_tqdm = False, 
    report_to = "none", # Disable WandB reporting
) 

trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dd["train"],
    eval_dataset = dd["dev"],
    compute_metrics = compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6956,0.696167,0.476268,0.238134,0.5,0.322616
2,0.6935,0.691129,0.523732,0.261866,0.5,0.343717
3,0.6965,0.694403,0.497515,0.528015,0.51422,0.437131
4,0.6942,0.698193,0.523732,0.261866,0.5,0.343717
5,0.689,0.677412,0.552947,0.550584,0.549655,0.548837
6,0.6794,0.687225,0.560315,0.585556,0.543819,0.489091
7,0.6798,0.712365,0.565541,0.563328,0.560929,0.558779


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

In [17]:
trainer.save_model("../../results/modernbert_race_multirc")

# Quick Test

In [12]:
from transformers import pipeline
 
classifier = pipeline(
    task="text-classification", 
    model="../../results/modernbert_race_multirc",
    tokenizer=model_name_or_path,
    device=0,
)
 
sample = "Smoking is bad for your health."
 
classifier(sample)

[{'label': 'correct', 'score': 0.9971210360527039}]