# Train ModernBERT on Pseudo-labeled Data

Pseudo-labeled data comes from two sources:
- MultiRC, pseudo-labeled by GPT5
- Authentic iTELL data, pseudo-labeled by o3-mini

Humans have labeled a separate, non-overlapping portion of the authentic iTELL data. That human-labeled portion is our held-out test set; it is never used for training or validation.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import datasets
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn import metrics

# Request PyTorch's 'high' float32 matmul precision mode (see the torch docs for
# the exact speed/precision trade-off this setting permits on the current hardware).
torch.set_float32_matmul_precision('high')
# Explicitly opt in to parallelism in the tokenizers backend via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [2]:
# --- Model and output locations ---
# Pretrained checkpoint that is fine-tuned below; also used to load the tokenizer.
model_name_or_path = "answerdotai/ModernBERT-base"
# Passed to TrainingArguments as the output directory for the training run.
output_dir = "../../results/modernbert-multirc-pseudo-labeled"

# Training/Validation Data:
datadict_path = "../../data/authentic-03-scores-multirc-gpt5-scores.hf" # The prepared training and validation data
multirc_path = "../../data/multirc-data-w-gpt5-scores.csv" # A subsample of MultiRC, scored by GPT 5
authentic_path = "../../data/authentic_train_data.csv" # Authentic data from iTELL, scored by o3-mini using the same rubric/prompt

# Test Data:
test_data_path = "../../data/authentic_test_data.csv" # Authentic data from iTELL, scored by the iTELL development team

# --- Hyper-parameters ---
# batch_size is used for both the per-device train and eval batch sizes.
batch_size = 4
num_epochs = 8
learning_rate = 3e-5
# Seed handed to TrainingArguments.
seed = 42

# Tokenizer matching the base checkpoint; shared by preprocessing and the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

## Construct Dataset

In [3]:
def load_scored_responses(csv_path, score_column):
    """Read one scored CSV and reduce it to the model inputs plus a ``label`` column.

    Parameters:
    -----------
    csv_path : str
        Path of the CSV file to read.
    score_column : str
        Name of the column holding the score for this source; it is renamed
        to ``label`` so that all sources share the same schema.

    Returns:
    --------
    pandas.DataFrame
        Columns ``chunk_text``, ``question``, ``response`` and ``label``.
    """
    return (
        pd.read_csv(csv_path)
        [["chunk_text", "question", "response", score_column]]
        .rename(columns={score_column: "label"})
    )


# Pseudo-labeled sources (training/validation) and the human-labeled test set.
train_dev_df1 = load_scored_responses(multirc_path, "gpt5_score")
train_dev_df2 = load_scored_responses(authentic_path, "o3_mini_score")
test_df = load_scored_responses(test_data_path, "human_score")

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both sources keep their own row numbers and index labels are duplicated.
train_dev_df = pd.concat([train_dev_df1, train_dev_df2], ignore_index=True)
train_dev_df

Unnamed: 0,chunk_text,question,response,label
0,A flood occurs when a river overflows its bank...,What forms the raised strip near the edge of a...,Sandy desert,1
1,Force is a vector. What then is a vector? Thin...,What two pieces of information does a vector p...,Motion and distance,2
2,"Madrid, Spain (CNN) -- Relatives of a woman ki...",Where was the Spanish MD82 bound for when the ...,Spain's Barcelona,1
3,Flowing water causes sediment to move. Flowing...,How long does it take for water to dissolve ro...,Few days,1
4,How would the universe look without gravity? I...,How would the universe look without gravity?,No planets,2
...,...,...,...,...
1053,Let’s begin with a brief overview of spectacul...,What were economic conditions like before 1870?,"Slow technological progress, natural disasters...",4
1054,Let’s begin with a brief overview of spectacul...,What were economic conditions like before 1870?,Economic conditions before 1870 were sluggish ...,2
1055,Let’s begin with a brief overview of spectacul...,What were economic conditions like before 1870?,economic conditions were slow,2
1056,Let’s begin with a brief overview of spectacul...,What were economic conditions like before 1870?,"Before 1870, economic conditions were relative...",2


In [4]:
train_dev_ds = datasets.Dataset.from_pandas(train_dev_df, preserve_index=False)
test_ds = datasets.Dataset.from_pandas(test_df, preserve_index=False)

# Hold out 10% of the pseudo-labeled data for validation. The split uses the
# notebook-wide `seed` so that changing it in the config cell changes the
# split and the training run together instead of leaving a stray literal here.
train_dev_split = train_dev_ds.train_test_split(test_size=0.10, seed=seed)

# Name the splits explicitly: the 10% hold-out is the dev set, and "test" is
# the separately loaded human-labeled data.
dd = datasets.DatasetDict({
    "train": train_dev_split["train"],
    "test": test_ds,
    "dev": train_dev_split["test"],
})
dd

DatasetDict({
    train: Dataset({
        features: ['chunk_text', 'question', 'response', 'label'],
        num_rows: 5004
    })
    test: Dataset({
        features: ['chunk_text', 'question', 'response', 'label'],
        num_rows: 370
    })
    dev: Dataset({
        features: ['chunk_text', 'question', 'response', 'label'],
        num_rows: 556
    })
})

In [5]:
# Persist the train/dev/test splits to `datadict_path`; the preparation cell reloads them from there.
dd.save_to_disk(datadict_path)

Saving the dataset (0/1 shards):   0%|          | 0/5004 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/370 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/556 [00:00<?, ? examples/s]

## Prepare Dataset

In [14]:
def preprocess_function(example):
    """Tokenize one example as passage, question and response in a single string.

    Parameters:
    -----------
    example : dict
        One dataset row with the keys "chunk_text", "question" and "response".

    Returns:
    --------
    The output of the module-level `tokenizer` for the joined string. The
    label is not read or modified here.
    """
    # The three fields are separated by three newlines each.
    input_str = f'{example["chunk_text"]}\n\n\n{example["question"]}\n\n\n{example["response"]}'
    # Tokenize exactly once; the previous version tokenized every example
    # twice and threw the first result away.
    return tokenizer(input_str)

# Reload the saved splits and tokenize them row by row, dropping the raw text
# columns once they have been turned into model inputs.
dd = (
    datasets.DatasetDict
    .load_from_disk(datadict_path)
    .map(
        preprocess_function,
        batched=False,
        remove_columns=["chunk_text", "question", "response"],
    )
)

# Cast the label column to float32, keeping every other feature as it is.
new_features = dd["train"].features.copy()
new_features["label"] = datasets.Value("float32")
dd = dd.cast(new_features)
dd

Casting the dataset:   0%|          | 0/5004 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/370 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/556 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5004
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 370
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 556
    })
})

## Set Up Training

In [15]:
# Named score categories; the position in the tuple is the integer id.
# These maps are not currently passed to the model (model_init requests a
# single output), but are kept for reference.
label2id = {
    name: idx
    for idx, name in enumerate(("Distracted", "Borderline", "Proficient", "Expert"))
}
# Inverse lookup: integer id -> category name.
id2label = dict(zip(label2id.values(), label2id.keys()))

def model_init():
    """Create a fresh sequence-classification model for the Trainer.

    The model is loaded from the checkpoint named by `model_name_or_path`
    with a single output (`num_labels=1`), i.e. it predicts one scalar score
    per example rather than a distribution over the named categories.

    Returns:
    --------
    The model returned by `AutoModelForSequenceClassification.from_pretrained`.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        num_labels=1,
    )

In [16]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro precision/recall/F1 for an evaluation pass.

    The model in this notebook has a single output, so its logits are one
    score per example. Taking an argmax over that single column always yields
    0, which would score every prediction as class 0. Instead the score is
    rounded to the nearest integer and clipped to the range of labels present
    in the evaluation data. Multi-column logits are still handled with argmax.

    Parameters:
    -----------
    eval_pred : tuple
        A tuple of (logits, labels) provided by the Hugging Face Trainer.
        - logits: array of shape (n_samples, 1) or (n_samples,) for a single
          regression output, or (n_samples, n_classes) for classification
        - labels: array of shape (n_samples,)

    Returns:
    --------
    dict
        Dictionary containing:
        - accuracy: Accuracy score
        - precision: Macro-averaged precision
        - recall: Macro-averaged recall
        - f1: Macro-averaged F1 score
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    if logits.ndim > 1 and logits.shape[-1] > 1:
        # Classification head: the predicted class is the highest-scoring column.
        preds = np.argmax(logits, axis=-1)
        targets = labels
    else:
        # Single regression output: turn the continuous score into the nearest
        # integer label. Labels arrive as floats, so round them the same way.
        targets = np.rint(labels.reshape(-1)).astype(int)
        scores = np.rint(logits.reshape(-1))
        # Clip so an over- or under-shooting score maps to the nearest valid
        # label instead of creating a spurious extra class in the macro average.
        preds = np.clip(scores, targets.min(), targets.max()).astype(int)

    accuracy = metrics.accuracy_score(targets, preds)

    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        targets,
        preds,
        average='macro',
        zero_division=0,
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [17]:
# Pad each batch to the length of its longest sequence rather than to a fixed length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [None]:
training_args = TrainingArguments(
    # Where the training run writes its output and logs.
    output_dir=output_dir,
    logging_dir="../../logs",
    # Optimisation settings, taken from the config cell.
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim="adamw_torch_fused",
    bf16=True,  # bfloat16 training
    seed=seed,
    # Evaluate on the dev set and save once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Console output: errors only, progress bars on, no external reporting.
    log_level='error',
    disable_tqdm=False,
    report_to="none",  # Disable WandB reporting
)

# model_init (rather than a model instance) lets the Trainer build the model itself.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=dd["train"],
    eval_dataset=dd["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Save the trained model to its own directory (separate from `output_dir`);
# the test cell loads the model from this path.
trainer.save_model("../../results/modernbert_authentic_multirc")

## Test

In [None]:
from transformers import pipeline

# Load the fine-tuned weights for inference; the tokenizer comes from the base
# checkpoint named by `model_name_or_path`.
classifier = pipeline(
    task="text-classification",
    model="../../results/modernbert_authentic_multirc",
    tokenizer=model_name_or_path,
    # Use the first GPU when one is available, otherwise fall back to CPU
    # instead of failing on a machine without CUDA.
    device=0 if torch.cuda.is_available() else -1,
)

# Score a real held-out example, formatted exactly like the training inputs
# (passage, question and response separated by three newlines). A bare
# sentence is not what the model was trained on, so its score would not be
# meaningful.
held_out_row = test_df.iloc[0]
sample = (
    f'{held_out_row["chunk_text"]}\n\n\n'
    f'{held_out_row["question"]}\n\n\n'
    f'{held_out_row["response"]}'
)

# The model has a single output, so ask for the raw score rather than letting
# the pipeline squash it through a sigmoid/softmax.
classifier(sample, function_to_apply="none")