In [42]:
!pip install evaluate

import os
import numpy as np
from pathlib import Path
import argparse
import logging
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, Value
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
import evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [43]:
model_name_or_path = "answerdotai/ModernBERT-base"
dataset_path = "../bin/multirc_dataset.hf"
output_dir = "../results/modernbert-training"

batch_size = 4
num_epochs = 8
learning_rate = 3e-5
seed = 42
metric = 'accuracy'

id2label = {0: "incorrect_answer", 1: "correct_answer"}
label2id = {"incorrect_answer": 0, "correct_answer": 1}

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path)

def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=1)   
    return model

In [44]:
def preprocess_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)

ds = DatasetDict.load_from_disk(dataset_path)
ds = ds.map(preprocess_function, batched=False)
ds = ds.cast_column("labels", Value("float32"))

In [45]:
ds

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 19170
    })
    valid: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4080
    })
    test: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3962
    })
})

In [46]:
ds["test"].features

{'index': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'labels': Value(dtype='float32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [47]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [48]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

In [None]:
trainer = Trainer(
    model_init=model_init,
)

training_args = TrainingArguments(
    output_dir = "../results/modernbert_checkpoints", # Modify path
    optim = 'adamw_torch',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    # weight_decay = 0.01, # Leave at default value for this model? Ask Wes.
    learning_rate = learning_rate,
    logging_dir = f'../bin/logs/content', # Modify path
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy',
    eval_strategy = "epoch",
    save_strategy = "epoch", 
    greater_is_better = True,
    seed=seed,
    log_level = 'error',  
    disable_tqdm = False, 
    report_to = "none", # Disable WandB reporting
) 

trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator=data_collator,
    train_dataset = ds['train'],
    eval_dataset = ds['valid'],
    compute_metrics = compute_metrics
)

trainer.train()

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
#Alternative

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from time import perf_counter

model_path = "../results/modernbert_checkpoints/checkpoint-38344"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

print(f"Model config: {model.config}")
print(f"Number of labels: {model.config.num_labels}")
print(f"Problem type: {model.config.problem_type}")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

raw_preds = []
pred_labels = []
true_labels = []
inference_times = []

print(f"Running inference on test set...")
for i, example in enumerate(ds['test']):
    
    true_label = int(example['labels'])
    true_labels.append(true_label)
    
    start_time = perf_counter()
    
    inputs = tokenizer(example['text'], return_tensors='pt', truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    inference_time = perf_counter() - start_time
    inference_times.append(inference_time)
    
    logits = outputs.logits.cpu().numpy().flatten()
    print(logits)
    raw_preds.append(logits)
    

    pred_label = np.argmax(logits, axis=0)
    
    pred_labels.append(pred_label)
    
    if i < 5:
        print(f"\nExample {i}:")
        print(f"Text: {example['text'][:100]}...")
        print(f"True label: {true_label}")
        print(f"Raw prediction: {logits}")
        print(f"Predicted label: {pred_label}")

true_labels = np.array(true_labels)
pred_labels = np.array(pred_labels)
raw_preds = np.array(raw_preds)

print("\nPrediction distribution:")
print(f"Unique predicted labels: {np.unique(pred_labels, return_counts=True)}")
print(f"Unique true labels: {np.unique(true_labels, return_counts=True)}")

avg_time = sum(inference_times) / len(inference_times)
print(f"\nAverage inference time per example: {avg_time:.4f} seconds")
print(f"Total inference time: {sum(inference_times):.2f} seconds")

if len(np.unique(pred_labels)) > 1 and len(np.unique(true_labels)) > 1:
    accuracy = metrics.accuracy_score(true_labels, pred_labels)
    precision = metrics.precision_score(true_labels, pred_labels, zero_division=0)
    recall = metrics.recall_score(true_labels, pred_labels, zero_division=0)
    f1 = metrics.f1_score(true_labels, pred_labels, zero_division=0)
    
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    cm = metrics.confusion_matrix(true_labels, pred_labels)
    print(f"Confusion Matrix:\n{cm}")
    
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['incorrect', 'correct'])
    cm_display.plot()
    plt.title("Predicted and True Classifications (ModernBERT)")
    plt.show()
    
    print("\nClassification Report:")
    print(metrics.classification_report(true_labels, pred_labels, 
                                       target_names=['incorrect_answer', 'correct_answer']))
else:
    print("\nWARNING: Cannot calculate metrics - predictions or true labels are all the same value")
    print(f"All predictions: {pred_labels[0]}")
    print(f"Raw prediction examples: {raw_preds[:5]}")

results_df = pd.DataFrame({
    'text': [ex['text'] for ex in ds['test']],
    'true_label': true_labels,
    'predicted_label': pred_labels,
    'raw_prediction': [p[0] for p in raw_preds],
    'inference_time': inference_times
})
results_df.to_csv('modernbert_debug_results.csv')

In [57]:
!pip install "transformers[torch]>=4.35.0"
!pip install "accelerate>=0.26.0"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [60]:
from transformers import Trainer, AutoModelForSequenceClassification, AutoTokenizer
import torch

model_path = "../results/modernbert_checkpoints/checkpoint-38344"
model = AutoModelForSequenceClassification.from_pretrained(model_path)


In [61]:
import time

start_time = time.time()
preds, labels, metrics= trainer.predict(ds['test'])
predictions = np.argmax(preds, axis=1)
end_time = time.time()
print(end_time-start_time)

NameError: name 'trainer' is not defined

In [62]:
from sklearn import metrics
from matplotlib import pyplot as plt

confusion_matrix = metrics.confusion_matrix(labels, predictions)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['incorrect', 'correct'])

cm_display.plot()
plt.title('Predicted and True Classifications of Correct and Incorrect Answers (ModernBERT)')
plt.show()

NameError: name 'labels' is not defined

In [63]:
print(metrics.classification_report(labels, predictions))

NameError: name 'labels' is not defined

In [64]:
trainer.save_model("../bin/modernbert_classifier") # Modify path - save in summary scoring bin

NameError: name 'trainer' is not defined

In [65]:
import sklearn
sklearn.metrics.cohen_kappa_score(labels, predictions)

NameError: name 'labels' is not defined

In [66]:
import torch
from time import perf_counter

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

from transformers import AutoModelForSequenceClassification

preds = []
times = []
model = AutoModelForSequenceClassification.from_pretrained("../bin/modernbert_classifier").to(device)
# Check classifier path
for text in ds['test']['text']:
    start_time = perf_counter()
    inputs = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    preds.append(model.config.id2label[predicted_class_id])
    times.append(perf_counter() - start_time)


cuda


In [67]:
df = ds['test'].to_pandas()
df['preds']=preds
df['times']=times
df.to_csv('modernbert-results.csv')

In [68]:
labels = []
for x in ds['test']['labels']:
    if x == 1:
        labels.append('correct_answer' )
    else: 
        labels.append('incorrect_answer') 

from sklearn import metrics
from matplotlib import pyplot as plt

print(metrics.classification_report(labels, preds))

                  precision    recall  f1-score   support

         LABEL_0       0.00      0.00      0.00       0.0
  correct_answer       0.00      0.00      0.00    1722.0
incorrect_answer       0.00      0.00      0.00    2240.0

        accuracy                           0.00    3962.0
       macro avg       0.00      0.00      0.00    3962.0
    weighted avg       0.00      0.00      0.00    3962.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("short-answer-classification")
tokenizer.push_to_hub("short-answer-classification")

In [None]:
from datasets import load_dataset
import os

dataset_path = "active-projects/textbook-question-generation/data/multirc-v2"

dataset_files = {
    "train": os.path.join(dataset_path, "train_456-fixedlds.json"),
    "valid": os.path.join(dataset_path, "dev_83-fixedlds.json"),
}

ds = load_dataset("json", data_files=dataset_files)

ds = ds.map(preprocess_function, batched=False)

In [None]:
ds

In [None]:
citation = """
@misc{modernbert,
      title={Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference}, 
      author={Benjamin Warner and Antoine Chaffin and Benjamin Clavié and Orion Weller and Oskar Hallström and Said Taghadouini and Alexis Gallagher and Raja Biswas and Faisal Ladhak and Tom Aarsen and Nathan Cooper and Griffin Adams and Jeremy Howard and Iacopo Poli},
      year={2024},
      eprint={2412.13663},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.13663}
}
"""