In [20]:
# Upgrade Pytorch & other libraries
%pip install --upgrade --quiet \
    torch torchvision torchaudio \
    transformers accelerate datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    unknown package:
        Expected sha256 7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1
             Got        e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [21]:
import os
from pathlib import Path

import numpy as np
import torch
import datasets
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn import metrics

torch.set_float32_matmul_precision('high')
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [22]:
model_name_or_path = "answerdotai/ModernBERT-base"
dataset_path = "../bin/multirc_dataset.hf"
output_dir = "../bin/modernbert-multirc"

batch_size = 4
num_epochs = 8
learning_rate = 3e-5
seed = 42

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [23]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        num_labels=2,
        label2id={"incorrect": 0, "correct": 1},
        id2label={0: "incorrect", 1: "correct"},
    )   
    return model

In [24]:
def preprocess_function(example):
    return tokenizer(example["text"], truncation=True)

ds = datasets.DatasetDict.load_from_disk(dataset_path)
ds = ds.map(preprocess_function, batched=True, remove_columns=["text"])

In [25]:
ds["test"]["labels"][0:10]

[1, 0, 0, 0, 1, 0, 1, 0, 0, 1]

In [26]:
ds

DatasetDict({
    train: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 19170
    })
    valid: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4080
    })
    test: Dataset({
        features: ['index', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3962
    })
})

In [27]:
from transformers import Trainer, AutoModelForSequenceClassification, AutoTokenizer
import torch

model_path = "../results/modernbert_multirc"
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [3]:
import torch
from time import perf_counter

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

from transformers import AutoModelForSequenceClassification

preds = []
times = []
model = AutoModelForSequenceClassification.from_pretrained("../bin/modernbert_classifier").to(device)
# Check classifier path
for text in ds['test']['text']:
    start_time = perf_counter()
    inputs = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    preds.append(model.config.id2label[predicted_class_id])
    times.append(perf_counter() - start_time)

cuda


NameError: name 'ds' is not defined

In [28]:
df = ds['test'].to_pandas()
df['preds']=preds
df['times']=times
df.to_csv('modernbert-results.csv')

In [29]:
labels = []
for x in ds['test']['labels']:
    if x == 1:
        labels.append('correct_answer' )
    else: 
        labels.append('incorrect_answer') 

from sklearn import metrics
from matplotlib import pyplot as plt

print(metrics.classification_report(labels, preds))

                  precision    recall  f1-score   support

         LABEL_0       0.00      0.00      0.00       0.0
  correct_answer       0.00      0.00      0.00    1722.0
incorrect_answer       0.00      0.00      0.00    2240.0

        accuracy                           0.00    3962.0
       macro avg       0.00      0.00      0.00    3962.0
    weighted avg       0.00      0.00      0.00    3962.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Alternative

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from time import perf_counter

model_path = "../results/modernbert_checkpoints/checkpoint-38344"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

print(f"Model config: {model.config}")
print(f"Number of labels: {model.config.num_labels}")
print(f"Problem type: {model.config.problem_type}")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

raw_preds = []
pred_labels = []
true_labels = []
inference_times = []

print(f"Running inference on test set...")
for i, example in enumerate(ds['test']):
    
    true_label = int(example['labels'])
    true_labels.append(true_label)
    
    start_time = perf_counter()
    
    inputs = tokenizer(example['text'], return_tensors='pt', truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    inference_time = perf_counter() - start_time
    inference_times.append(inference_time)
    
    logits = outputs.logits.cpu().numpy().flatten()
    print(logits)
    raw_preds.append(logits)
    

    pred_label = np.argmax(logits, axis=0)
    
    pred_labels.append(pred_label)
    
    if i < 5:
        print(f"\nExample {i}:")
        print(f"Text: {example['text'][:100]}...")
        print(f"True label: {true_label}")
        print(f"Raw prediction: {logits}")
        print(f"Predicted label: {pred_label}")

true_labels = np.array(true_labels)
pred_labels = np.array(pred_labels)
raw_preds = np.array(raw_preds)

print("\nPrediction distribution:")
print(f"Unique predicted labels: {np.unique(pred_labels, return_counts=True)}")
print(f"Unique true labels: {np.unique(true_labels, return_counts=True)}")

avg_time = sum(inference_times) / len(inference_times)
print(f"\nAverage inference time per example: {avg_time:.4f} seconds")
print(f"Total inference time: {sum(inference_times):.2f} seconds")

if len(np.unique(pred_labels)) > 1 and len(np.unique(true_labels)) > 1:
    accuracy = metrics.accuracy_score(true_labels, pred_labels)
    precision = metrics.precision_score(true_labels, pred_labels, zero_division=0)
    recall = metrics.recall_score(true_labels, pred_labels, zero_division=0)
    f1 = metrics.f1_score(true_labels, pred_labels, zero_division=0)
    
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    cm = metrics.confusion_matrix(true_labels, pred_labels)
    print(f"Confusion Matrix:\n{cm}")
    
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['incorrect', 'correct'])
    cm_display.plot()
    plt.title("Predicted and True Classifications (ModernBERT)")
    plt.show()
    
    print("\nClassification Report:")
    print(metrics.classification_report(true_labels, pred_labels, 
                                       target_names=['incorrect_answer', 'correct_answer']))
else:
    print("\nWARNING: Cannot calculate metrics - predictions or true labels are all the same value")
    print(f"All predictions: {pred_labels[0]}")
    print(f"Raw prediction examples: {raw_preds[:5]}")

results_df = pd.DataFrame({
    'text': [ex['text'] for ex in ds['test']],
    'true_label': true_labels,
    'predicted_label': pred_labels,
    'raw_prediction': [p[0] for p in raw_preds],
    'inference_time': inference_times
})
results_df.to_csv('modernbert_debug_results.csv')