In [2]:
!pip install transformers

import os
import torch
import pyarrow

import numpy as np
import pandas as pd
import datasets
from transformers import pipeline
import matplotlib.pyplot as plt
from sklearn import metrics

torch.set_float32_matmul_precision('high')
os.environ["TOKENIZERS_PARALLELISM"]="true"

Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Using cached transformers-4.49.0-py3-none-any.whl (10.0 MB)
Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (792 kB)
Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: safetensors, regex, tokenizers, transformers
Succ

In [5]:
# Identifiers of the base checkpoints. The pipeline cells below pass these as the
# `tokenizer=` argument for the corresponding classifier.
model_name_or_path_mpnet = "microsoft/mpnet-base"
model_name_or_path_modernbert = "answerdotai/ModernBERT-base"

In [6]:
# Text-classification pipeline for the ModernBERT classifier stored under
# ../results/modernbert_multirc, tokenized with the base checkpoint's tokenizer
# and placed on device 0.
pipe_modernbert = pipeline(
    "text-classification",
    model="../results/modernbert_multirc",
    tokenizer=model_name_or_path_modernbert,
    device=0,
)

# Smoke test: classify one sentence and display the raw pipeline output
# (a list holding one {'label', 'score'} dict).
sample = "Smoking is bad for your health."
pipe_modernbert(sample)

Device set to use cuda:0


[{'label': 'correct', 'score': 0.9971206188201904}]

In [7]:
# Text-classification pipeline for the MPNet classifier stored under
# ../bin/mpnet_classifier, tokenized with the base checkpoint's tokenizer
# and placed on device 0.
pipe_mpnet = pipeline(
    "text-classification",
    model="../bin/mpnet_classifier",
    tokenizer=model_name_or_path_mpnet,
    device=0,
)

# Smoke test: classify one sentence and display the raw pipeline output
# (a list holding one {'label', 'score'} dict).
sample = "Smoking is bad for your health."
pipe_mpnet(sample)

Device set to use cuda:0


[{'label': 'incorrect_answer', 'score': 0.696195662021637}]

In [None]:
# On-disk location of the evaluation dataset, relative to this notebook
# (presumably the MultiRC splits, judging by the file name — confirm).
dataset_path = "../bin/multirc_dataset.hf"

# Load every split as a DatasetDict; the evaluation further down reads the "test" split.
ds = datasets.DatasetDict.load_from_disk(dataset_path)

In [None]:
def evaluate_model(pipe, dataset, label_key="label"):
    """Score a text-classification pipeline on the ``"test"`` split of ``dataset``.

    Predicted label *names* are converted to class ids by inverting
    ``pipe.model.config.id2label`` (the mapping the pipeline itself uses to name
    its outputs). This works for default names such as ``"LABEL_1"`` as well as
    custom ones such as ``"correct"`` / ``"incorrect_answer"``, where parsing the
    last character as a digit would raise ``ValueError``.

    Args:
        pipe: Callable pipeline. ``pipe(text)`` must return a list whose first
            element is a dict with a ``"label"`` key, and
            ``pipe.model.config.id2label`` must map class ids to label names.
        dataset: Mapping with a ``"test"`` entry yielding examples that have a
            ``"text"`` field and a ``label_key`` field.
        label_key: Name of the field holding the gold label.

    Returns:
        Tuple ``(accuracy, weighted_f1)``.

    Raises:
        KeyError: If a label name is not present in the model's ``id2label``.
    """
    # name -> class id, derived from the same table the pipeline uses for output.
    label_to_id = {
        name: int(class_id)
        for class_id, name in pipe.model.config.id2label.items()
    }

    preds, labels = [], []

    for example in dataset["test"]:
        result = pipe(example["text"])
        preds.append(label_to_id[result[0]["label"]])

        gold = example[label_key]
        # Gold labels stored as names are mapped to ids too, so both lists share one
        # type. Integer gold labels are assumed to already use the model's class ids
        # — TODO confirm against the training setup.
        if isinstance(gold, str):
            gold = label_to_id[gold]
        labels.append(gold)

    accuracy = metrics.accuracy_score(labels, preds)
    f1 = metrics.f1_score(labels, preds, average="weighted")
    return accuracy, f1

In [None]:
# Evaluate both classifiers on the same dataset; each call returns an
# (accuracy, weighted F1) tuple. The pipelines are the objects created above as
# `pipe_modernbert` / `pipe_mpnet`.
modernbert_metrics = evaluate_model(pipe_modernbert, ds)
mpnet_metrics = evaluate_model(pipe_mpnet, ds)

In [None]:
# Print accuracy and F1 for each model, four decimal places, with a blank line
# separating the two reports.
for report_heading, report_scores in (
    ("ModernBERT Metrics:", modernbert_metrics),
    ("\nMPNet Metrics:", mpnet_metrics),
):
    print(report_heading)
    print(f"Accuracy: {report_scores[0]:.4f}, F1 Score: {report_scores[1]:.4f}")

In [None]:
# Grouped bar chart: one group per metric, one bar per model within each group.
labels = ["Accuracy", "F1 Score"]
modernbert_values = [*modernbert_metrics]
mpnet_values = [*mpnet_metrics]

width = 0.35  # width of a single bar
x = np.arange(len(labels))  # centre position of each metric group

fig, ax = plt.subplots()
# Shift each model's bars half a bar-width either side of the group centre.
rects1 = ax.bar(x - width / 2, modernbert_values, width, label="ModernBERT")
rects2 = ax.bar(x + width / 2, mpnet_values, width, label="MPNet")

ax.set(ylabel="Score", title="Model Performance Comparison")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()