In [94]:
import re

from tqdm.auto import tqdm
import pandas as pd
import torch
import datasets
from transformers import pipeline, AutoTokenizer

torch.set_float32_matmul_precision('high')

In [120]:
dataset_path = '../../bin/multirc_dataset.hf'
ds = datasets.DatasetDict.load_from_disk(dataset_path)["test"]
ds

Dataset({
    features: ['index', 'text', 'labels'],
    num_rows: 3962
})

In [123]:
def split_samples(example):
    """Split strings into reference/answer components,
    so models can join them together differently."""
    candidate, reference = example["text"].split("</s>")
    reference = candidate.strip().removeprefix("Answer:").strip()
    example["answer"] = reference
    example["response"] = candidate
    return example

ds = ds.map(split_samples, remove_columns=["index", "text"])

In [124]:
class Bleurt():
    model_name = "vaiibhavgupta/finetuned-bleurt-large"
    threshold = 0.7

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            device="cuda",
        )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")
        
        sequence = f"{candidate}[SEP]{reference}"

        result = self.classifier(sequence)
        score = result[0]["score"]

        return 1 if score > self.threshold else 0

In [125]:
class Mpnet():
    model_name = "tiedaar/short-answer-classification"
    revision = "77b846ec4606bfcfdf913888d7f0ab51f977a579"

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            revision=self.revision,
            device="cuda",
            )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")
        
        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct_answer" else 0

In [113]:
class ModernBERT():
    model_name = "../../results/modernbert_multirc/"
    tokenizer_name = "answerdotai/ModernBERT-base"

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=AutoTokenizer.from_pretrained(self.tokenizer_name),
            device="cuda",
            )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")

        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct" else 0

In [127]:
# NOTE: these classes are not designed to take advantage of Pipeline's batching optimizations.

pipe_dict = {
    "Mpnet": Mpnet(),
    "Bleurt": Bleurt(),
    "ModernBERT": ModernBERT(),
}

Device set to use cuda
Device set to use cuda
Device set to use cuda


In [128]:
def evaluate_all_models(dataset, pipe_dict):
    pred_dict = {name: [] for name in pipe_dict.keys()}

    for name, pipe in tqdm(pipe_dict.items(), total=len(pipe_dict)):
        for example in tqdm(dataset, total=len(dataset)):
            pred_dict[name].append(pipe(example))

    return pd.DataFrame(pred_dict)

df = evaluate_all_models(ds, pipe_dict)
df["labels"] = ds["labels"]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3962 [00:00<?, ?it/s]

  0%|          | 0/3962 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (209 > 128). Running this sequence through the model will result in indexing errors


  0%|          | 0/3962 [00:00<?, ?it/s]

In [132]:
df.to_csv("../../data/multirc-dataset-preds.csv", index=False)

In [129]:
df["ensemble"] = (
    (df["Mpnet"] == 1)
    | (df["Bleurt"] == 1)
).astype(int)

df[[
    "labels",
    "ensemble",
    "Mpnet",
    "Bleurt",
    "ModernBERT",
]].corr(method="spearman")

Unnamed: 0,labels,ensemble,Mpnet,Bleurt,ModernBERT
labels,1.0,0.584287,0.612339,0.569871,0.572411
ensemble,0.584287,1.0,0.836725,0.921361,0.723595
Mpnet,0.612339,0.836725,1.0,0.74182,0.724845
Bleurt,0.569871,0.921361,0.74182,1.0,0.710561
ModernBERT,0.572411,0.723595,0.724845,0.710561,1.0


In [133]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
)

def get_metrics(df, model_names):
    metrics = []
    
    for model_name in model_names:
        preds = df[model_name]

        acc = accuracy_score(df["labels"], preds)
        p, r, f1, _ = precision_recall_fscore_support(
            df['labels'],
            preds,
            average="binary",
            pos_label=1
        )
        
        metrics.append({
            'Model': model_name,
            'Accuracy': acc,
            'Precision': p,
            'Recall': r,
            'F1-Score': f1
        })

    return pd.DataFrame(metrics)

model_names = ["ensemble", "Mpnet", "Bleurt", "ModernBERT"]
get_metrics(df, model_names).round(3)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,ensemble,0.789,0.722,0.838,0.775
1,Mpnet,0.81,0.794,0.761,0.777
2,Bleurt,0.786,0.738,0.788,0.762
3,ModernBERT,0.79,0.754,0.765,0.76
