# Check performance

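This notebook evaluates each model pipeline (Mpnet, Bleurt, ModernBERT) on authentic data that was labeled by human annotators on our team, and reports accuracy and macro-averaged F1 for each model.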

In [1]:
import os
import re
from collections import defaultdict

import torch
import numpy as np
import pandas as pd
import datasets
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score

from models import Bleurt, Mpnet, ModernBERT

# Turn on parallelism in the Hugging Face tokenizers library.
os.environ.update(TOKENIZERS_PARALLELISM="true")

# "high" permits faster reduced-precision kernels for float32 matmuls
# (see the PyTorch docs for the exact precision trade-off).
torch.set_float32_matmul_precision("high")

In [2]:
# One entry per model under evaluation, keyed by the name used when reporting
# results. Each wrapper is instantiated here, in this order.
pipes = dict(
    Mpnet_pipe=Mpnet(),
    Bleurt_pipe=Bleurt(),
    ModernBERT_pipe=ModernBERT(),
)

Device set to use cuda
Device set to use cuda
Device set to use cuda


In [2]:
# Location of the human-annotated dataset on disk. The default is the path used
# in the original environment; set the CRI_ANNOTATIONS_PATH environment variable
# to point elsewhere when running this notebook on another machine.
dataset_path = os.environ.get(
    "CRI_ANNOTATIONS_PATH",
    "/home/jovyan/active-projects/itell-question-generation/data/cri_annotations.hf",
)

ds = datasets.Dataset.load_from_disk(dataset_path)
ds

Dataset({
    features: ['question_id', 'chunk_header', 'text', 'question', 'reference', 'candidate', 'score', 'annotator', 'labels'],
    num_rows: 370
})

In [5]:
def score(y_true, y_pred):
    """Summarise classification quality for one set of predictions.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with two entries: ``"accuracy"`` (from ``accuracy_score``) and
        ``"f1_macro"`` (from ``f1_score`` with ``average="macro"``).
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

def evaluate_and_score(dataset, pipes, label_key="labels"):
    """Run every pipeline over the dataset and score it against the gold labels.

    Args:
        dataset: Iterable of examples, each exposing ``"candidate"`` and
            ``"reference"`` entries, that also supports column access via
            ``dataset[label_key]``.
        pipes: Mapping of model name -> callable invoked as
            ``pipe(candidate, reference)`` and returning a predicted label.
        label_key: Name of the column holding the gold labels.

    Returns:
        dict mapping each model name to the metrics dict produced by ``score``.
    """
    # Honor label_key (previously ignored in favor of a hard-coded "labels") and
    # read the label column once instead of once per model.
    y_true = dataset[label_key]

    # Collect predictions for every model before scoring any of them.
    all_preds = {
        name: [pipe(ex["candidate"], ex["reference"]) for ex in dataset]
        for name, pipe in pipes.items()
    }

    return {name: score(y_true, preds) for name, preds in all_preds.items()}

# Evaluate every pipeline on the annotated dataset.
results = evaluate_and_score(dataset=ds, pipes=pipes)

# Report one line per model, in the order the pipelines were defined.
for model_name in results:
    m = results[model_name]
    print(f"{model_name}:  acc={m['accuracy']:.4f}, f1_macro={m['f1_macro']:.4f}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (217 > 128). Running this sequence through the model will result in indexing errors


Mpnet_pipe:  acc=0.7297, f1_macro=0.6656
Bleurt_pipe:  acc=0.6919, f1_macro=0.6232
ModernBERT_pipe:  acc=0.7405, f1_macro=0.6690
