In [None]:
# !pip install matplotlib

import os
import torch
import pyarrow

import numpy as np
import pandas as pd
import datasets
from transformers import pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

# Runtime configuration for the rest of the notebook: set the
# TOKENIZERS_PARALLELISM environment flag and select PyTorch's "high"
# float32 matmul precision mode.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
torch.set_float32_matmul_precision("high")

In [3]:
# !pip install accelerate

In [None]:
from compare_models_tobasum import Bleurt, Mpnet, ModernBERT

In [5]:
# Build one instance of each scorer imported from compare_models_tobasum,
# in the order BLEURT, MPNet, ModernBERT.
bleurt, mpnet, modernbert = Bleurt(), Mpnet(), ModernBERT()

Device set to use cuda
Device set to use cuda
Device set to use cuda


In [6]:
# Location of the dataset saved on disk, relative to the notebook's working
# directory (presumably MultiRC, judging by the file name).
dataset_path = "../bin/multirc_dataset.hf"

# Load the saved DatasetDict; later cells read its "test" split.
ds = datasets.DatasetDict.load_from_disk(dataset_path)

In [7]:
# Test reference answer normalization.
# Each test sample's text is expected to split on "</s>" into exactly two
# parts: the response and an "Answer: <reference>" tail. Any sample that does
# not fit that shape is printed so it can be inspected.
for sample in ds["test"]:
    try:
        response, reference = sample["text"].split("</s>")
        reference = reference.strip().removeprefix("Answer:").strip()
    except (ValueError, AttributeError):
        # ValueError: the text did not contain exactly one "</s>" separator.
        # AttributeError: the text was not a string (e.g. None).
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        print(sample["text"])

In [8]:
def get_defined_models():
    """Return the already-instantiated scorers keyed by display name.

    The mapping points at the module-level ``modernbert``, ``mpnet`` and
    ``bleurt`` objects; no new model is created here.

    Returns:
        dict: ``{"ModernBERT": modernbert, "MPNet": mpnet, "BLEURT": bleurt}``,
        in that insertion order.
    """
    registry = {}
    registry["ModernBERT"] = modernbert
    registry["MPNet"] = mpnet
    registry["BLEURT"] = bleurt
    return registry

In [9]:
# Display name -> scorer instance mapping returned by get_defined_models().
# NOTE(review): `models` is rebound to a list of result keys by a later
# plotting cell, so downstream cells do not see this dict.
models = get_defined_models()

In [10]:
# Scorers to evaluate, keyed by the prefix used for their prediction columns
# (e.g. "Mpnet_pipe" -> "Mpnet_pipe_pred").
# Reuse the instances created earlier in the notebook instead of calling
# Mpnet(), Bleurt() and ModernBERT() again: constructing them a second time
# loads another copy of every model onto the device.
# NOTE(review): assumes the scorer objects keep no per-call state - confirm
# against compare_models_tobasum.
pipes = {
    "Mpnet_pipe":      mpnet,
    "Bleurt_pipe":     bleurt,
    "ModernBERT_pipe": modernbert,
}

def evaluate_and_score(dataset, pipes, label_key="labels", split="test"):
    """Run every scorer over one split and compute accuracy and macro-F1.

    Args:
        dataset: Mapping from split name to an iterable of examples. Each
            example must support ``ex["text"]``, ``ex[label_key]`` and
            ``ex.get("reference")`` (a plain dict works).
        pipes: Mapping from a display name to a callable invoked as
            ``pipe(text, reference)``. Its return value is stored as the
            prediction and compared with the true label.
        label_key: Key of the ground-truth label in each example.
        split: Name of the split to evaluate. Defaults to ``"test"``, the
            split that used to be hard-coded.

    Returns:
        tuple: ``(df, metrics)``. ``df`` is a DataFrame with the columns
        ``input_text``, ``true_label`` and one ``"<name>_pred"`` column per
        scorer. ``metrics`` maps each scorer name to
        ``{"accuracy": ..., "f1_macro": ...}``.
    """
    texts, true_labels = [], []
    all_preds = {name: [] for name in pipes}

    for ex in dataset[split]:
        text = ex["text"]
        texts.append(text)
        true_labels.append(ex[label_key])

        # "reference" is optional; scorers receive None when it is absent.
        reference = ex.get("reference")
        for name, pipe in pipes.items():
            all_preds[name].append(pipe(text, reference))

    df = pd.DataFrame({
        "input_text": texts,
        "true_label": true_labels,
        **{f"{name}_pred": preds for name, preds in all_preds.items()},
    })

    def score(col):
        """Accuracy and macro-averaged F1 of prediction column ``col``."""
        return {
            "accuracy": accuracy_score(df["true_label"], df[col]),
            "f1_macro": f1_score(df["true_label"], df[col], average="macro"),
        }

    metrics = {name: score(f"{name}_pred") for name in pipes}

    return df, metrics

# Score every pipeline on the dataset's test split.
df_predictions_multirc, results = evaluate_and_score(ds, pipes)

# df_predictions_multirc.to_csv("predicted_labels_all_models.csv", index=False)
print(df_predictions_multirc.head(), "\n")

# One summary line per pipeline.
for model_name, m in results.items():
    acc, f1_macro = m["accuracy"], m["f1_macro"]
    print(f"{model_name}:  acc={acc:.4f}, f1_macro={f1_macro:.4f}")


Device set to use cuda
Device set to use cuda
Device set to use cuda
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (214 > 128). Running this sequence through the model will result in indexing errors


                                          input_text  true_label  \
0  The Women's Haven of Tarrant County</s>\nAnswe...           1   
1  Tarrant county shelters</s>\nAnswer: The Women...           0   
2  Female's Safe House of Haven County</s>\nAnswe...           0   
3                      California</s>\nAnswer: Texas           0   
4                           Texas</s>\nAnswer: Texas           1   

   Mpnet_pipe_pred  Bleurt_pipe_pred  ModernBERT_pipe_pred  
0                1                 1                     0  
1                0                 0                     0  
2                0                 1                     0  
3                0                 0                     0  
4                1                 1                     0   

Mpnet_pipe:  acc=0.8107, f1_macro=0.8050
Bleurt_pipe:  acc=0.6287, f1_macro=0.5360
ModernBERT_pipe:  acc=0.5654, f1_macro=0.3612


In [None]:
# Grouped bar chart: accuracy next to macro-F1 for every evaluated pipeline.
# `models` is bound here to the list of pipeline names; later cells iterate
# over it.
models = list(results)
accuracies = [results[m]["accuracy"] for m in models]
f1s = [results[m]["f1_macro"] for m in models]

x = range(len(models))
width = 0.35

fig, ax = plt.subplots()
# Shift each series by half a bar width so the two bars sit side by side.
ax.bar([xi - width / 2 for xi in x], accuracies, width, label="Accuracy")
ax.bar([xi + width / 2 for xi in x], f1s, width, label="F1 (Macro)")
ax.set_xticks(list(x))
ax.set_xticklabels(models, rotation=30)
ax.set_ylabel("Score")
ax.set_title("Accuracy and F1 (Macro) by Model")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# One confusion-matrix heatmap per pipeline, computed from
# df_predictions_multirc. Rows are true labels, columns are predictions.
labels = sorted(df_predictions_multirc["true_label"].unique())

for name in models:
    cm = confusion_matrix(
        df_predictions_multirc["true_label"],
        df_predictions_multirc[f"{name}_pred"],
        labels=labels,
    )

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, aspect="auto")
    ax.set_xticks(list(range(len(labels))))
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticks(list(range(len(labels))))
    ax.set_yticklabels(labels)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title(f"Confusion Matrix: {name}")
    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import classification_report

def generate_classification_reports(df, models):
    """Print and collect a classification report for each model.

    Args:
        df: Table indexable by column name, holding a ``"true_label"`` column
            and one ``"<model>_pred"`` column per entry of ``models``.
        models: Iterable of model names used as prediction-column prefixes.

    Returns:
        dict: model name -> the report produced by
        ``classification_report(..., output_dict=True)``.
    """

    def _report_for(model):
        # Build the dict form first, then print the text form of the report.
        y_pred = df[f"{model}_pred"]
        y_true = df["true_label"]
        as_dict = classification_report(y_true, y_pred, output_dict=True)

        print(f"\nClassification Report for {model}:\n")
        print(classification_report(y_true, y_pred))
        return as_dict

    return {model: _report_for(model) for model in models}

# Print and collect the per-pipeline reports for df_predictions_multirc;
# `models` is the list of pipeline names bound in the bar-chart cell.
classification_reports = generate_classification_reports(df_predictions_multirc, models)


In [21]:
# HUMAN-SCORED METRICS

# pip install openpyxl
import re

# Google Sheet with the human-scored examples, downloaded as an .xlsx export.
sheet_id   = "1qZc2b8wWlIRhxDr6Z9r6tzOcLsBrLXmKBvj4zp7-qHU"
export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"

# sheet_name=None loads every worksheet as {worksheet name: DataFrame}.
all_sheets = pd.read_excel(export_url, sheet_name=None)

# Skip worksheets whose name consists only of digits.
filtered = {
    name: df for name, df in all_sheets.items()
    if not re.fullmatch(r"\d+", name)
}

# Stack the remaining worksheets, recording which worksheet each row came from.
dfs = [sheet.assign(sheet_name=name) for name, sheet in filtered.items()]
combined = pd.concat(dfs, ignore_index=True)

# rename to the column names evaluate_and_score reads
combined = combined.rename(
    columns={"reference_answer": "reference", "chunk_text": "text", "score (1-4)": "labels"}
)

# Rows without a human score must not be evaluated: NaN >= 3 is False, so
# they would otherwise silently become negative (0) labels.
n_unscored = int(combined["labels"].isna().sum())
if n_unscored:
    print(f"Dropping {n_unscored} rows without a human score")
    combined = combined.dropna(subset=["labels"]).reset_index(drop=True)

# binarize: scores at or above the threshold on the 1-4 scale become 1
POSITIVE_SCORE_THRESHOLD = 3
combined["labels"] = (combined["labels"] >= POSITIVE_SCORE_THRESHOLD).astype(int)

# evaluate_and_score reads the "test" split by default, so wrap the records.
sheet_human_scored = {
    "test": combined.to_dict(orient="records")
}

# check label space
print("\nUnique labels after binarization:", combined["labels"].unique())



Unique labels after binarization: [0 1]


In [None]:
# Score the same pipelines on the human-scored examples.
human_scored_preds, sheet_metrics = evaluate_and_score(sheet_human_scored, pipes)

print(human_scored_preds.head(), "\n")

# One summary line per pipeline.
for model_name, m in sheet_metrics.items():
    acc, f1_macro = m["accuracy"], m["f1_macro"]
    print(f"{model_name}:  acc={acc:.4f}, f1_macro={f1_macro:.4f}")

In [None]:
# Grouped bar chart: accuracy next to macro-F1 on the human-scored examples.
# `models` is rebound to the pipeline names present in sheet_metrics.
models = list(sheet_metrics)
accuracies = [sheet_metrics[m]["accuracy"] for m in models]
f1s = [sheet_metrics[m]["f1_macro"] for m in models]

x = range(len(models))
width = 0.35

fig, ax = plt.subplots()
# Shift each series by half a bar width so the two bars sit side by side.
ax.bar([xi - width / 2 for xi in x], accuracies, width, label="Accuracy")
ax.bar([xi + width / 2 for xi in x], f1s, width, label="F1 (Macro)")
ax.set_xticks(list(x))
ax.set_xticklabels(models, rotation=30)
ax.set_ylabel("Score")
ax.set_title("Accuracy and F1 (Macro) by Model")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion-matrix heatmap per pipeline for the human-scored predictions.
# Rows are true labels, columns are predicted labels.
labels = sorted(human_scored_preds["true_label"].unique())

for name in models:
    # Read from human_scored_preds, not df_predictions_multirc: the label set
    # above and this section both belong to the human-scored evaluation.
    cm = confusion_matrix(
        human_scored_preds["true_label"],
        human_scored_preds[f"{name}_pred"],
        labels=labels
    )
    plt.figure()
    plt.imshow(cm, aspect='auto')
    plt.xticks(range(len(labels)), labels, rotation=45)
    plt.yticks(range(len(labels)), labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix: {name}")
    plt.colorbar()
    plt.tight_layout()
    plt.show()


In [None]:
# Print and collect the per-pipeline reports for the human-scored predictions;
# `models` is the list of pipeline names taken from sheet_metrics.
classification_reports_humanscored = generate_classification_reports(human_scored_preds, models)