# Evaluate Finetuned Model Performance

Our current ensemble correlates with the test data at a Spearman's r of 0.51, while the raw o3-mini scores correlate at 0.70.

In [12]:
import os

import numpy as np
import pandas as pd
import torch
import datasets
from transformers import pipeline
from sklearn import metrics
from scipy import stats

# Let PyTorch use faster, reduced-precision kernels for float32 matrix multiplies.
torch.set_float32_matmul_precision("high")
# Opt in to parallelism in the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [2]:
datadict_path = "../../data/authentic-03-scores-multirc-gpt5-scores.hf" # Saved DatasetDict holding the prepared train/test/dev splits
# Base model identifier; used below to load the tokenizer for the inference pipeline.
model_name_or_path = "answerdotai/ModernBERT-base"
# Finetuned checkpoint directory that the inference pipeline loads as its model.
finetuned_model_path = "../../results/modernbert_authentic_multirc"

In [3]:
def format_inputs(example):
    """Add an ``input_str`` field joining the passage, question, and response.

    The three source fields are concatenated in that order, separated by
    three newlines. The example is updated in place and returned so the
    function can be passed to ``map``.
    """
    passage, question, answer = (
        example[key] for key in ("chunk_text", "question", "response")
    )
    example["input_str"] = f"{passage}\n\n\n{question}\n\n\n{answer}"
    return example

# Load the saved splits and collapse the three text fields into a single
# `input_str` column; the source columns are dropped once they are merged.
dd = datasets.DatasetDict.load_from_disk(datadict_path).map(
    format_inputs,
    batched=False,
    remove_columns=["chunk_text", "question", "response"],
)

# Re-type the label column as float32, reusing the train split's schema for
# every split.
new_features = dd["train"].features.copy()
new_features["label"] = datasets.Value("float32")
dd = dd.cast(new_features)
dd

DatasetDict({
    train: Dataset({
        features: ['label', 'input_str'],
        num_rows: 5004
    })
    test: Dataset({
        features: ['label', 'input_str'],
        num_rows: 370
    })
    dev: Dataset({
        features: ['label', 'input_str'],
        num_rows: 556
    })
})

In [7]:
# Build an inference pipeline around the finetuned checkpoint. The tokenizer is
# loaded from the base model name rather than from the checkpoint directory.
# (`pipeline` is already imported in the notebook's import cell.)
classifier = pipeline(
    task="text-classification",
    model=finetuned_model_path,
    tokenizer=model_name_or_path,
    # Use the first GPU when one is visible; otherwise fall back to CPU so the
    # notebook still runs on machines without CUDA.
    device=0 if torch.cuda.is_available() else -1,
)

# Score every test example. `function_to_apply="none"` asks the pipeline to
# return the raw model output as `score` instead of choosing a sigmoid/softmax
# from the saved config; later cells treat `score` as a predicted rating.
# NOTE(review): assumes the checkpoint has a single-output regression head — confirm.
results = classifier(dd["test"]["input_str"], function_to_apply="none")

Device set to use cuda:0


In [8]:
# Gold ratings for the held-out test split.
labels = dd["test"]["label"]

# Snap each predicted score to the nearest integer rating.
# NOTE(review): because rounding happens here, the regression metrics computed
# later see integer predictions rather than the raw scores — confirm intended.
preds = []
for result in results:
    preds.append(round(result["score"]))

In [13]:
def calculate_metrics(preds, labels):
    """
    Calculate regression, agreement, and rank-correlation metrics using sklearn and scipy.

    Args:
        preds: List of float predictions
        labels: List of float true labels

    Returns:
        Dictionary mapping metric name to value, with keys (in order)
        'mse', 'rmse', 'mae', 'r2', 'qwk', 'lwk', 'kappa', 'spearman'.
        The kappa and Spearman entries are computed on predictions and
        labels rounded to the nearest integer.
    """
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)

    metric_dict = {}

    # Regression metrics
    mse = metrics.mean_squared_error(labels, preds)
    metric_dict['mse'] = mse
    # Take the square root directly: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn releases.
    metric_dict['rmse'] = np.sqrt(mse)
    metric_dict['mae'] = metrics.mean_absolute_error(labels, preds)
    metric_dict['r2'] = metrics.r2_score(labels, preds)

    # Classification metrics (round to integers for ordinal ratings)
    preds_int = np.round(preds).astype(int)
    labels_int = np.round(labels).astype(int)

    # Quadratic Weighted Kappa
    metric_dict['qwk'] = metrics.cohen_kappa_score(labels_int, preds_int, weights='quadratic')

    # Linear Weighted Kappa (bonus)
    metric_dict['lwk'] = metrics.cohen_kappa_score(labels_int, preds_int, weights='linear')

    # Regular Cohen's Kappa
    metric_dict['kappa'] = metrics.cohen_kappa_score(labels_int, preds_int)

    # Spearman's r. Unpack the (statistic, p-value) pair instead of reading
    # `.statistic`, which only exists on newer SciPy result objects.
    spearman_r, _ = stats.spearmanr(labels_int, preds_int)
    metric_dict["spearman"] = spearman_r

    return metric_dict

# Store the result under its own name: rebinding `metrics` here would shadow
# the `sklearn.metrics` module that calculate_metrics() looks up at call time,
# so re-running this cell would fail until the imports were executed again.
metric_results = calculate_metrics(preds, labels)

for metric_name, metric_value in metric_results.items():
    print(f"{metric_name.upper()}: {metric_value:.4f}")

MSE: 0.8054
RMSE: 0.8974
MAE: 0.6432
R2: 0.2939
QWK: 0.5690
LWK: 0.4031
KAPPA: 0.2322
SPEARMAN: 0.5815
