In [1]:
from tqdm.auto import tqdm
import pandas as pd
import torch
import datasets
from transformers import pipeline, AutoTokenizer

torch.set_float32_matmul_precision('high')

In [32]:
df = pd.read_csv("../../data/itell-question-data-w-o3-scores.csv")
display(df.sample(2))

ds = datasets.Dataset.from_pandas(df)
ds

Unnamed: 0,id,response,score,condition,user_id,page_slug,chunk_slug,created_at,volume_slug,volume_title,page_title,chunk_header,chunk_text,question,answer,o3_mini_score
1266,4d5f8f27-140c-4821-afd4-033116e217d7,the purpose of learning analytics dashboards i...,2,,cluass5es0000k00f92fxztbx,learning-an-1,"What-For,-Whom,-Why,-How?-473t",2024-04-20 19:00:22.664,cornell,Handbook of Learning Analytics,Learning Analytics Dashboard,"What For, Whom, Why, How?",What follows is a non-exhaustive overview; it ...,What are the main purposes of learning analyti...,"The main purposes are to promote awareness, re...",2
152,2c3bbc12-a1fc-4b75-ab10-1ba760a0c056,"According to visualization experts, the step o...",2,,clud1801v0000jv0f6vzbe06v,learning-an-1,Acquire-and-Pre-Process-Your-Data-533t,2024-03-29 21:04:40.135,cornell,Handbook of Learning Analytics,Learning Analytics Dashboard,Acquire and (Pre-)Process Your Data,Building a visual dashboard typically entails ...,What step of building a visual dashboard typic...,Data-gathering and preprocessing step.,4


Dataset({
    features: ['id', 'response', 'score', 'condition', 'user_id', 'page_slug', 'chunk_slug', 'created_at', 'volume_slug', 'volume_title', 'page_title', 'chunk_header', 'chunk_text', 'question', 'answer', 'o3_mini_score'],
    num_rows: 1428
})

In [33]:
class Bleurt():
    model_name = "vaiibhavgupta/finetuned-bleurt-large"
    threshold = 0.7

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            device="cuda",
        )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")
        
        sequence = f"{candidate}[SEP]{reference}"

        result = self.classifier(sequence)
        score = result[0]["score"]

        return 1 if score > self.threshold else 0

In [34]:
class Mpnet():
    model_name = "tiedaar/short-answer-classification"
    revision = "77b846ec4606bfcfdf913888d7f0ab51f977a579"

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            revision=self.revision,
            device="cuda",
            )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")
        
        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct_answer" else 0

In [35]:
class ModernBERT():
    model_name = "../../results/modernbert_multirc/"
    tokenizer_name = "answerdotai/ModernBERT-base"

    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=AutoTokenizer.from_pretrained(self.tokenizer_name),
            device="cuda",
            )

    def __call__(self, input_dict) -> int:
        reference = input_dict.get("answer", "")
        candidate = input_dict.get("response", "")
        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct" else 0

In [36]:
class AugModernBert():
    """Augmented ModernBERT. Trained on mix of RACE and MultiRC"""
    model_name = "../../results/modernbert_race_multirc"
    tokenizer_name = "answerdotai/ModernBERT-base"
    
    def __init__(self):
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=AutoTokenizer.from_pretrained(self.tokenizer_name),
            device="cuda",
            )

    def __call__(self, input_dict) -> int:
        passage = input_dict.get("chunk_text", "")
        question = input_dict.get("question", "")
        candidate = input_dict.get("response", "")
        
        sequence = f'{passage}\n\n{question}\n\n{candidate}'

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct" else 0

In [37]:
# NOTE: these classes are not designed to take advantage of Pipeline's batching optimizations.

pipe_dict = {
    "Mpnet": Mpnet(),
    "Bleurt": Bleurt(),
    "ModernBERT": ModernBERT(),
    "AugmentedModernBERT": AugModernBert(),
}

Device set to use cuda
Device set to use cuda
Device set to use cuda
Device set to use cuda


In [38]:
def evaluate_all_models(dataset, pipe_dict):
    pred_dict = {name: [] for name in pipe_dict.keys()}

    for name, pipe in tqdm(pipe_dict.items(), total=len(pipe_dict)):
        for example in tqdm(dataset, total=len(dataset)):
            pred_dict[name].append(pipe(example))

    return pd.DataFrame(pred_dict)

df_preds = evaluate_all_models(ds, pipe_dict)
df_results = pd.concat([df, df_preds], axis=1)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (217 > 128). Running this sequence through the model will result in indexing errors


  0%|          | 0/1428 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

In [39]:
df_preds.sample(5)

Unnamed: 0,Mpnet,Bleurt,ModernBERT,AugmentedModernBERT
1344,1,1,1,1
3,0,1,0,1
1067,1,1,0,1
1384,1,1,1,1
425,0,1,1,0


In [40]:
df_results.to_csv("../../data/itell-question-data-w-preds.csv", index=False)

In [41]:
df_results["o3_binary"] = (df_results["o3_mini_score"] > 1).astype(int)
df_results["ensemble"] = df_results["Mpnet"] + df_results["Bleurt"]

df_results[[
    "o3_mini_score",
    "o3_binary",
    "ensemble",
    "Mpnet",
    "Bleurt",
    "ModernBERT",
    "AugmentedModernBERT",
]].corr(method="spearman")

Unnamed: 0,o3_mini_score,o3_binary,ensemble,Mpnet,Bleurt,ModernBERT,AugmentedModernBERT
o3_mini_score,1.0,0.774024,0.570712,0.540838,0.468128,0.486892,0.314204
o3_binary,0.774024,1.0,0.470355,0.467751,0.383013,0.416643,0.331107
ensemble,0.570712,0.470355,1.0,0.866354,0.883833,0.65531,0.39007
Mpnet,0.540838,0.467751,0.866354,1.0,0.552537,0.633455,0.39262
Bleurt,0.468128,0.383013,0.883833,0.552537,1.0,0.535407,0.31147
ModernBERT,0.486892,0.416643,0.65531,0.633455,0.535407,1.0,0.300353
AugmentedModernBERT,0.314204,0.331107,0.39007,0.39262,0.31147,0.300353,1.0
