In [1]:
from tqdm.auto import tqdm
import pandas as pd
import torch
import datasets
from transformers import pipeline, AutoTokenizer

# Select PyTorch's 'high' float32 matmul precision mode for the whole session
# (see the torch documentation for the speed/accuracy trade-off of each mode).
torch.set_float32_matmul_precision('high')

In [2]:
# Load the question-response table that carries the o3-mini scores.
df = pd.read_csv("../data/itell-question-data-w-o3-scores.csv")
# Fixed seed so the preview shows the same two rows on every run.
display(df.sample(2, random_state=42))

# Wrap the frame as a Hugging Face dataset; iterating it yields one dict per row.
ds = datasets.Dataset.from_pandas(df)
ds

Unnamed: 0,id,response,score,condition,user_id,page_slug,chunk_slug,created_at,volume_slug,volume_title,page_title,chunk_header,chunk_text,question,answer,o3_mini_score
717,2509,The pattern of convergence in low-income and m...,2,random_reread,nwttke5kgzf4woh6mnwydb6sha,7-4-production-in-the-long-run,Introduction-712t,2024-08-15 23:54:06.844876+00:00,cttc-poe,Principles of Macroeconomics,7.4 Economic Convergence,Introduction,Some low-income and middle-income economies ar...,What is the pattern of convergence observed in...,The pattern of convergence observed in low-inc...,3
1096,1456,They give an overall GDP,0,random_reread,td3nw5u4het7ryco6krwvl3ydm,7-2-labor-productivity-and-economic-growth,The-Power-of-Sustained-Economic-Growth-707t,2024-08-12 17:21:31.894547+00:00,cttc-poe,Principles of Macroeconomics,7.2 Labor Productivity and Economic Growth,The Power of Sustained Economic Growth,Nothing is more important for people’s standar...,What is the relationship between compound grow...,They have the same formula for growth over tim...,1


Dataset({
    features: ['id', 'response', 'score', 'condition', 'user_id', 'page_slug', 'chunk_slug', 'created_at', 'volume_slug', 'volume_title', 'page_title', 'chunk_header', 'chunk_text', 'question', 'answer', 'o3_mini_score'],
    num_rows: 1428
})

In [3]:
class Bleurt():
    """Binary grader built on the fine-tuned BLEURT text-classification pipeline.

    An example is graded 1 when the ``score`` of the first pipeline result is
    strictly greater than ``threshold``, otherwise 0.
    """

    model_name = "vaiibhavgupta/finetuned-bleurt-large"
    threshold = 0.7  # strict cut-off applied to the pipeline score

    def __init__(self, device="cuda"):
        """Load the classification pipeline.

        Args:
            device: Device handed to ``transformers.pipeline``. Defaults to
                "cuda" (the previous hard-coded value); pass e.g. "cpu" on
                machines without a GPU.
        """
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            device=device,
        )

    @staticmethod
    def _field(input_dict, key) -> str:
        """Return ``input_dict[key]`` as text; a missing key or None becomes ""."""
        value = input_dict.get(key)
        return "" if value is None else str(value)

    def __call__(self, input_dict) -> int:
        """Grade one example.

        Args:
            input_dict: Mapping read for the keys "answer" (reference) and
                "response" (candidate). Missing or None values are treated as
                empty strings so they are not scored as the literal text "None".

        Returns:
            1 if the pipeline score exceeds ``threshold``, else 0.
        """
        reference = self._field(input_dict, "answer")
        candidate = self._field(input_dict, "response")

        # NOTE(review): "</s>" is written into the text literally; confirm it is
        # the separator this checkpoint was fine-tuned with.
        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        score = result[0]["score"]

        return 1 if score > self.threshold else 0

In [4]:
class Mpnet():
    """Binary grader built on the pinned short-answer-classification pipeline.

    An example is graded 1 when the first pipeline result carries the label
    "correct_answer", otherwise 0.
    """

    model_name = "tiedaar/short-answer-classification"
    # Pinned model revision so results do not drift if the hub repo changes.
    revision = "77b846ec4606bfcfdf913888d7f0ab51f977a579"

    def __init__(self, device="cuda"):
        """Load the classification pipeline.

        Args:
            device: Device handed to ``transformers.pipeline``. Defaults to
                "cuda" (the previous hard-coded value); pass e.g. "cpu" on
                machines without a GPU.
        """
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            revision=self.revision,
            device=device,
        )

    @staticmethod
    def _field(input_dict, key) -> str:
        """Return ``input_dict[key]`` as text; a missing key or None becomes ""."""
        value = input_dict.get(key)
        return "" if value is None else str(value)

    def __call__(self, input_dict) -> int:
        """Grade one example.

        Args:
            input_dict: Mapping read for the keys "answer" (reference) and
                "response" (candidate). Missing or None values are treated as
                empty strings so they are not scored as the literal text "None".

        Returns:
            1 if the predicted label is "correct_answer", else 0.
        """
        reference = self._field(input_dict, "answer")
        candidate = self._field(input_dict, "response")

        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct_answer" else 0

In [32]:
class ModernBERT():
    """Binary grader built on the locally stored ModernBERT MultiRC checkpoint.

    The checkpoint is loaded from ``model_name`` and paired with the tokenizer
    named by ``tokenizer_name``. An example is graded 1 when the first pipeline
    result carries the label "correct", otherwise 0.
    """

    model_name = "../results/modernbert_multirc/"
    tokenizer_name = "answerdotai/ModernBERT-base"

    def __init__(self, device="cuda"):
        """Load the classification pipeline.

        Args:
            device: Device handed to ``transformers.pipeline``. Defaults to
                "cuda" (the previous hard-coded value); pass e.g. "cpu" on
                machines without a GPU.
        """
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=AutoTokenizer.from_pretrained(self.tokenizer_name),
            device=device,
        )

    @staticmethod
    def _field(input_dict, key) -> str:
        """Return ``input_dict[key]`` as text; a missing key or None becomes ""."""
        value = input_dict.get(key)
        return "" if value is None else str(value)

    def __call__(self, input_dict) -> int:
        """Grade one example.

        Args:
            input_dict: Mapping read for the keys "answer" (reference) and
                "response" (candidate). Missing or None values are treated as
                empty strings so they are not scored as the literal text "None".

        Returns:
            1 if the predicted label is "correct", else 0.
        """
        reference = self._field(input_dict, "answer")
        candidate = self._field(input_dict, "response")

        # NOTE(review): "</s>" is written into the text literally; confirm it
        # matches the input format this checkpoint was trained on.
        sequence = f"{candidate}</s>{reference}"

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct" else 0

In [33]:
class AugModernBert():
    """Augmented ModernBERT. Trained on mix of RACE and MultiRC.

    Unlike the reference-based graders, this wrapper builds its input from the
    passage, the question and the candidate response; no reference answer is
    read. An example is graded 1 when the first pipeline result carries the
    label "correct", otherwise 0.
    """
    model_name = "../results/modernbert_race_multirc"
    tokenizer_name = "answerdotai/ModernBERT-base"

    def __init__(self, device="cuda"):
        """Load the classification pipeline.

        Args:
            device: Device handed to ``transformers.pipeline``. Defaults to
                "cuda" (the previous hard-coded value); pass e.g. "cpu" on
                machines without a GPU.
        """
        self.classifier = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=AutoTokenizer.from_pretrained(self.tokenizer_name),
            device=device,
        )

    @staticmethod
    def _field(input_dict, key) -> str:
        """Return ``input_dict[key]`` as text; a missing key or None becomes ""."""
        value = input_dict.get(key)
        return "" if value is None else str(value)

    def __call__(self, input_dict) -> int:
        """Grade one example.

        Args:
            input_dict: Mapping read for the keys "chunk_text" (passage),
                "question" and "response" (candidate). Missing or None values
                are treated as empty strings so they are not scored as the
                literal text "None".

        Returns:
            1 if the predicted label is "correct", else 0.
        """
        passage = self._field(input_dict, "chunk_text")
        question = self._field(input_dict, "question")
        candidate = self._field(input_dict, "response")

        # The three parts are joined with blank lines into one input text.
        sequence = f'{passage}\n\n{question}\n\n{candidate}'

        result = self.classifier(sequence)
        label = result[0]["label"]

        return 1 if label == "correct" else 0

In [36]:
# NOTE: every wrapper scores a single example per call, so none of them
# benefits from the Pipeline's batching optimizations.

# Instantiate each grader once (this loads its model); the keys become the
# prediction column names downstream.
pipe_dict = {
    label: grader_cls()
    for label, grader_cls in [
        ("Mpnet", Mpnet),
        ("Bleurt", Bleurt),
        ("ModernBERT", ModernBERT),
        ("AugmentedModernBERT", AugModernBert),
    ]
}

Device set to use cuda
Device set to use cuda
Device set to use cuda
Device set to use cuda


In [37]:
# Smoke test on a hand-written example. The reference is stored under "answer",
# the key the dataset rows and the reference-based graders use; AugModernBert
# itself only reads "chunk_text", "question" and "response".
pipe_dict["AugmentedModernBERT"]({
    "chunk_text": "This text states that all answers are good.",
    "question": "What is a good answer to this question?",
    "answer": "Goodness",
    "response": "This a strong answer to the question",
    })

1

In [39]:
def evaluate_all_models(dataset, pipe_dict, label_key="labels"):
    """Run every model over every example and collect the predictions.

    Args:
        dataset: Sized iterable of examples (e.g. a ``datasets.Dataset``);
            each example is passed unchanged to every model.
        pipe_dict: Mapping of name -> callable that takes one example and
            returns its prediction.
        label_key: Currently unused by this function; kept so existing calls
            keep working.

    Returns:
        pandas.DataFrame with one column per model (in ``pipe_dict`` order)
        and one row per example (in dataset order).
    """
    n_examples = len(dataset)
    predictions = {}

    # Label the bars so the outer (models) and inner (per-model) progress can
    # be told apart in the cell output.
    for name, pipe in tqdm(pipe_dict.items(), total=len(pipe_dict), desc="models"):
        predictions[name] = [
            pipe(example)
            for example in tqdm(dataset, total=n_examples, desc=str(name))
        ]

    return pd.DataFrame(predictions)

# Score every row with every model. label_key now names the real column
# ("o3_mini_score"; the previous "o3_mini_scores" does not exist in the data).
df_preds = evaluate_all_models(ds, pipe_dict, label_key="o3_mini_score")
# Column-wise join onto the source rows; this aligns on the index, so it relies
# on df still having the default index it got from read_csv.
df_results = pd.concat([df, df_preds], axis=1)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

  0%|          | 0/1428 [00:00<?, ?it/s]

In [40]:
# Spot-check a few rows of per-model predictions; fixed seed keeps the preview stable across runs.
df_preds.sample(10, random_state=42)

Unnamed: 0,Mpnet,Bleurt,ModernBERT,AugmentedModernBERT
725,1,1,1,0
1098,0,1,0,0
968,1,1,1,0
1421,1,1,1,1
1356,1,1,0,1
386,1,1,1,1
295,0,1,1,1
906,1,1,1,1
1151,1,1,1,1
1361,1,1,0,1


In [50]:
# Count how many rows received each o3-mini score.
df["o3_mini_score"].value_counts()

o3_mini_score
2    565
1    342
4    277
3    244
Name: count, dtype: int64

In [56]:
# Collapse the o3-mini score to a binary label: 1 when the score is above 1, else 0.
df_results["o3_binary"] = df["o3_mini_score"].gt(1).astype(int)

# Pairwise Spearman correlation between the binary o3 label and each model's predictions.
df_results.loc[:, [
    "o3_binary",
    "Mpnet",
    "Bleurt",
    "ModernBERT",
    "AugmentedModernBERT",
]].corr(method="spearman")

Unnamed: 0,o3_binary,Mpnet,Bleurt,ModernBERT,AugmentedModernBERT
o3_binary,1.0,0.467751,0.212341,0.416643,0.331107
Mpnet,0.467751,1.0,0.322989,0.633455,0.39262
Bleurt,0.212341,0.322989,1.0,0.327521,0.103492
ModernBERT,0.416643,0.633455,0.327521,1.0,0.300353
AugmentedModernBERT,0.331107,0.39262,0.103492,0.300353,1.0
