In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset, load_metric
import torch
import numpy as np
import pandas as pd

In [4]:
# Table describing the FLAN collection, one row per task entry.
flan_dict = pd.read_csv("data/flan_collection_info.csv")

# Distinct specific task categories that fall under the multiple-choice QA
# generic category, kept both in first-seen order (list) and as a set for
# membership checks.
multi_choice_qa_tasks_list = (
    flan_dict
    .loc[
        flan_dict["Generic Task Category"] == "Multiple-Choice QA (no trivia knowledge required)",
        "Specific Task Category",
    ]
    .drop_duplicates()
    .tolist()
)
multi_choice_qa_tasks_set = set(multi_choice_qa_tasks_list)

In [50]:
def format_mmlu_example(example, incl_answer = False):
    """
    Formats a single MMLU record as a question/options/answer text prompt.

    Options are labelled with consecutive capital letters starting at "A",
    one per line, in the order they appear in example['choices'].

    Args:
        example (dict): record with 'question' (str) and 'choices' (list of str).
            'answer' (int, 0-based index into 'choices') is only required when
            incl_answer is True.
        incl_answer (bool): if True, append the letter of the correct option
            after "Answer:"; if False, leave "Answer:" open for the model.

    Returns:
        formatted_example (str): the formatted prompt text
    """
    question = example['question']
    choices = example['choices']

    # One "<letter>: <choice>" line per option.
    options = "\n".join(f"{chr(ord('A') + i)}: {choice}" for i, choice in enumerate(choices))

    formatted_example = f"Question: {question}\nOptions:\n{options}\nAnswer:"

    if incl_answer:
        # The label is read only here so that unlabeled records can still be
        # formatted as open prompts without raising KeyError.
        formatted_example += f" {chr(ord('A') + example['answer'])}"

    return formatted_example

# Example usage

subject = 'abstract_algebra'

# Load the MMLU test split in this cell so it runs on a fresh kernel; the
# notebook previously relied on a `testset` variable that no cell defines.
testset = load_dataset("cais/mmlu", 'all', split='test')

subject_testset = testset.filter(lambda example: example['subject'] == subject)

# The first 5 rows become the few-shot demonstrations; the rest are held out.
dev_set, subject_set = subject_testset.select(range(5)), subject_testset.select(range(5,len(subject_testset)))
formatted_egs = [format_mmlu_example(eg,incl_answer=True) for eg in dev_set]
five_shot_text = "\n\n".join(formatted_egs)
subject_testset

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 100
})

In [51]:
from evaluate import eval_mmlu

# The same identifier is passed as both the model path and the tokenizer path.
# NOTE(review): the output recorded for this cell is an OSError reporting that
# this identifier is neither a local folder nor a model id on the Hub — confirm
# where the checkpoint actually lives before re-running.
lora_checkpoint = 't5-small_LORA'
eval_mmlu(lora_checkpoint, lora_checkpoint)

OSError: t5-small_LORA is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
def format_mmlu_example(example, incl_answer = False):
    """
    Formats a single MMLU record as a question/options/answer text prompt.

    Options are labelled with consecutive capital letters starting at "A",
    one per line, in the order they appear in example['choices'].

    Args:
        example (dict): record with 'question' (str) and 'choices' (list of str).
            'answer' (int, 0-based index into 'choices') is only required when
            incl_answer is True.
        incl_answer (bool): if True, append the letter of the correct option
            after "Answer:"; if False, leave "Answer:" open for the model.

    Returns:
        formatted_example (str): the formatted prompt text
    """
    question = example['question']
    choices = example['choices']

    # One "<letter>: <choice>" line per option.
    options = "\n".join(f"{chr(ord('A') + i)}: {choice}" for i, choice in enumerate(choices))

    formatted_example = f"Question: {question}\nOptions:\n{options}\nAnswer:"

    if incl_answer:
        # The label is read only here so that unlabeled records can still be
        # formatted as open prompts without raising KeyError.
        formatted_example += f" {chr(ord('A') + example['answer'])}"

    return formatted_example

def _parse_option_index(generated_text, num_choices):
    """Converts generated text to a 0-based option index, or None if it is not a standalone option letter in range."""
    text = generated_text.strip()
    if not text:
        return None
    # Reject words that merely begin with an option letter (e.g. "Both").
    if len(text) > 1 and text[1].isalnum():
        return None
    index = ord(text[0]) - ord('A')
    return index if 0 <= index < num_choices else None


def eval_mmlu(model_path, tokenizer_path):
    """
    Evaluates a model on the MMLU dataset, using 5-shot prompting.

    For every subject, the first 5 rows of that subject in the test split are
    formatted with their answers and used as the few-shot demonstrations; the
    remaining rows of the subject are scored. A prediction counts as correct
    when the generated text is the letter of the labelled option.

    Args:
        model_path (str): path to model
        tokenizer_path (str): path to model tokenizer
    
    Returns:
        test_accuracy (float): model accuracy over all scored examples
        subject_acc (dict): dictionary with accuracy by subject, with values [[num_correct,total],accuracy]
    """
    # NOTE(review): the demonstrations are taken from the test split itself;
    # confirm whether a dedicated few-shot split should be used instead.
    num_shots = 5

    ## load the pretrained model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
    # If a prompt is too long, drop tokens from the start (oldest demonstration)
    # rather than from the end, so the question being asked is never cut off.
    tokenizer.truncation_side = "left"
    model.eval()

    ## datasets

    mmlu_dataset = load_dataset("cais/mmlu", 'all', split='test')

    ## metrics

    # Sorted so subjects are always processed in the same order.
    subjects = sorted(set(mmlu_dataset['subject']))
    subject_acc = {label: [[0, 0],0] for label in subjects}

    ## evaluation
    correct = 0
    total = 0

    for subject in subjects:
        # FLAN uses 5-shot prompting for MMLU, so we use that here
        subject_set = mmlu_dataset.filter(lambda example: example['subject'] == subject)
        shots = min(num_shots, len(subject_set))
        dev_set = subject_set.select(range(shots))
        test_set = subject_set.select(range(shots, len(subject_set)))
        five_shot_text = "\n\n".join(format_mmlu_example(eg, incl_answer=True) for eg in dev_set)

        # Totals come from the rows actually scored, so they cannot drift from
        # the loop below.
        subject_acc[subject][0][1] = len(test_set)
        total += len(test_set)

        for example in test_set:
            # format inputs
            formatted_example = format_mmlu_example(example, incl_answer=False)
            input_text = '\n\n'.join([five_shot_text, formatted_example])
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

            # Generate an answer. max_new_tokens bounds only the generated
            # tokens; the decoder start token does not count against it.
            output_ids = model.generate(**inputs, max_new_tokens=2)
            output_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # The model is expected to output the option letter (e.g. "A");
            # anything else yields None and is scored as incorrect.
            predicted_option = _parse_option_index(output_answer, len(example['choices']))

            # Update the metric
            if predicted_option == example['answer']:
                correct += 1
                subject_acc[subject][0][0] += 1

        num_correct, num_scored = subject_acc[subject][0]
        subject_acc[subject][1] = num_correct / num_scored if num_scored else 0.0

    test_accuracy = correct / total if total else 0.0

    print(f"Accuracy on MMLU: {test_accuracy:.4f}")

    return test_accuracy, subject_acc