In [2]:
#!/usr/bin/env python3
import csv
import pandas as pd
import ollama
from huggingface_hub import hf_hub_download
from tqdm.auto import tqdm
import re  # For regex parsing
import time  # For response time calculation
import datetime

# Constants for loading MMLU dataset
REPO_ID = "cais/mmlu"
SUBFOLDER = "all"
FILENAME = "test-00000-of-00001.parquet"

# Load MMLU test dataset
df_all = pd.read_parquet(
    hf_hub_download(repo_id=REPO_ID, subfolder=SUBFOLDER, filename=FILENAME, repo_type="dataset")
)


In [5]:
def extract_answer(text):
    # 1. Look for formats like A), B), C), D) first
    match = re.search(r'\b([A-Da-d])\)', text)
    if match:
        return match.group(1).upper()
    
    # 2. Look for single letter at the end of the string
    match = re.search(r'\b([A-Da-d])\b(?=\W*$)', text.strip())
    if match:
        return match.group(1).upper()
    
    # 3. Look for single letter at the beginning of the string
    match = re.search(r'^\b([A-Da-d])\b', text.strip())
    if match:
        return match.group(1).upper()
    
    # 4. If nothing matched, return None
    return None


In [9]:

# Variations of input instructions with short descriptors
instruction_variations = [
    {"instruction": "Answer the following question based on the multiple-choice options provided.", "short_descriptor": "answer_mc"},
    {"instruction": "Please choose the best answer for the following question.", "short_descriptor": "best_answer"},
    {"instruction": "Tell me the answer to this test question:", "short_descriptor": "tell_me_test_answers"},
    {"instruction": "Let's see if you can get this one right: ", "short_descriptor": "lets_see"},
    {"instruction": "just tell me the answer: ", "short_descriptor": "just_tell_me"},
    {"instruction": "God damn it, just fucking answer the following question based on the multiple-choice options provided.", "short_descriptor": "swear_answer_mc"},
    {"instruction": "God damn it, just fucking pick the best answer for the following question.", "short_descriptor": "swear_best_answer"},
    {"instruction": "God damn it, just fucking tell me the answer to this test question:", "short_descriptor": "swear_tell_me_test_answers"},
    {"instruction": "God damn it, just fucking see if you can get this one right: ", "short_descriptor": "swear_lets_see"},
    {"instruction": "God damn it, just fucking tell me the answer: ", "short_descriptor": "swear_just_tell_me"},
]
modelname = "llama3:8b-instruct"
# Output dataset file
dtformat = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
output_file = f"mmlu_results_{modelname}_{dtformat}_ollama.csv"

# Ensure CSV header is written only once
header_written = False

# Iterate over each question in the dataset and each instruction variation
for index, row in tqdm(df_all.iterrows(), total=len(df_all)):
    question_id = f"q{index+1}"  # Assign a unique question_id to each question
    question = row["question"]
    subject = row["subject"]
    options = row["choices"]
    correct_answer_index = row["answer"]

    for instruction_data in instruction_variations:
        instruction = instruction_data["instruction"]
        short_descriptor = instruction_data["short_descriptor"]

        # Construct the prompt
        prompt = f"{instruction}\n\nSubject: {subject}\nQuestion: {question}\nOptions:\n"
        option_labels = ["A", "B", "C", "D"]
        for i, option in enumerate(options):
            prompt += f"{option_labels[i]}) {option}\n"
        prompt += ". Only respond with the single letter of the most correct answer."

        # Record start time
        start_time = time.time()

        # Query the model
        response = ollama.chat(model=modelname, messages=[{"role": "user", "content": prompt}])
    
        # Record end time and calculate duration
        end_time = time.time()
        response_time = end_time - start_time

        # Extract the output
        model_output = response['message']['content']

        # Use regex to extract a valid answer letter (A, B, C, or D)
        match = extract_answer(model_output)
        if match:
            model_answer_letter = match
        else:
            model_answer_letter = "INVALID"  # Mark as invalid if no correct letter is detected

        # Get the correct answer letter
        correct_answer_letter = option_labels[correct_answer_index]

        # Store the result
        result = {
            "question_id": question_id,
            "instruction_descriptor": short_descriptor,
            "subject": subject,
            "question": question,
            "options": options,
            "correct_answer": correct_answer_letter,
            "model_response": model_answer_letter,
            "model_output": model_output,  # Add full model output for debugging
            "model_input": prompt,  # Store the full prompt as "model_input"
            "response_time_seconds": response_time  # Store the response time
        }

        # Write results to CSV incrementally
        with open(output_file, mode="a", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=[
                "question_id", "instruction_descriptor", "subject", "question", "options", 
                "correct_answer", "model_response", "model_output", "model_input", "response_time_seconds"
            ])
            if not header_written:
                writer.writeheader()
                header_written = True
            writer.writerow(result)

  0%|          | 0/14042 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [33]:
df = pd.DataFrame(results)
df

Unnamed: 0,instruction,subject,question,options,correct_answer,model_response
0,Answer the following question based on the mul...,abstract_algebra,Find the degree for the given field extension ...,"[0, 4, 2, 6]",4,C
1,Please choose the correct option for the follo...,abstract_algebra,Find the degree for the given field extension ...,"[0, 4, 2, 6]",4,B
2,Select the correct answer from the choices given.,abstract_algebra,Find the degree for the given field extension ...,"[0, 4, 2, 6]",4,C
3,"Out of the given options, which is the right a...",abstract_algebra,Find the degree for the given field extension ...,"[0, 4, 2, 6]",4,C
4,Answer the following question based on the mul...,abstract_algebra,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...","[8, 2, 24, 120]",24,A\n\n(Note: The index of a subgroup H in a gro...
...,...,...,...,...,...,...
1032,Answer the following question based on the mul...,astronomy,Meteorites with high metal content probably are,"[pieces of comets rather than of asteroids., c...",chunks of large differentiated asteroids that ...,B
1033,Please choose the correct option for the follo...,astronomy,Meteorites with high metal content probably are,"[pieces of comets rather than of asteroids., c...",chunks of large differentiated asteroids that ...,B
1034,Select the correct answer from the choices given.,astronomy,Meteorites with high metal content probably are,"[pieces of comets rather than of asteroids., c...",chunks of large differentiated asteroids that ...,B
1035,"Out of the given options, which is the right a...",astronomy,Meteorites with high metal content probably are,"[pieces of comets rather than of asteroids., c...",chunks of large differentiated asteroids that ...,B
