In [49]:
import os
from dotenv import load_dotenv
load_dotenv()
import numpy as np

# API key is read from the process environment (populated from .env by load_dotenv above);
# the value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [50]:
from openai import OpenAI

# Shared OpenAI client used for every chat-completion request in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

In [51]:
# MMLU subject (dataset configuration name) to evaluate; it is also used to build
# the system prompt and the name of the results CSV.
dataset_name = "professional_psychology"

In [52]:
from datasets import load_dataset
# Load the chosen MMLU subject; the `dev` and `test` splits are used later in this notebook.
ds = load_dataset(path="cais/mmlu", name=dataset_name)

Generating test split: 100%|██████████| 612/612 [00:00<00:00, 27295.98 examples/s]
Generating validation split: 100%|██████████| 69/69 [00:00<00:00, 27424.14 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 3182.81 examples/s]


In [53]:
def get_choices():
    """Return the answer labels, in option order, as a fresh list."""
    return list("ABCD")


def format_subject(subject):
    """Turn a snake_case subject name into space-separated words."""
    words = subject.split("_")
    return " ".join(words)


def format_example(example, include_answer=True):
    """Render one multiple-choice record as prompt text.

    Args:
        example: Mapping with a 'question' string, a 'choices' sequence and,
            when include_answer is true, an 'answer' entry.
        include_answer: If true, append the answer followed by a blank line
            (few-shot style); if false, the text ends at "Answer:".

    Returns:
        The question, one "<letter>. <choice>" line per choice, and a final
        "Answer:" line (optionally completed with the answer).
    """
    letters = get_choices()

    # Question first, then one lettered line per option, then the answer cue.
    lines = [example['question']]
    for position, option in enumerate(example['choices']):
        lines.append("{}. {}".format(letters[position], option))
    lines.append("Answer:")
    text = "\n".join(lines)

    if not include_answer:
        return text

    # An integer answer is an index into the choice letters; anything else is used as-is.
    label = example['answer']
    if isinstance(label, (int, np.integer)):
        label = letters[label]
    return text + " {}\n\n".format(label)

In [54]:
# System instruction sent with every request: names the subject and asks for a bare option choice.
system_prompt = (
    f"You are an expert in {format_subject(dataset_name)}. "
    "You will be presented with a question and four possible answers. "
    "Choose the correct answer. "
    "Do not output any explanation, only choose the correct option."
)

In [55]:
def gen_prompt(train_dataset, num_examples=5):
    """Build the few-shot preamble from the first examples of a dataset.

    Args:
        train_dataset: Indexable, sized collection of records accepted by
            format_example.
        num_examples: Maximum number of leading records to include.

    Returns:
        A fixed header line followed by the formatted (answered) examples;
        only the header when no examples are selected.
    """
    header = "The following are example multiple choice questions (with answers).\n\n"
    shot_count = min(num_examples, len(train_dataset))
    # Index one record at a time rather than slicing the dataset.
    shots = [format_example(train_dataset[k]) for k in range(shot_count)]
    return header + "".join(shots)

In [56]:
import re

def extract_answer(generated_text):
    """
    Extract the first standalone answer letter (A, B, C, or D) from model output.

    A letter only counts when it stands on its own (word boundaries on both
    sides), so letters inside ordinary words are ignored: "The answer is C"
    yields "C", not the "A" of "answer".

    Args:
        generated_text: Raw model output; may be None or empty.

    Returns:
        The upper-case letter "A"-"D", or None when no standalone letter is found.

    Note:
        A capitalised article "A" at the start of a sentence is still
        indistinguishable from the option letter A.
    """
    if not generated_text:
        return None

    standalone_letter = r'\b[A-D]\b'

    # Prefer an upper-case letter in the original text so a lower-case article
    # ("a") earlier in the reply does not shadow the real answer.
    match = re.search(standalone_letter, generated_text)
    if match is None:
        # Fall back to a case-insensitive match for replies such as "b".
        match = re.search(standalone_letter, generated_text.upper())

    return match.group(0) if match else None

In [57]:
# Number of dev-split examples passed to gen_prompt when building the few-shot preamble.
num_few_shot = 0
# Number of test-split questions to evaluate, taken from the start of the split.
total_questions = 100

In [59]:
from tqdm import tqdm

In [60]:
# Evaluate the model on the leading test questions and report accuracy.
correct_predictions = 0

few_shot_prompt = gen_prompt(ds['dev'], num_few_shot)
# Send the few-shot examples only when some were requested; at zero-shot the
# system prompt goes out unchanged (no dangling "following are examples" header).
if num_few_shot > 0:
    combined_system_prompt = f"{system_prompt}\n\n{few_shot_prompt}"
else:
    combined_system_prompt = system_prompt
correct_answers = []
predicted_answers = []

# Never index past the end of the test split.
num_evaluated = min(total_questions, len(ds['test']))

for idx in tqdm(range(num_evaluated)):
    # Fetch the record once; it supplies both the prompt and the ground truth.
    example = ds['test'][idx]
    test_question = format_example(example, include_answer=False)

    messages = [
        {
            "role": "system",
            "content": combined_system_prompt
        },
        {
            "role": "user",
            "content": test_question
        }
    ]

    resp = client.chat.completions.create(
        messages=messages,
        model="gpt-4o",
        temperature=0.3,
    )

    # The message content can be None; count that as "no answer" rather than crashing.
    content = resp.choices[0].message.content
    predicted_answer = extract_answer(content) if content else None

    ground_truth = example['answer']

    # Normalize ground truth: an integer label is an index into the choice letters.
    if isinstance(ground_truth, (int, np.integer)):
        ground_truth = get_choices()[ground_truth]

    predicted_answers.append(predicted_answer)
    correct_answers.append(ground_truth)

    # An unparseable reply (None) never equals a letter, so it counts as wrong.
    if predicted_answer == ground_truth:
        correct_predictions += 1

# Divide by the number of questions actually asked; guard the empty run.
accuracy = correct_predictions / num_evaluated if num_evaluated else 0.0
print(f"Accuracy: {accuracy}")

100%|██████████| 100/100 [01:00<00:00,  1.65it/s]

Accuracy: 0.88





In [61]:
import pandas as pd
# Save one row per evaluated question: the prompt text, the model's letter, and the true letter.
# The row count comes from the collected predictions, not from total_questions, so the
# columns stay the same length even if that setting was edited after the evaluation ran.
num_answered = len(predicted_answers)
df = pd.DataFrame({
    "Question": [format_example(ds['test'][i], include_answer=False) for i in range(num_answered)],
    "Predicted Answer": predicted_answers,
    "Correct Answer": correct_answers
})

df.to_csv(f"{dataset_name}_gpt4o_results.csv", index=False)