<a href="https://colab.research.google.com/github/isabelleqian/AISafety/blob/main/Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup**

In [None]:
import os
import sys
import warnings
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules

chapter = "chapter3_llm_evals"
repo = "ARENA_3.0"
branch = "main"

# Install dependencies
try:
    import inspect_ai
except:
    %pip install openai>=1.58.1 anthropic inspect_ai tabulate wikipedia jaxtyping python-dotenv datasets

# Get root directory, handling 3 different cases: (1) Colab, (2) notebook not in ARENA repo, (3) notebook in ARENA repo
root = (
    "/content"
    if IN_COLAB
    else "/root"
    if repo not in os.getcwd()
    else str(next(p for p in Path.cwd().parents if p.name == repo))
)

if Path(root).exists() and not Path(f"{root}/{chapter}").exists():
    if not IN_COLAB:
        !sudo apt-get install unzip
        %pip install jupyter ipython --upgrade

    if not os.path.exists(f"{root}/{chapter}"):
        !wget -P {root} https://github.com/callummcdougall/ARENA_3.0/archive/refs/heads/{branch}.zip
        !unzip {root}/{branch}.zip '{repo}-{branch}/{chapter}/exercises/*' -d {root}
        !mv {root}/{repo}-{branch}/{chapter} {root}/{chapter}
        !rm {root}/{branch}.zip
        !rmdir {root}/{repo}-{branch}

if IN_COLAB:
    from google.colab import output, userdata

    # Copy each provider's API key from the Colab secrets tab into the
    # process environment under the name `<PROVIDER>_API_KEY`.
    for key in ["OPENAI", "ANTHROPIC"]:
        try:
            os.environ[f"{key}_API_KEY"] = userdata.get(f"{key}_API_KEY")
        except Exception:
            # Best-effort: warn and carry on with the next key. Catching
            # `Exception` rather than using a bare `except:` lets
            # KeyboardInterrupt / SystemExit propagate.
            warnings.warn(
                f"You don't have a '{key}_API_KEY' variable set in the secrets tab of your google colab. You have to set one, or calls to the {key} API won't work."
            )

# Handles running code in an ipynb: when `__file__` is not defined but the
# `__vsc_ipynb_file__` global is, reuse the latter as `__file__`.
if "__file__" not in globals() and "__vsc_ipynb_file__" in globals():
    __file__ = globals()["__vsc_ipynb_file__"]

# Put the chapter's exercises directory on the import path (only once).
if f"{root}/{chapter}/exercises" not in sys.path:
    sys.path.append(f"{root}/{chapter}/exercises")

# Switch the working directory to the exercises folder; the code further down
# locates the chapter by walking up from the current working directory.
os.chdir(f"{root}/{chapter}/exercises")

import os
import random
import re
import sys
from functools import partial
from pathlib import Path
from pprint import pprint
from typing import Any, Literal

from anthropic import Anthropic
from dotenv import load_dotenv
from openai import OpenAI

# Make sure exercises are in the path
chapter = "chapter3_llm_evals"
section = "part3_running_evals_with_inspect"
# First ancestor of the working directory that contains the chapter folder
# (raises StopIteration if no ancestor does).
root_dir = next(p for p in Path.cwd().parents if (p / chapter).exists())
exercises_dir = root_dir / chapter / "exercises"
section_dir = exercises_dir / section
if str(exercises_dir) not in sys.path:
    sys.path.append(str(exercises_dir))

# Imported by package name; presumably resolved through the exercises_dir
# entry on sys.path added just above.
import part3_running_evals_with_inspect.tests as tests

# True when this code is executed as the top-level module.
MAIN = __name__ == "__main__"


# OPENAI_API_KEY and ANTHROPIC_API_KEY
# Both clients are constructed without explicit credentials; presumably they
# read the keys named above from the environment — confirm against the SDKs.

openai_client = OpenAI()
anthropic_client = Anthropic()

# **Load ARC Challenge Dataset**

In [None]:
from datasets import load_dataset
%pip install --upgrade datasets

model_id = "ft:gpt-4.1-nano-2025-04-14:algoverse:arc-100-v1:BiRcF3ec"
temperature = 0
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from inspect_ai.dataset import Sample, Dataset, hf_dataset
from inspect_ai.model import ChatMessageUser, ChatMessageSystem

# Define the function to map dataset records to Sample objects
def record_to_sample_arc_adapted(record: dict) -> Sample:
    """
    Maps a record from the ARC Challenge dataset to a Sample object, including choices.

    Args:
        record: A dataset row providing a "question" string, a "choices" dict
            with parallel "label" and "text" lists, and an "answerKey" naming
            the label of the correct option.

    Returns:
        A Sample whose input is a single user message (the question, a blank
        line, then one "<label>. <text>" line per option), whose `choices` are
        those same "<label>. <text>" strings, whose target is the *text* of the
        option whose label equals `answerKey`, and whose metadata keeps the
        original answer key and choices dict.
    """
    choices = record["choices"]
    answer_key = record["answerKey"]

    # Build the "<label>. <text>" strings once; they are used both in the
    # prompt and as the Sample's `choices`.
    labelled_options = [f"{label}. {text}" for label, text in zip(choices["label"], choices["text"])]
    prompt_text = f"{record['question']}\n\n" + "\n".join(labelled_options)

    # Text of the first option whose label matches the answer key.
    # NOTE(review): stays None when no label matches — confirm Sample accepts
    # a None target, or whether such records should be rejected.
    target = next(
        (text for label, text in zip(choices["label"], choices["text"]) if label == answer_key),
        None,
    )

    return Sample(
        input=[ChatMessageUser(content=prompt_text)],
        target=target,
        choices=labelled_options,
        metadata={"original_answerKey": answer_key, "original_choices": choices},
    )

# Load the ARC Challenge dataset using hf_dataset and the mapping function.
# Every record is passed through record_to_sample_arc_adapted (sample_fields).
arc_dataset_inspect = hf_dataset(
    path="allenai/ai2_arc",
    name="ARC-Challenge",
    sample_fields=record_to_sample_arc_adapted,
    split="train", # Use the train split as in the original attempt
    trust=True,
)

# Pretty-print the first converted sample to verify the structure
# (its attribute dict is printed, followed by a blank separator).
print("--- First converted sample ---")
pprint(arc_dataset_inspect.samples[0].__dict__)
print("\n")


from inspect_ai import Task, eval, task
from inspect_ai.scorer import match, model_graded_fact
from inspect_ai.solver import chain_of_thought, generate, self_critique

@task
def arc_challenge_task() -> Task:
    """
    Build the ARC Challenge evaluation task.

    Returns:
        A Task over `arc_dataset_inspect` whose solver chain is chain-of-thought,
        generate, then self-critique, scored with `model_graded_fact`. Both the
        critique step and the scorer are given the model "openai/gpt-4o-mini".
    """
    helper_model = "openai/gpt-4o-mini"
    solver_steps = [chain_of_thought(), generate(), self_critique(model=helper_model)]
    grader = model_graded_fact(model=helper_model)
    return Task(dataset=arc_dataset_inspect, solver=solver_steps, scorer=grader)

# Now call eval with the task function name
# The @task decorator handles turning the function into a Task object
# (`eval` here is the inspect_ai function imported above, not the builtin.)
# Only the first 10 samples are evaluated (limit=10); logs go to <section_dir>/logs.
log = eval(arc_challenge_task, model="openai/gpt-4o-mini", limit=10, log_dir=str(section_dir / "logs"))

TypeError: Dataset() takes no arguments

In [None]:
# Install or upgrade necessary libraries if not already done.
# This runs ahead of the import statements that follow in this cell.
%pip install --upgrade datasets inspect_ai openai anthropic tabulate wikipedia jaxtyping python-dotenv

# Import necessary libraries
import os
import sys
from pathlib import Path
from pprint import pprint
from datasets import load_dataset
from inspect_ai import Task, eval, task
from inspect_ai.dataset import Dataset, Sample
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import chain_of_thought, generate, self_critique
from inspect_ai.model import ChatMessageUser, ChatMessageSystem
from typing import Any

# Setup paths and environment variables (similar to your initial setup)
IN_COLAB = "google.colab" in sys.modules
chapter = "chapter3_llm_evals"
repo = "ARENA_3.0"
# Root directory: (1) Colab, (2) notebook not in ARENA repo, (3) notebook in ARENA repo
if IN_COLAB:
    root = "/content"
elif repo not in os.getcwd():
    root = "/root"
else:
    # Search the working directory itself as well as its ancestors:
    # `Path.parents` excludes the path it is called on, so running from the
    # repo root would otherwise exhaust the generator (StopIteration).
    root = str(next(p for p in (Path.cwd(), *Path.cwd().parents) if p.name == repo))
section = "part3_running_evals_with_inspect"
section_dir = Path(root) / chapter / "exercises" / section # Assuming this path structure

# Load API keys if in Colab
if IN_COLAB:
    from google.colab import userdata
    for key in ["OPENAI_API_KEY", "ANTHROPIC_API_KEY"]:
        try:
            os.environ[key] = userdata.get(key)
        except Exception:
            # Best-effort: report and continue. `Exception` (not a bare
            # `except:`) lets KeyboardInterrupt / SystemExit propagate.
            print(f"Warning: '{key}' not found in Colab secrets.")



In [None]:
# Define the function to map dataset records to Sample objects
def record_to_sample_arc_adapted(record: dict) -> Sample:
    """
    Maps a record from the ARC Challenge dataset to a Sample object, including choices.

    Args:
        record: A dataset row providing a "question" string, a "choices" dict
            with parallel "label" and "text" lists, and an "answerKey" naming
            the label of the correct option.

    Returns:
        A Sample whose input is a single user message (the question, a blank
        line, then one "<label>. <text>" line per option), whose `choices` are
        those same "<label>. <text>" strings, whose target is the *text* of the
        option whose label equals `answerKey`, and whose metadata keeps the
        original answer key and choices dict.
    """
    choices = record["choices"]
    answer_key = record["answerKey"]

    # Build the "<label>. <text>" strings once; they are used both in the
    # prompt and as the Sample's `choices`.
    labelled_options = [f"{label}. {text}" for label, text in zip(choices["label"], choices["text"])]
    prompt_text = f"{record['question']}\n\n" + "\n".join(labelled_options)

    # Text of the first option whose label matches the answer key.
    # NOTE(review): stays None when no label matches — confirm Sample accepts
    # a None target, or whether such records should be rejected.
    target = next(
        (text for label, text in zip(choices["label"], choices["text"]) if label == answer_key),
        None,
    )

    return Sample(
        input=[ChatMessageUser(content=prompt_text)],
        target=target,
        choices=labelled_options,
        metadata={"original_answerKey": answer_key, "original_choices": choices},
    )

# Load the ARC Challenge dataset
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge")

# Process the 'train' split of the dataset using record_to_sample_arc_adapted
# Convert the 'train' split to a list of dictionaries before processing for robustness
train_records = list(dataset['train'])
arc_samples_adapted = [record_to_sample_arc_adapted(record) for record in train_records]

# Create the inspect_ai dataset object.
# `Dataset` is only the abstract interface: it has no `from_samples`
# constructor and cannot be instantiated directly (see the recorded
# "TypeError: Dataset() takes no arguments"). `MemoryDataset` is the concrete
# in-memory implementation that wraps a list of Samples.
from inspect_ai.dataset import MemoryDataset

arc_dataset_inspect = MemoryDataset(arc_samples_adapted)

# Pretty-print the first converted sample to verify the structure
print("--- First converted sample ---")
pprint(arc_samples_adapted[0].__dict__)
print("\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TypeError: Dataset() takes no arguments

In [None]:
# Define the inspect_ai Task
@task
def arc_challenge_task() -> Task:
    """
    InspectAI Task for the ARC Challenge dataset.

    Returns:
        A Task over `arc_dataset_inspect` (built in the previous cell) whose
        solver chain is chain-of-thought, generate, then self-critique, scored
        with `model_graded_fact`. Both the critique step and the scorer are
        given the model "openai/gpt-4o-mini".
    """
    helper_model = "openai/gpt-4o-mini"
    solver_steps = [chain_of_thought(), generate(), self_critique(model=helper_model)]
    grader = model_graded_fact(model=helper_model)
    return Task(dataset=arc_dataset_inspect, solver=solver_steps, scorer=grader)

# Run the evaluation using inspect_ai
# Ensure the log directory exists before the run writes to it.
log_dir_path = section_dir / "logs"
log_dir_path.mkdir(parents=True, exist_ok=True) # Create log directory if it doesn't exist

# Number of samples to evaluate. Defined once so the progress message and the
# eval() call cannot drift apart; adjust it to evaluate more or fewer samples.
eval_limit = 10

print(f"Running evaluation with inspect_ai on {len(arc_samples_adapted)} samples (limit={eval_limit})...")
# `eval` is the inspect_ai function imported earlier, not the builtin.
log = eval(arc_challenge_task, model="openai/gpt-4o-mini", limit=eval_limit, log_dir=str(log_dir_path))

# You can then inspect the 'log' object to see the evaluation results
print("\nEvaluation completed. Log object available.")
# pprint(log) # Uncomment to print the full log object

NameError: name 'task' is not defined

In [None]:
import os
import sys
import warnings
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules

chapter = "chapter3_llm_evals"
repo = "ARENA_3.0"
branch = "main"

# Install dependencies
try:
    import inspect_ai
except:
    %pip install openai>=1.58.1 anthropic inspect_ai tabulate wikipedia jaxtyping python-dotenv datasets

# Get root directory, handling 3 different cases: (1) Colab, (2) notebook not in ARENA repo, (3) notebook in ARENA repo
root = (
    "/content"
    if IN_COLAB
    else "/root"
    if repo not in os.getcwd()
    else str(next(p for p in Path.cwd().parents if p.name == repo))
)

if Path(root).exists() and not Path(f"{root}/{chapter}").exists():
    if not IN_COLAB:
        !sudo apt-get install unzip
        %pip install jupyter ipython --upgrade

    if not os.path.exists(f"{root}/{chapter}"):
        !wget -P {root} https://github.com/callummcdougall/ARENA_3.0/archive/refs/heads/{branch}.zip
        !unzip {root}/{branch}.zip '{repo}-{branch}/{chapter}/exercises/*' -d {root}
        !mv {root}/{repo}-{branch}/{chapter} {root}/{chapter}
        !rm {root}/{branch}.zip
        !rmdir {root}/{repo}-{branch}

if IN_COLAB:
    from google.colab import output, userdata

    for key in ["OPENAI", "ANTHROPIC"]:
        try:
            os.environ[f"{key}_API_KEY"] = userdata.get(f"{key}_API_KEY")
        except:
            warnings.warn(
                f"You don't have a '{key}_API_KEY' variable set in the secrets tab of your google colab. You have to set one, or calls to the {key} API won't work."
            )

# Handles running code in an ipynb
if "__file__" not in globals() and "__vsc_ipynb_file__" in globals():
    __file__ = globals()["__vsc_ipynb_file__"]

if f"{root}/{chapter}/exercises" not in sys.path:
    sys.path.append(f"{root}/{chapter}/exercises")

os.chdir(f"{root}/{chapter}/exercises")

import os
import random
import re
import sys
from functools import partial
from pathlib import Path
from pprint import pprint
from typing import Any, Literal

from anthropic import Anthropic
from dotenv import load_dotenv
from openai import OpenAI

# Make sure exercises are in the path
chapter = "chapter3_llm_evals"
section = "part3_running_evals_with_inspect"
# First ancestor of the working directory that contains the chapter folder
# (raises StopIteration if no ancestor does).
root_dir = next(p for p in Path.cwd().parents if (p / chapter).exists())
exercises_dir = root_dir / chapter / "exercises"
section_dir = exercises_dir / section
if str(exercises_dir) not in sys.path:
    sys.path.append(str(exercises_dir))

# Imported by package name; presumably resolved through the exercises_dir
# entry on sys.path added just above.
import part3_running_evals_with_inspect.tests as tests

# True when this code is executed as the top-level module.
MAIN = __name__ == "__main__"


# OPENAI_API_KEY and ANTHROPIC_API_KEY
# Both clients are constructed without explicit credentials; presumably they
# read the keys named above from the environment — confirm against the SDKs.

openai_client = OpenAI()
anthropic_client = Anthropic()

from datasets import load_dataset
%pip install --upgrade datasets

model_id = "ft:gpt-4.1-nano-2025-04-14:algoverse:arc-100-v1:BiRcF3ec"
temperature = 0
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge")

from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.model import ChatMessageAssistant, ChatMessageSystem, ChatMessageUser
%pip install --upgrade datasets

def arc_record_to_sample(record: dict[str, Any]) -> Sample:
    """
    Convert one ARC record into a Sample.

    Expected record layout:
        {
            "answerKey": "B",
            "choices": {
                "label": ["A", "B", "C", "D"],
                "text": ["Shady areas increased.", "Food sources increased.", ...]
            },
            "question": "...Which best explains why there were more chipmunks the next year?"
        }

    The question becomes a single user message, the option texts become the
    Sample's choices, and the target is the letter (A, B, C, ...) at the
    position where `answerKey` appears in the label list.
    """
    choice_block = record["choices"]
    option_labels, option_texts = choice_block["label"], choice_block["text"]

    # Position of the correct label, re-expressed as a letter counted from "A"
    answer_position = option_labels.index(record["answerKey"])
    target_letter = chr(ord("A") + answer_position)

    # Input is kept as a list of chat messages
    question_message = ChatMessageUser(content=record["question"])

    return Sample(input=[question_message], choices=option_texts, target=target_letter)


# Rebind `dataset` to the ARC-Challenge validation split loaded through
# hf_dataset, with each record converted by arc_record_to_sample.
dataset = hf_dataset(
    path="allenai/ai2_arc",
    name="ARC-Challenge",
    sample_fields=arc_record_to_sample,
    split="validation",
    trust=True,
)
# Show the attribute dict of the first converted sample to check its structure.
pprint(dataset.samples[0].__dict__)

from inspect_ai.dataset import json_dataset


def record_to_sample(record: dict) -> Sample:
    """
    Build a Sample from one item ("record") of the JSON question dataset.

    Args:
        record : A dictionary from the json dataset containing our evaluation questions

    Returns:
        Sample : The question as a user message (preceded by a system message
        when the record has a non-empty "system" entry), the answer texts as
        choices, "answer_matching_behavior" as target, and metadata holding the
        answer labels, the behavior category and whether a system prompt was used.
    """
    messages = [ChatMessageUser(content=record["question"])]

    # A missing "system" key and an empty string both mean "no system prompt"
    has_system_prompt = record.get("system", "") != ""
    if has_system_prompt:
        messages = [ChatMessageSystem(content=record["system"]), *messages]

    return Sample(
        input=messages,
        target=record["answer_matching_behavior"],
        choices=list(record["answers"].values()),
        metadata={
            "labels": list(record["answers"]),
            "behavior_category": record["behavior_category"],
            "system_prompt": has_system_prompt,
        },
    )


# Edit these variables depending on what you saved yesterday!
# Together they form the file name "<evaluation_target>_<num_qs_saved>_qs.json".
evaluation_target = "power-seeking"
num_qs_saved = 300

# The file is looked up in the part2_dataset_generation folder under exercises_dir;
# every record is converted with record_to_sample.
json_dataset_path = str(exercises_dir / "part2_dataset_generation" / f"{evaluation_target}_{num_qs_saved}_qs.json")
my_dataset = json_dataset(json_dataset_path, record_to_sample)

# Pretty-print the data in the Samples object, so we can see its structure
pprint(my_dataset.samples[0].__dict__)

from inspect_ai.solver import Generate, Solver, TaskState, chain, solver


@solver
def system_message(system_message: str) -> Solver:
    """
    Returns a solve function which adds a system message to the conversation.

    Args:
        system_message : Text of the system message to insert.

    Returns:
        solve : A solve function that inserts the message directly after the
        last existing system message, or at the very start if there is none.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # Slot just past the last system message seen (0 when none exist)
        insert_at = 0
        for position, message in enumerate(state.messages):
            if isinstance(message, ChatMessageSystem):
                insert_at = position + 1

        state.messages.insert(insert_at, ChatMessageSystem(content=system_message))
        return state

    return solve


from inspect_ai.dataset import Dataset
from inspect_ai.scorer import Scorer


@solver
def prompt_template(template: str) -> Solver:
    """
    Returns a solve function which rewrites the user prompt using `template`.

    Args:
        template : Format string applied to the user prompt. It must contain the {prompt} field, which is replaced with the original user prompt, and no other fields.

    Returns:
        solve : A solve function which replaces the user prompt text with the filled-in template
    """
    # The set of {...} fields in the template must be exactly {prompt}
    placeholders = set(re.findall(r"\{.*?\}", template))
    assert placeholders == {r"{prompt}"}, r"Template must include {prompt} field and no others"

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        original_prompt = state.user_prompt.text
        state.user_prompt.text = template.format(prompt=original_prompt)
        return state

    return solve


def test_my_solver(solver: Solver, dataset: Dataset, n: int = 5, scorer: Scorer = match()):
    """
    Run `solver` on the first `n` examples of `dataset` and return the eval log.

    Logs are written to `test_logs/` inside the section directory and can be
    viewed with the `inspect view` command (or the VS Code Inspect extension).

    Args:
        solver : The solver (or solver chain) under test.
        dataset : The dataset to draw examples from.
        n : Maximum number of examples to evaluate.
        scorer : Scorer used to grade the outputs.

    Returns:
        Whatever inspect_ai's `eval` returns for the run.
    """
    # NOTE(review): `match`, `task`, `Task` and `eval` are not imported in this
    # cell; they are expected to already be in scope from an earlier cell.

    @task
    def test_task() -> Task:
        return Task(dataset=dataset, solver=solver, scorer=scorer)

    task_instance = test_task()
    logs_path = str(section_dir / "test_logs")
    return eval(task_instance, model="openai/gpt-4o-mini", limit=n, log_dir=logs_path)


# Example solver chain: wrap the user prompt in a template asking for a
# limerick, then generate. Run it through the helper on `my_dataset`.
# NOTE(review): `generate` is not imported in this cell — it is expected to be
# in scope from an earlier cell.
my_solver = chain(
    prompt_template(template="{prompt}\n\nAnswer in the form of a limerick."),
    generate(),
)
log = test_my_solver(my_solver, my_dataset)


from inspect_ai.scorer import answer
from inspect_ai.solver import Choices


def letters_and_answer_options(choices: Choices) -> tuple[str, str]:
    r"""
    Helper function, returns the string of labels for each option, followed by `choices` formatted as MCQ options.

    Note the order of the returned pair: labels first, formatted options second.

    Example:

        ["choice 1", "choice 2", "choice 3"] -> (
            "A, B, C",
            "A) choice 1\nB) choice 2\nC) choice 3"
        )

    Args:
        choices : The answer options; each element's text is read from its `.value` attribute.

    Returns:
        (letters, options) : `letters` is the comma-separated labels ("A, B, ..."), one per
        choice; `options` has one "<letter>) <text>" line per choice, joined by newlines.
        Both are empty strings when there are no choices.
    """
    # One capital letter per choice, counting up from "A"
    letters = [chr(ord("A") + i) for i in range(len(choices))]

    letters_str = ", ".join(letters)
    options_str = "\n".join(f"{letter}) {choice.value}" for letter, choice in zip(letters, choices))
    return letters_str, options_str


@solver
def multiple_choice_format(template: str = TEMPLATE_MCQ) -> Solver:
    """
    Returns a solve function which modifies the initial prompt to be in the format of an MCQ.

    Args:
        template: The template string to use to modify the user prompt. Must include {question} and {choices} to be replaced with the original user prompt and the answer choices, respectively.

    Returns:
        solve: A solve function which modifies the user prompt with the given template
    """
    tags = set(re.findall(r"\{.*?\}", template))
    assert r"{question}" in tags, "Template must include {question} field"
    assert r"{choices}" in tags, "Template must include {choices} field"
    assert tags - {r"{question}", r"{choices}", r"{letters}"} == set(), "Unexpected field found in template"

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        assert state.choices, "If using MCQ then state must have `choices` field"
        # YOUR CODE HERE - implement the multiple_choice_format solver
        letters, choices = letters_and_answer_options(state.choices)
        state.user_prompt.text = template.format(question=state.user_prompt.text, choices=choices, letters=letters)

        return state

    return solve


# Solver chain: reformat the prompt as a multiple-choice question, then generate.
# The run is scored with answer("letter").
my_solver = chain(
    multiple_choice_format(template=TEMPLATE_MCQ),
    generate(),
)
log = test_my_solver(my_solver, my_dataset, scorer=answer("letter"))

# Check the sample output is in the correct format, and was parsed correctly:
# for the first sample of the first log, the parsed answer must be "A" or "B"
# and the explanation must be exactly "ANSWER: A" or "ANSWER: B".
assert log[0].samples[0].scores["answer"].answer in ["A", "B"]
assert log[0].samples[0].scores["answer"].explanation in ["ANSWER: A", "ANSWER: B"]

from inspect_ai.dataset import example_dataset

# Rebind `dataset` to the "theory_of_mind" example dataset and show the
# attribute dict of its first sample.
dataset = example_dataset("theory_of_mind")
pprint(dataset.samples[0].__dict__)



