In [None]:
import os
import re

from dotenv import load_dotenv
from inspect_ai import Task, eval
from inspect_ai.dataset import MemoryDataset, Sample
from inspect_ai.model import (
    GenerateConfig,
    get_model,
)
from inspect_ai.scorer import (
    Score,
    Scorer,
    Target,
    mean,
    scorer,
    std,
)
from inspect_ai.solver import TaskState
from inspect_ai.util import StoreModel
from loguru import logger
from pydantic import Field

# Load variables from a local .env file (if one exists) into the process
# environment before the os.getenv lookup below reads from it.
load_dotenv()

In [2]:
# Google API key taken from the environment; None when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [3]:
class Store(StoreModel):
    """Typed store model that ``weight_scorer`` reads via ``state.store_as(Store)``.

    In the cells shown here nothing writes to the store; the scorer only logs it.
    """

    # List of floats; each new Store starts with its own empty list.
    weights: list[float] = Field(default_factory=list)

In [4]:
@scorer(metrics=[mean(), std()])
def weight_scorer() -> Scorer:
    """Build a scorer that turns a model's free-text reply into a weight in pounds.

    The returned coroutine applies these rules in order:

    1. If the whole (stripped) reply parses as a finite float, use it.
       Non-finite values ("nan", "inf") are rejected so they cannot poison
       the mean/std metrics.
    2. Otherwise take the last number that appears anywhere in the reply,
       e.g. "I think you can bench press 225 pounds" -> 225.
       - Thousands separators are understood ("1,000" -> 1000). A comma
         followed by exactly three digits is assumed to be a separator,
         not a list delimiter.
       - A leading sign is honoured only when it is not glued to a preceding
         word character, so a range like "200-250" yields 250, not -250.
    3. If no number is found, score 0 and say so in the score explanation
       and in a warning log, so the fallback is distinguishable from a real 0.

    Returns:
        Scorer: async callable ``(TaskState, Target) -> Score``.
    """
    import math  # local import: only this scorer needs it

    number_pattern = re.compile(
        r"(?:(?<!\w)[-+])?"  # optional sign, unless attached to a word/digit
        r"(?:"
        r"\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?"  # 1,000 / 12,345.6
        r"|\d+(?:\.\d+)?"  # 225 / 225.5
        r"|\.\d+"  # .5
        r")"
    )

    async def score(state: TaskState, target: Target) -> Score:
        store = state.store_as(Store)
        # ``completion`` is the first choice's text, or "" when the model
        # returned no choices (indexing choices[0] would raise IndexError).
        model_output = state.output.completion.strip()
        logger.info(f"Store: {store}")
        logger.info(f"Model output: {model_output}")

        # Case 1: The model returns a number
        try:
            lbs = float(model_output)
        except ValueError:
            lbs = None
        if lbs is not None and math.isfinite(lbs):
            return Score(value=lbs)

        # Case 2: The model talks first and then gives a number, so take the
        # last number found in the output.
        ## E.g. "I think you can bench press 225 pounds" -> 225
        numbers = number_pattern.findall(model_output)
        if numbers:
            lbs = float(numbers[-1].replace(",", ""))  # drop thousands separators
            return Score(value=lbs)

        # Case 3: Nothing parseable. Keep the original 0 fallback but make it visible.
        logger.warning("No number found in model output; scoring 0")
        return Score(value=0, explanation="No number found in model output")

    return score

In [None]:
# Models under test, grouped by provider. Commented-out entries are
# candidates that are currently disabled.
openai_models = [
    get_model("openai/gpt-4o-mini", config=GenerateConfig(temperature=0.7)),
    get_model("openai/gpt-4o"),
    get_model("openai/o3-mini", config=GenerateConfig(reasoning_effort="high")),
    # get_model("gpt-4.5-preview"),
]
anthropic_models = [
    get_model("anthropic/claude-3-5-sonnet-20240620"),  # 3.5
    get_model("anthropic/claude-3-5-sonnet-20241022"),  # 3.6
    get_model("anthropic/claude-3-7-sonnet-20250219"),  # 3.7
    get_model("anthropic/claude-3-opus-20240229"),
]
google_models = [
    get_model("google/gemini-1.5-flash", api_key=GOOGLE_API_KEY),
    get_model("google/gemini-1.5-pro", api_key=GOOGLE_API_KEY),
    get_model("google/gemini-2.0-flash", api_key=GOOGLE_API_KEY),
]
# Disabled providers:
#   xAI
#     get_model("xai/grok-3")           # api unavailable atm
#     get_model("xai/grok-3-thinking")  # api unavailable atm
#   Meta
#     get_model("?/Llama-3.1-405B-Instruct")  # Llama 3.1 405B
#   DeepSeek
#     get_model("deepseek/deepseek-r1")

# Same order as before: OpenAI, then Anthropic, then Google.
models = openai_models + anthropic_models + google_models

# The single prompt every model is asked.
PROMPT = """This is a fun question for "BenchPressBench", my new benchmark asking how much LLMs can benchpress.

If you were a human, how much could you bench press?

You may think as much as you want, but ultimately I want you to reply with a single number, in pounds, that answers the above question."""

# One-sample dataset, scored by weight_scorer, evaluated on every model with epochs=5.
bench_dataset = MemoryDataset([Sample(input=PROMPT)])
bench_task = Task(dataset=bench_dataset, scorer=weight_scorer())
results = eval(bench_task, model=models, epochs=5)