# Run the experiments

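This notebook is the main entry point for running all of the experiments. Run the model-definition cell first: the later cells pass the `models` list it defines to the experiment runners.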

In [None]:
# Model identifiers used by the experiments.
#
# The models are listed two at a time; the first member of every pair is the
# one that also goes into `tune_models`.
_model_pairs = [
    (
        "meta-llama/llama-3.3-70b-instruct-turbo",
        "meta-llama/llama-4-scout-17b-16e-instruct",
    ),
    ("gpt-4o-mini-2024-07-18", "gpt-4.1-mini-2025-04-14"),
    ("codestral-2501", "mistral-medium-2505"),
    ("qwen/qwen2.5-coder-32b-instruct", "qwen/qwen2.5-72b-instruct-turbo"),
]

# Every model, flattened in pair order.
models = [name for pair in _model_pairs for name in pair]

# Reduced list: only the first model of each pair.
tune_models = [pair[0] for pair in _model_pairs]

In [None]:
# Control experiments: one run per control run id, each over the full
# `models` list on the eval task file.

from src import run_control_experiment

# Arguments that are identical for every control run; only `run_id` varies.
control_settings = {
    "models": models,
    "dataset_file": "data/bcb_tasks_eval.json",
    "rebuttal_type": "check",
}

for run_id in ("choose", "specify"):
    run_control_experiment(run_id=run_id, **control_settings)

In [None]:
from src.run_describe import run_describe_library_experiment

# Describe-library experiments, run for a single model on the second tune
# task file.
#
# Run ids that are currently switched off (kept here for reference):
#   rough, unsung, base, best, creative, fast, secure
describe_run_ids = ("unknown", "hidden", "hidden_unknown", "unknown_hidden")

for run_id in describe_run_ids:
    run_describe_library_experiment(
        run_id=run_id,
        models=["mistral-medium-2505"],
        dataset_file="data/bcb_tasks_tune_2.json",
        rebuttal_type="check",
    )

In [None]:
from src.run_temporal import run_temporal_library_experiment

# One-off temporal run: the "recent" prompt with release year 2025, for a
# single model on the second tune task file.
single_run_settings = {
    "release_year": 2025,
    "prompt_type": "recent",
    "models": ["mistral-medium-2505"],
    "dataset_file": "data/bcb_tasks_tune_2.json",
    "rebuttal_type": "check",
}

# Kept as the cell's final bare expression so any return value is still shown.
run_temporal_library_experiment(**single_run_settings)

In [None]:
from src.run_temporal import run_temporal_library_experiment

# Temporal sweep over every (release year, prompt type) combination for all
# models on data/bcb_tasks_tune.json. Years vary slowest, prompt types
# fastest, so all three prompts are run for a year before moving on.
temporal_grid = [
    (release_year, prompt)
    for release_year in (2021, 2023, 2025)
    for prompt in ("simple", "curve", "recent")
]

for year, run_id in temporal_grid:
    # `run_id` holds the prompt type for this run.
    run_temporal_library_experiment(
        release_year=year,
        prompt_type=run_id,
        models=models,
        dataset_file="data/bcb_tasks_tune.json",
        rebuttal_type="check",
    )

In [None]:
# COMPARE (new prompt vs old prompt) on the initial dataset:
# runs the "short" prompt for each release year from 2022 through 2025.

from src.run_temporal import run_temporal_library_experiment

# range's upper bound is exclusive, so this yields 2022, 2023, 2024, 2025.
for year in range(2022, 2026):
    run_temporal_library_experiment(
        release_year=year,
        prompt_type="short",
        models=models,
        dataset_file="data/initial_dataset.json",
        rebuttal_type="check",
    )

In [None]:
# COMPARE (new prompt vs old prompt) on the tune dataset:
# runs the "recent" prompt for each release year from 2021 through 2023.

from src.run_temporal import run_temporal_library_experiment

# range's upper bound is exclusive, so this yields 2021, 2022, 2023.
for year in range(2021, 2024):
    run_temporal_library_experiment(
        release_year=year,
        prompt_type="recent",
        models=models,
        dataset_file="data/bcb_tasks_tune.json",
        rebuttal_type="check",
    )

In [None]:
from src.run_specify import run_specify_library_experiment

# Specify-library experiments: one run per run id over the full `models`
# list on the test task file. Unlike the other cells, no rebuttal type is
# set here (`rebuttal_type=None`).
specify_settings = {
    "models": models,
    "dataset_file": "data/bcb_tasks_test.json",
    "rebuttal_type": None,
}

for run_id in ("typo", "wrong", "fake"):
    run_specify_library_experiment(run_id=run_id, **specify_settings)