## Installation

Note: we are pinning `inspect_ai` due to a recent breaking change.

In [2]:
# Quietly (-q) install the listed packages into the kernel's environment.
# NOTE(review): inspect_ai is not in this list and no version is pinned here,
# although the note above mentions pinning it — confirm it is installed and
# pinned elsewhere.
%pip install -q openai anthropic ipywidgets colorama
import os
# Set XDG_RUNTIME_DIR to /tmp for this kernel process.
# NOTE(review): presumably required by a tool used later in this environment — confirm.
os.environ['XDG_RUNTIME_DIR']="/tmp"
# Project-local helper used by the cells below to format eval results for printing.
from helpers.reporter.pretty import pretty_results


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Adding tools

In [3]:
from inspect_ai import Task, task, eval
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate, use_tools
from inspect_ai.scorer import model_graded_fact

from inspect_ai.tool import bash, text_editor

@task
def basic_tools() -> Task:
    """Single-sample task asking the model to create hello-world.js with tools.

    The solver chain first registers the bash and text_editor tools through
    use_tools() and then calls generate(). The result is scored by
    model_graded_fact() against the sample's target, and the task declares
    the "docker" sandbox.
    """
    hello_world_sample = Sample(
        input="Generate a javascript file name hello-world.js. Make sure to check the file was successfully created.",
        target="The generated code should have the filename hello-world.js",
    )

    # Make the shell and file-editing tools available before generation.
    tool_setup = use_tools(bash(), text_editor())

    return Task(
        dataset=[hello_world_sample],
        solver=[tool_setup, generate()],
        scorer=[model_graded_fact()],
        sandbox="docker",  # run the sample inside a Docker sandbox container
    )

# Evaluate the task with info-level logging and display="none",
# then print the results formatted by the pretty_results helper.
results = eval(
    basic_tools,
    log_level="info",
    display="none",
)
print(pretty_results(results))

Output()


Status: success Model: openai/gpt-4o-mini
/workspaces/workshop-testing/lessons/04-testing/logs/2025-05-07T11-10-05+00-00_basic-tools_J8DnLUHknMYH9V6CPVxUCr.eval
input : Generate a javascript file name hello-world.js. Make sure to check the file was successfully created.
target: The generated code should have the filename hello-world.js
[33m user       [39m> Generate a javascript file name hello-world.js. Make sure to check the file was successfully created.
[33m assistant [tool:text_editor] [39m> {'command': 'create', 'path': '/repo/hello-world.js', 'file_text': "console.log('Hello, World!');"}
[33m assistant  [39m> 
[33m tool[text_editor] [39m> 
[33m assistant [tool:bash] [39m> {'cmd': 'mkdir -p /repo'}
[33m assistant  [39m> 
[33m tool[bash] [39m> 
[33m assistant [tool:text_editor] [39m> {'command': 'create', 'path': '/repo/hello-world.js', 'file_text': "console.log('Hello, World!');"}
[33m assistant  [39m> 
[33m tool[text_editor] [39m> File created successfully a

In [None]:
from inspect_ai import Task, task, eval
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate
from inspect_ai.scorer import model_graded_fact

from inspect_ai.agent import react
from inspect_ai.tool import bash, text_editor, web_browser

@task
def react_basic_tools() -> Task:
    """Single-sample task asking the model to create hello-world.js via a react() agent.

    The only solver is a react() agent that is handed the bash and text_editor
    tools. The result is scored by model_graded_fact() against the sample's
    target, and the task declares the "docker" sandbox.
    """
    hello_world_sample = Sample(
        input="Generate a javascript file name hello-world.js. Make sure to check the file was successfully created.",
        target="The generated code should have the filename hello-world.js",
    )

    # The agent receives its tools directly instead of through use_tools().
    agent = react(tools=[bash(), text_editor()])

    return Task(
        dataset=[hello_world_sample],
        solver=[agent],
        scorer=[model_graded_fact()],
        sandbox="docker",  # run the sample inside a Docker sandbox container
    )

# Evaluate the task with info-level logging and display="none",
# then print the results formatted by the pretty_results helper.
results = eval(
    react_basic_tools,
    log_level="info",
    display="none",
)
print(pretty_results(results))

Output()


Status: success Model: openai/gpt-4o-mini
/workspaces/workshop-testing/lessons/04-testing/logs/2025-05-07T11-10-45+00-00_react-basic-tools_TyA8suFjMd5SwBTT9qymnL.eval
input : Generate a javascript file name hello-world.js. Make sure to check the file was successfully created.
target: The generated code should have the filename hello-world.js
[33m system     [39m> 
You are a helpful assistant attempting to submit the best possible answer.
You have several tools available to help with finding the answer. You will
see the result of tool calls right after sending the message. If you need
to perform multiple actions, you can always send more messages with additional
tool calls. Do some reasoning before your actions, describing what tool calls
you are going to use and how they fit into your plan.

When you have completed the task and have an answer, call the submit()
tool to report it.

[33m user       [39m> Generate a javascript file name hello-world.js. Make sure to check the file was

## Web browser

In [6]:
from inspect_ai import Task, task, eval
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate
from inspect_ai.scorer import model_graded_fact

from inspect_ai.agent import react
from inspect_ai.tool import bash, text_editor, web_browser

@task
def react_basic_and_web_tools() -> Task:
    """Single-sample task: create hello-world.js and search the web, via a react() agent.

    The react() agent is handed bash, text_editor and the tools produced by
    web_browser(interactive=False). The result is scored by
    model_graded_fact() against the sample's target, and the task declares
    the "docker" sandbox.
    """
    web_sample = Sample(
        input="Generate a javascript file name hello-world.js. Make sure to check the file was successfully created. Search the web for the best practices to create a javascript file.",
        target="The generated code should have the filename hello-world.js. ",
    )

    local_tools = [bash(), text_editor()]
    # Special case: web_browser() is joined with `+` rather than listed as a
    # single element, so it is expected to return a list of tools itself.
    browser_tools = web_browser(interactive=False)

    agent = react(
        tools=local_tools + browser_tools,
    )

    return Task(
        dataset=[web_sample],
        solver=[agent],
        scorer=[model_graded_fact()],
        sandbox="docker",  # run the sample inside a Docker sandbox container
    )

# Evaluate the task with info-level logging and display="none",
# then print the results formatted by the pretty_results helper.
results = eval(
    react_basic_and_web_tools,
    log_level="info",
    display="none",
)
print(pretty_results(results))

Output()


Status: success Model: openai/gpt-4o-mini
/workspaces/workshop-testing/lessons/04-testing/logs/2025-05-07T11-12-54+00-00_react-basic-and-web-tools_F6eEYPVcL5SX9mkko2wLUj.eval
input : Generate a javascript file name hello-world.js. Make sure to check the file was successfully created. Search the web for the best practices to create a javascript file.
target: The generated code should have the filename hello-world.js. 
[33m system     [39m> 
You are a helpful assistant attempting to submit the best possible answer.
You have several tools available to help with finding the answer. You will
see the result of tool calls right after sending the message. If you need
to perform multiple actions, you can always send more messages with additional
tool calls. Do some reasoning before your actions, describing what tool calls
you are going to use and how they fit into your plan.

When you have completed the task and have an answer, call the submit()
tool to report it.

[33m user       [39m> Ge