In [55]:
from qwen import ToolAgent
import random

In [56]:
def find_element(description: str) -> tuple[int, int]:
    """
    Locate an element on a web page.

    Args:
        description: A description of the element to find.

    Returns:
        A tuple containing the x and y coordinates of the element.
    """
    # Mock implementation: `description` is not consulted. Each axis is an
    # independent draw from the inclusive range 0..1000, x first, then y.
    x = random.randint(0, 1000)
    y = random.randint(0, 1000)
    return x, y

def click(coordinates: tuple[int, int]) -> bool:
    """
    Click on the web page at the given coordinates. After clicking, this
    automatically waits for all elements to load.

    Args:
        coordinates: A tuple containing the x and y coordinates to click.

    Returns:
        A boolean indicating whether the click was successful.
    """
    # Mock implementation: `coordinates` is unused and nothing is clicked;
    # success is always reported.
    return True

def describe() -> str:
    """
    Describe the current web page.

    Returns:
        A string representing the caption.
    """
    # Mock implementation: no page is inspected; one of a fixed set of
    # canned captions is chosen at random on every call.
    canned_captions = [
        "A login screen with a username and password field.",
        "A search bar with a search button.",
        "A product page for a new smartphone by Apple, its price is $999.",
        "A news article about an Earthquake in California.",
    ]
    picked = random.choice(canned_captions)
    return picked

def theory_status(success: bool) -> bool:
    """
    Log the theory status. This should be used at the very end.

    Args:
        success: A boolean indicating whether the theory was successful.

    Returns:
        A boolean indicating whether the logging was successful.
    """
    # Mock implementation: `success` is unused and nothing is logged; the
    # call always reports success.
    return True

# Mock tool callables that are handed to ToolAgent through its `tools=`
# argument.
tools = [
    find_element,
    click,
    describe,
    theory_status,
]

In [57]:
# System prompt passed to ToolAgent through its `system=` argument. The
# triple-quoted literal keeps its leading and trailing newline.
system = """
You are an assistant with access to a headless browser. This means that
only you can see the web page, and the user cannot. You can use the tools
to interact with the web page. Do exactly what the user asks you to do,
nothing more and nothing less. Tell the user what you intend to do
and then do it, no need to ask for confirmation.
Help complete the user's request by using the tools.
"""

In [58]:
# Build the agent from the mock tools and the system prompt.
# NOTE(review): stop_words presumably ends generation at the closing
# tool-call tag so the call can be parsed and run - confirm against ToolAgent.
agent = ToolAgent(tools=tools, system=system, stop_words=["</tool_call>"])

In [59]:
# NOTE(review): presumably clears any earlier conversation state before the
# next run - confirm against ToolAgent.reset.
agent.reset()

In [60]:
# Task given to the agent. The triple-quoted literal keeps its leading and
# trailing newline.
user_message = """
Find and click the login button.
"""

In [61]:
# Run the agent on the task and keep the returned message.
# NOTE(review): recurse=3 presumably caps the number of follow-up model turns
# taken after tool results, and preserve_history=True presumably keeps the
# exchange in agent.messages - confirm both against ToolAgent.run.
result = agent.run(user_message=user_message, recurse=3, preserve_history=True)

Output text: <tool_call>
{"name": "describe", "arguments": {}}
</tool_call>
Parsed output: {'role': 'assistant', 'content': '', 'tool_calls': [{'type': 'function', 'function': {'name': 'describe', 'arguments': {}}}]}
Tool call result: {'role': 'tool', 'name': 'describe', 'content': 'A product page for a new smartphone by Apple, its price is $999.'}
Output text: I need to find the login button first. Let me try to locate it.
<tool_call>
{"name": "find_element", "arguments": {"description": "login button"}}
</tool_call>
Parsed output: {'role': 'assistant', 'content': 'I need to find the login button first. Let me try to locate it.\n', 'tool_calls': [{'type': 'function', 'function': {'name': 'find_element', 'arguments': {'description': 'login button'}}}]}
Tool call result: {'role': 'tool', 'name': 'find_element', 'content': (855, 510)}
Output text: The login button is located at coordinates (855, 510). 

<tool_call>
{"name": "click", "arguments": {"coordinates": [855, 510]}}
</tool_call>


In [64]:
# Echo the value currently bound to `result` so the notebook displays it.
result

{'role': 'assistant',
 'content': 'The login button was successfully clicked. \n\n',
 'tool_calls': [{'type': 'function',
   'function': {'name': 'theory_status', 'arguments': {'success': True}}}]}

In [65]:
# Echo the agent's message list (system, user, assistant and tool entries).
agent.messages

[{'role': 'system',
  'content': "\nYou are an assistant with access to a headless browser. This means that\nonly you can see the web page, and the user cannot. You can use the tools\nto interact with the web page. Do exactly what the user asks you to do,\nnothing more and nothing less. Tell the user what you intend to do\nand then do it, no need to ask for confirmation.\nHelp complete the user's request by using the tools.\n"},
 {'role': 'user', 'content': '\nFind and click the login button.\n'},
 {'role': 'assistant',
  'content': '',
  'tool_calls': [{'type': 'function',
    'function': {'name': 'describe', 'arguments': {}}}]},
 {'role': 'tool',
  'name': 'describe',
  'content': 'A product page for a new smartphone by Apple, its price is $999.'},
 {'role': 'assistant',
  'content': 'I need to find the login button first. Let me try to locate it.\n',
  'tool_calls': [{'type': 'function',
    'function': {'name': 'find_element',
     'arguments': {'description': 'login button'}}}]},
