In [1]:
import os
import random

In [2]:
class TheoryStatus:
    """Remembers the verdict reported for the theory under test."""

    def __init__(self) -> None:
        # Stays None until theory_status() has been called at least once.
        self.status = None

    def theory_status(self, success: bool) -> bool:
        """
        Log the theory status. This should be used at the very end.

        The reported value is stored on ``self.status`` so it can be read
        back after the call.

        Args:
            success: A boolean indicating whether the theory was successful.
        Returns:
            A boolean indicating whether the logging was successful
            (always True).
        """
        self.status = success
        logged = True
        return logged

In [3]:
# Function declarations describing each tool: its name, a description, and a
# JSON-schema style "parameters" object.
schemas = [
    # find_element(description)
    {
        "name": "find_element",
        "description": "Find an element on a web page.",
        "parameters": {
            "type": "object",
            "properties": {
                "description": {
                    "type": "string",
                    "description": "A description of the element to find.",
                },
            },
            "required": ["description"],
        },
    },
    # click(coordinates)
    {
        "name": "click",
        "description": (
            "Clicking on a web page, after clicking, "
            "will automatically wait for all elements to load."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "coordinates": {
                    "type": "array",
                    "items": {"type": "integer"},
                    "description": "A tuple containing the x and y coordinates to click.",
                },
            },
            "required": ["coordinates"],
        },
    },
    # describe() takes no arguments
    {
        "name": "describe",
        "description": "Description of the current web page.",
        "parameters": {"type": "object", "properties": {}, "required": []},
    },
    # theory_status(success)
    {
        "name": "theory_status",
        "description": "Log the theory status. This should be used at the very end.",
        "parameters": {
            "type": "object",
            "properties": {
                "success": {
                    "type": "boolean",
                    "description": "A boolean indicating whether the theory was successful.",
                },
            },
            "required": ["success"],
        },
    },
]

In [4]:
def find_element(description: str) -> tuple[int, int]:
    """
    Find an element on a web page.

    This is a mock: ``description`` is not inspected and the coordinates are
    drawn at random, each from 0 to 1000 inclusive.

    Args:
        description: A description of the element to find.

    Returns:
        A tuple containing the x and y coordinates of the element.
    """
    x = random.randint(0, 1000)
    y = random.randint(0, 1000)
    return x, y

def click(coordinates: tuple[int, int]) -> bool:
    """
    Clicking on a web page, after clicking, will automatically wait for all
    elements to load.

    This is a mock: ``coordinates`` is not used and every click is reported
    as successful.

    Args:
        coordinates: A tuple containing the x and y coordinates to click.

    Returns:
        A boolean indicating whether the click was successful.
    """
    succeeded = True
    return succeeded

def describe() -> str:
    """
    Description of the current web page.

    This is a mock: the caption is picked at random from a fixed set and
    does not depend on any earlier tool call.

    Returns:
        A string representing the caption.
    """
    # Canned captions standing in for a real page description.
    mock_captions = (
        "A login screen with a username and password field.",
        "A search bar with a search button.",
        "A product page for a new smartphone by Apple, its price is $999.",
        "A news article about an Earthquake in California.",
    )
    chosen = random.choice(mock_captions)
    return chosen

# A single TheoryStatus instance is shared with the agent through its bound
# method, so the verdict reported via theory_status() can be read back from
# `ts.status` once the run has finished.
ts = TheoryStatus()

# Callables exposed to the agent, in the same order as their entries in
# `schemas`.
tools = [
    find_element,
    click,
    describe,
    ts.theory_status,
]

In [5]:
# Read the Gemini API key from the environment so the secret never lives in
# the notebook. Set GEMINI_API_KEY before starting the kernel; a missing
# variable raises KeyError here instead of failing later inside the agent.
# NOTE(review): the literal key that used to be committed on this line must be
# treated as compromised and revoked.
key = os.environ["GEMINI_API_KEY"]

In [6]:
from gemini import GeminiAgent

In [7]:
# Build the agent from the tool callables, their schema declarations, and the API key.
agent = GeminiAgent(tools, schemas, key)

In [8]:
# Prompt for the agent: the theory to verify, followed by the instruction to
# start using the tools. The leading and trailing newlines are part of the text.
message = (
    "\n"
    "Theory: can find login button, click it, and the resulting page should be a login screen.\n"
    "Start executing test using tools.\n"
)

In [9]:
# Reset the agent before starting the run.
# NOTE(review): presumably clears state left over from earlier runs in this kernel — confirm against GeminiAgent.reset.
agent.reset()

In [10]:
# Run the agent on the prompt and keep the response object it returns.
# NOTE(review): recursion=8 presumably bounds the number of tool-call rounds — confirm against GeminiAgent.run.
res = agent.run(message, recursion=8)



In [11]:
# Display the raw response object returned by the run.
res

GenerateContentResponse(candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=FunctionCall(id=None, args={'success': False}, name='theory_status'), function_response=None, inline_data=None, text=None)], role='model'), citation_metadata=None, finish_message=None, token_count=None, finish_reason=<FinishReason.STOP: 'STOP'>, avg_logprobs=-0.009817003272473812, grounding_metadata=None, index=None, logprobs_result=None, safety_ratings=None)], create_time=None, response_id=None, model_version='gemini-2.0-flash', prompt_feedback=None, usage_metadata=GenerateContentResponseUsageMetadata(cache_tokens_details=None, cached_content_token_count=None, candidates_token_count=4, candidates_tokens_details=[ModalityTokenCount(modality=<MediaModality.TEXT: 'TEXT'>, token_count=4)], prompt_token_count=245, prompt_tokens_details=[ModalityTokenCount(modality=<MediaModality.TEXT: 'TEXT'>, token_coun

In [12]:
# Text of the final response. In the recorded run the final part is a
# theory_status function call with text=None, and this cell displayed nothing.
res.text



In [13]:
# NOTE(review): presumably a flag showing the agent stopped its tool loop — confirm in GeminiAgent. Recorded value: True.
agent.halt

True

In [14]:
# Conversation history held by the agent: the user prompt followed by the model's text and function-call turns.
agent.contents

[Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text='\nTheory: can find login button, click it, and the resulting page should be a login screen.\nStart executing test using tools.\n')], role='user'),
 Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text="Okay, let's start by finding the login button.\n")], role='model'),
 Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=FunctionCall(id=None, args={'description': 'login button'}, name='find_element'), function_response=None, inline_data=None, text=None)], role='model'),
 Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=N

In [15]:
# Verdict stored by the theory_status tool call (None if it was never called).
# describe() returns a random caption, so the result can differ between runs.
ts.status

False