In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the API keys
# are read back later with os.getenv.
load_dotenv()

from pprint import pprint
from hud import HUDClient
from hud.adapters.claude.adapter import ClaudeAdapter
from agent.claude import ClaudeAgent

from anthropic import Anthropic

In [2]:
# HUD API client, authenticated with the key from the environment
client = HUDClient(api_key=os.environ.get("HUD_API_KEY"))

# Anthropic API client that the Claude Computer Use agent will call
anthropic = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

# Adapter between Claude's output and the HUD environment
# (used later for screenshot rescaling and action conversion)
cua_adapter = ClaudeAdapter()

In [3]:
# load OSWorld environment
gym = await client.load_gym(id="OSWorld-Ubuntu")

# load OSWorld evalset
evalset = await client.load_evalset(id="OSWorld-Ubuntu-Links")

# create a run that will host all evaluations
run = await client.create_run(name="Parth-Credentials-Test", gym=gym, evalset=evalset)

# fetch all task ids from the run
tasks = await run.fetch_task_ids()
print(f"Total tasks in OSWorld: {len(tasks)}")

Total tasks in OSWorld: 369


In [None]:
from pprint import pprint
# fetch all task ids from the run
tasks = await run.fetch_task_ids()
print(f"Total tasks in OSWorld: {len(tasks)}")
pprint(tasks)

In [32]:
# It may take around ~1 minute to initialize the OSWorld environment and reset to a task

# make a HUD environment
env = await run.make()
await env.wait_for_ready()

# # reset to a task with an observation (screenshot and text)
obs = await env.reset(task_id=tasks[1])
print(f"Task description: {obs.text}")

Task description: Change the representation of colum "Parameter" and show in Millions (M) and Billions (B). Keep one decimal and place a white space between the digits and the unit.


In [33]:
# load VNC server
live_url = await env.get_vnc_url()
live_url = f"http://{live_url}:5910/vnc.html"

client.display_stream(live_url)

In [16]:
# Display the full VNC URL as the cell's output (e.g. to open it in a separate tab)
live_url


In [6]:
# agent loop
agent = ClaudeAgent(anthropic)

for i in range(40):
    # rescale screenshot to Claude's resolution
    screenshot = cua_adapter.rescale(obs.screenshot)

    # agent's next action
    done, response = await agent.predict(screenshot, obs.text)
    if done:
        env.final_response = str(response)
        break

    # convert to HUD action space
    actions = cua_adapter.adapt_list([response])
    print(f"Agent's action: {response}")

    # step the environment forward
    obs, reward, terminated, info = await env.step(actions)

    # drop out if terminated
    if terminated:
        break
    print(f"Step {i+1} completed")


Agent's action: {'action': 'screenshot'}
Step 1 completed
Agent's action: {'action': 'left_click', 'coordinate': [46, 54]}
Step 2 completed
Agent's action: {'action': 'left_click', 'coordinate': [70, 323]}
Step 3 completed
Agent's action: {'action': 'left_click', 'coordinate': [826, 35]}
Step 4 completed
Agent's action: {'action': 'left_click', 'coordinate': [978, 32]}
Step 5 completed
Agent's action: {'action': 'left_click', 'coordinate': [254, 421]}
Step 6 completed
Agent's action: {'action': 'double_click', 'coordinate': [254, 421]}
Step 7 completed
Agent's action: {'action': 'left_click', 'coordinate': [96, 179]}
Step 8 completed
Agent's action: {'action': 'left_click', 'coordinate': [96, 209]}
Step 9 completed
Agent's action: {'action': 'left_click', 'coordinate': [80, 370]}
Step 10 completed
Agent's action: {'action': 'left_click', 'coordinate': [988, 583]}
Step 11 completed
Agent's action: {'action': 'left_click', 'coordinate': [96, 281]}
Step 12 completed
Agent's action: {'acti

In [None]:
# evaluate environment state
result = await env.evaluate()
print(f"Evaluation result: {result}")
pprint(result.__dict__)

Evaluation result: logs="INFO: Starting evaluation...\nINFO: Evaluating task 22a4636f-8179-4357-8e87-d1743ece1f81...\nINFO: Evaluator configuration:\nINFO:   Metric function(s): compare_pdfs\nINFO:   Metric conjunction: and\nINFO:   Result getter: get_googledrive_file\nINFO:   Expected getter: get_cloud_file\nINFO:   Metric options: {}\nINFO: Setting up post-config for evaluation...\nINFO: Evaluating single metric: compare_pdfs\nINFO: Getting result state using config: {'dest': 'Meeting-Agenda.pdf', 'path': ['meetings', 'Meeting-Agenda.pdf'], 'type': 'googledrive_file', 'settings_file': 'evaluation_examples/settings/googledrive/settings.yml'}\nINFO: Current file path: /app/osworld-hud/desktop_env/evaluators/getters/chrome.py\nINFO: Relative file path: ../osworld-hud/desktop_env/evaluators/getters/chrome.py\nINFO: access_token is expired. Now: 2025-04-07 15:56:00.236649, token_expiry: 2025-04-07 08:26:22\nINFO: Refreshing access_token\nINFO: file_cache is only supported with oauth2clien

In [9]:
# Close the environment once the episode is finished.
# NOTE(review): presumably this releases the remote OSWorld instance - confirm against the HUD SDK.
await env.close()

In [34]:
# Fetch the analytics for the whole run and print its text summary
# (progress, status distribution and scores, as shown in the output below).
analytics = await run.get_analytics()
print(analytics)

Run: Parth-Credentials-Test (ID: f2e98d46-c6c0-478d-9752-ca54ce425735)
Created: 2025-04-07 15:43:46
------------------------------------------------------------
Progress: 1/5 tasks completed (
            20.0% completion rate)

Status Distribution:
running   : █████████████████████████████████████████ 5 (83.3%)
terminating: ████████ 1 (16.7%)

Average Score: 1.00
Score:  1.00/1.00
