In [1]:
%load_ext autoreload
%autoreload 2

import asyncio
import json
import os
import traceback
from dotenv import load_dotenv
load_dotenv()

from hud import HUDClient
from hud.adapters.claude.adapter import ClaudeAdapter
from agent.claude import ClaudeAgent
from agent.async_claude import AsyncClaudeAgent

from anthropic import Anthropic, AsyncAnthropic
from pprint import pprint

In [2]:
# Build the HUD client; the API key is read from the HUD_API_KEY environment variable
client = HUDClient(api_key=os.environ.get("HUD_API_KEY"))

# Build the async Anthropic client that is later handed to the Claude Computer Use agent
anthropic = AsyncAnthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

In [None]:
# Load the gym registered under the id "OSWorld-Ubuntu"
gym = await client.load_gym(id="OSWorld-Ubuntu")

# Load the evalset registered under the id "OSWorld-Ubuntu-Links"
evalset = await client.load_evalset(id="OSWorld-Ubuntu-Links")

# Create a run that will host all evaluations for this gym/evalset pair
run = await client.create_run(name="Claude-test-OSWorld", gym=gym, evalset=evalset)

# Fetch all task ids from the run; each id is later passed to agent_loop
task_ids = await run.fetch_task_ids()
print(f"Total tasks in OSWorld: {len(task_ids)}")


In [5]:
# Agent-loop configuration

# Upper bound on the number of predict/step iterations per task
MAX_STEPS = 30

# At most this many coroutines may be creating/resetting an environment at once
MAX_CONCURRENT_ENV_SETUPS = 30
semaphore = asyncio.Semaphore(MAX_CONCURRENT_ENV_SETUPS)

# At most this many Anthropic API calls may be in flight at once (avoids rate limiting)
MAX_CONCURRENT_ANTHROPIC_CALLS = 3
anthropic_semaphore = asyncio.Semaphore(MAX_CONCURRENT_ANTHROPIC_CALLS)

def _log_run_error(task_id: str, phase: str, exc: BaseException) -> None:
    """Append one JSON line describing ``exc`` to ``run_errors.ndjson``."""
    error_log = {
        'task_id': task_id,
        'phase': phase,
        'error': str(exc),
        'traceback': ''.join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        ),
    }
    with open('run_errors.ndjson', 'a') as f:
        f.write(json.dumps(error_log) + '\n')


async def agent_loop(run, task_id: str):
    """Run a single task: set up an environment, let the agent act, evaluate, close.

    Args:
        run: Object whose ``make()`` coroutine returns the environment to use.
        task_id: Id of the task the environment is reset to.

    Raises:
        Exception: Re-raised when initialization or evaluation fails. A failure
            of ``env.close()`` is raised only when everything before it
            succeeded, so it never masks an earlier error.

    Every failure is appended to ``run_errors.ndjson`` together with the phase
    ('initialization', 'agent_loop', 'evaluation' or 'cleanup') it occurred in.
    Errors inside a single agent step are logged and the loop moves on to the
    next iteration, reusing the last observation that was obtained.
    """
    # initialize adapter to interact with the environment
    cua_adapter = ClaudeAdapter()
    agent = AsyncClaudeAgent(anthropic)

    env = None
    succeeded = False
    try:
        # Section 1: Environment initialization
        async with semaphore:
            try:
                print("Creating Environment")
                env = await run.make()
                await env.wait_for_ready()
                print("Resetting Environment")
                obs = await env.reset(task_id=task_id)
                print(f"Task description: {obs.text}")
            except Exception as e:
                _log_run_error(task_id, 'initialization', e)
                raise

        # Section 2: Agent loop
        for i in range(MAX_STEPS):
            try:
                # rescale screenshot to Claude's resolution
                screenshot = cua_adapter.rescale(obs.screenshot)

                # agent's next action
                async with anthropic_semaphore:
                    done, response = await agent.predict(screenshot, obs.text)
                    if done:
                        env.final_response = str(response)
                        break

                # convert to HUD action space
                actions = cua_adapter.adapt_list([response])
                print(f"Agent's action: {response}")

                # step the environment forward
                obs, _reward, terminated, _info = await env.step(actions)

                # drop out if terminated
                if terminated:
                    break
                print(f"Step {i+1} completed")
            except Exception as e:
                # best effort: record the failure and try the next step
                _log_run_error(task_id, 'agent_loop', e)

        # Section 3: Evaluation
        try:
            result = await env.evaluate()
            print(f"Evaluation result: {result}")
        except Exception as e:
            _log_run_error(task_id, 'evaluation', e)
            raise

        succeeded = True
    finally:
        # Section 4: Cleanup -- runs on every exit path once an environment
        # exists, so a failed reset/evaluation no longer leaks the environment.
        if env is not None:
            try:
                await env.close()
            except Exception as e:
                _log_run_error(task_id, 'cleanup', e)
                # Only surface the cleanup error when nothing else failed;
                # otherwise let the original exception keep propagating.
                if succeeded:
                    raise

In [None]:
# Run multiple agents in parallel
await asyncio.gather(*[agent_loop(run, task_id) for task_id in task_ids], return_exceptions=True)

In [None]:
# Fetch the analytics for `run` and pretty-print them
analytics = await run.get_analytics()
pprint(analytics)

In [None]:
# Summarise how many tasks in the analytics payload ended up with a reward
analytics_tasks = analytics.raw_data["tasks"]

# Get total count of tasks
total_tasks = len(analytics_tasks)

# Count tasks with non-null rewards
tasks_with_rewards = sum(1 for task in analytics_tasks if task["reward"] is not None)

print(f"Total tasks: {total_tasks}")
print(f"Tasks with rewards: {tasks_with_rewards}")

In [5]:
run = await client.load_run(id="0eb4bab7-501b-46b6-a6a4-bf73a939619b")
trajectories = await run.get_trajectories()