Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bc3a176
wip
theomonnom Feb 18, 2026
5d448df
cleanup computer-use: move provider tool defs to anthropic plugin
theomonnom Feb 22, 2026
341f831
move ComputerTool from browser plugin to anthropic plugin
theomonnom Feb 22, 2026
554db10
split action implementations into browser PageActions, anthropic Comp…
theomonnom Feb 22, 2026
a768470
move action-string dispatch to ComputerTool, PageActions is a clean t…
theomonnom Feb 22, 2026
c02f5e7
fix: left_click actually presses modifier keys, type_text sends full …
theomonnom Feb 22, 2026
0855213
PageActions takes x, y ints instead of coordinate sequences
theomonnom Feb 22, 2026
6a0b403
fix: horizontal scroll directions were swapped (left/right inverted)
theomonnom Feb 22, 2026
2e05a25
fix: type_text VK collision with nav keys, broadcast focus on agent r…
theomonnom Feb 22, 2026
6a7525f
fix: type_text matches Chrome keystroke sequence with correct base VK…
theomonnom Feb 22, 2026
d5f9ad6
fix: CEF CHAR events use char code as windows_key_code, fix scroll di…
theomonnom Feb 22, 2026
43a7ec0
feat: add navigate, go_back, go_forward browser tools for LLM agent
theomonnom Feb 22, 2026
193d1e2
simplify nav tool dispatch: drop indirection dict, inline calls
theomonnom Feb 22, 2026
2eb6031
remove dead code from function_tool stubs
theomonnom Feb 22, 2026
f5844d5
cleanup: remove section dividers, rename topics to include browser
theomonnom Feb 22, 2026
aed8b9f
add asyncio.Lock to PageActions to prevent interleaved actions
theomonnom Feb 22, 2026
9980ca9
fix: navigate tool was missing chat context update (broke LLM loop)
theomonnom Feb 22, 2026
220ba4f
refactor: move beta flag mapping to AnthropicTool, out of LLM provider
theomonnom Feb 22, 2026
1bd17f7
Add cursor position broadcasting for AI cursor overlay
theomonnom Feb 23, 2026
5f8bfa8
increase post-action delay to 0.8s for page settling
theomonnom Feb 23, 2026
9638e99
cleanup: remove docstrings from browser_agent
theomonnom Feb 23, 2026
fab3075
style: ruff format
theomonnom Feb 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion livekit-agents/livekit/agents/llm/_provider_format/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,18 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
result_content: list[Any] | str = msg.output
try:
parsed = json.loads(msg.output)
if isinstance(parsed, list):
result_content = parsed
except (json.JSONDecodeError, TypeError):
pass
content.append(
{
"tool_use_id": msg.call_id,
"type": "tool_result",
"content": msg.output,
"content": result_content,
"is_error": msg.is_error,
}
)
Expand Down Expand Up @@ -131,4 +138,5 @@ def to_fnc_ctx(tool_ctx: llm.ToolContext) -> list[dict[str, Any]]:
"input_schema": info.raw_schema.get("parameters", {}),
}
)

return schemas
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,19 @@
See https://docs.livekit.io/agents/integrations/llm/anthropic/ for more information.
"""

from .computer_tool import ComputerTool
from .llm import LLM, LLMStream
from .log import logger
from .models import ChatModels
from .tools import AnthropicTool, ComputerUse
from .version import __version__

__all__ = [
"LLM",
"LLMStream",
"AnthropicTool",
"ComputerTool",
"ComputerUse",
"ChatModels",
"logger",
"__version__",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""ComputerTool — Anthropic computer_use Toolset backed by browser PageActions."""

from __future__ import annotations

import asyncio
import base64
import logging
from typing import TYPE_CHECKING, Any

from livekit import rtc
from livekit.agents import llm

from .tools import ComputerUse

if TYPE_CHECKING:
from livekit.plugins.browser import PageActions # type: ignore[import-untyped]

logger = logging.getLogger(__name__)

# Seconds ComputerTool.execute sleeps after dispatching an action, before it
# reads the latest frame to build the screenshot it returns.
_POST_ACTION_DELAY = 0.3


class ComputerTool(llm.Toolset):
    """Toolset serving Anthropic's ``computer`` tool through browser PageActions.

    The toolset owns a single ``ComputerUse`` provider tool sized to the given
    display, and turns each computer_use action name into the matching call on
    the supplied ``PageActions`` object.

    Usage::

        from livekit.plugins.browser import PageActions

        actions = PageActions(page=page)
        tool = ComputerTool(actions=actions, width=1280, height=720)
    """

    def __init__(self, *, actions: PageActions, width: int = 1280, height: int = 720) -> None:
        """Create the toolset.

        Args:
            actions: Object that performs the actions and exposes ``last_frame``.
            width: Display width in pixels, forwarded to ``ComputerUse``.
            height: Display height in pixels, forwarded to ``ComputerUse``.
        """
        super().__init__(id="computer")
        self._actions = actions
        self._provider_tool = ComputerUse(display_width_px=width, display_height_px=height)

    @property
    def tools(self) -> list[llm.Tool]:
        """Return a one-element list holding the ``ComputerUse`` provider tool."""
        return [self._provider_tool]

    async def execute(self, action: str, **kwargs: Any) -> list[dict[str, Any]]:
        """Run one computer_use action, then return the latest frame as content.

        Args:
            action: Name of the computer_use action to perform.
            **kwargs: Action arguments (``coordinate``, ``start_coordinate``,
                ``text``, ``scroll_direction``, ``scroll_amount``, ``duration``).

        Returns:
            A list of content blocks: a base64 PNG image of the last frame, or
            a single text block when no frame has been received yet.

        Raises:
            ValueError: If ``action`` is not recognised, or a required argument
                is reported missing by the argument helpers.
        """
        page = self._actions

        if action == "screenshot":
            # Nothing to perform; the frame is captured after the delay below.
            pass
        elif action == "left_click":
            px, py = _require_coordinate(kwargs)
            # The optional ``text`` argument is passed through as the modifiers.
            await page.left_click(px, py, modifiers=kwargs.get("text"))
        elif action == "right_click":
            px, py = _require_coordinate(kwargs)
            await page.right_click(px, py)
        elif action == "double_click":
            px, py = _require_coordinate(kwargs)
            await page.double_click(px, py)
        elif action == "triple_click":
            px, py = _require_coordinate(kwargs)
            await page.triple_click(px, py)
        elif action == "middle_click":
            px, py = _require_coordinate(kwargs)
            await page.middle_click(px, py)
        elif action == "mouse_move":
            px, py = _require_coordinate(kwargs)
            await page.mouse_move(px, py)
        elif action == "left_click_drag":
            from_x, from_y = _require_coordinate(kwargs, key="start_coordinate")
            to_x, to_y = _require_coordinate(kwargs)
            await page.left_click_drag(start_x=from_x, start_y=from_y, end_x=to_x, end_y=to_y)
        elif action == "left_mouse_down":
            px, py = _require_coordinate(kwargs)
            await page.left_mouse_down(px, py)
        elif action == "left_mouse_up":
            px, py = _require_coordinate(kwargs)
            await page.left_mouse_up(px, py)
        elif action == "scroll":
            px, py = _require_coordinate(kwargs)
            await page.scroll(
                px,
                py,
                direction=kwargs.get("scroll_direction", "down"),
                amount=int(kwargs.get("scroll_amount", 3)),
            )
        elif action == "type":
            await page.type_text(_require(kwargs, "text"))
        elif action == "key":
            await page.key(_require(kwargs, "text"))
        elif action == "hold_key":
            await page.hold_key(
                _require(kwargs, "text"),
                duration=float(kwargs.get("duration", 0.5)),
            )
        elif action == "wait":
            await page.wait()
        else:
            raise ValueError(f"Unknown computer_use action: {action!r}")

        # Pause before sampling the frame that is sent back to the model.
        await asyncio.sleep(_POST_ACTION_DELAY)

        latest = page.last_frame
        if latest is not None:
            return _screenshot_content(latest)
        return [{"type": "text", "text": "(no frame available yet)"}]

    def aclose(self) -> None:
        """Forward the close request to the wrapped actions object.

        The call is made synchronously and its return value is discarded.
        """
        self._actions.aclose()


def _require(kwargs: dict[str, Any], key: str) -> Any:
    """Return ``kwargs[key]``, treating an absent key as a caller error.

    Args:
        kwargs: Arguments supplied with the action.
        key: Name of the argument that must be present.

    Returns:
        The value stored under ``key`` (any value, including ``None``).

    Raises:
        ValueError: If ``key`` is not present in ``kwargs``.
    """
    if key in kwargs:
        return kwargs[key]
    raise ValueError(f"Missing required argument: {key!r}")


def _require_coordinate(kwargs: dict[str, Any], *, key: str = "coordinate") -> tuple[int, int]:
    """Extract an ``[x, y]`` coordinate pair from the action arguments.

    Args:
        kwargs: Arguments supplied with the computer_use action.
        key: Name of the argument holding the pair.

    Returns:
        The first two elements of the value, each converted with ``int``.

    Raises:
        ValueError: If ``key`` is missing, or if its value is not an indexable
            pair whose first two elements convert to ``int``.
    """
    coord = _require(kwargs, key)

    # A string indexes without error ("12" would become (1, 2)), but it is
    # never a valid [x, y] pair, so reject it explicitly.
    if isinstance(coord, (str, bytes, bytearray)):
        raise ValueError(f"Invalid {key!r} argument: expected [x, y], got {coord!r}")

    try:
        return int(coord[0]), int(coord[1])
    except (IndexError, KeyError, TypeError, ValueError) as exc:
        # Report malformed model input the same way as a missing argument,
        # instead of leaking IndexError/TypeError to the caller.
        raise ValueError(
            f"Invalid {key!r} argument: expected [x, y], got {coord!r}"
        ) from exc


def _screenshot_content(frame: rtc.VideoFrame) -> list[dict[str, Any]]:
    """Encode ``frame`` as PNG and wrap it in a single base64 image block.

    Args:
        frame: Video frame to encode.

    Returns:
        A one-element list containing an ``image`` content block whose source
        is the base64 text of the PNG bytes with media type ``image/png``.
    """
    # Imported here, as in the original, rather than at module import time.
    from livekit.agents.utils.images import EncodeOptions, encode

    options = EncodeOptions(format="PNG")
    encoded = base64.b64encode(encode(frame, options))
    image_source: dict[str, Any] = {
        "type": "base64",
        "media_type": "image/png",
        "data": encoded.decode("utf-8"),
    }
    return [{"type": "image", "source": image_source}]
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,21 @@ def chat(

extra["max_tokens"] = self._opts.max_tokens if is_given(self._opts.max_tokens) else 1024

beta_flag: str | None = None
if tools:
extra["tools"] = llm.ToolContext(tools).parse_function_tools("anthropic")
from .tools import AnthropicTool

tool_ctx = llm.ToolContext(tools)
tool_schemas = tool_ctx.parse_function_tools("anthropic")

for tool in tool_ctx.provider_tools:
if isinstance(tool, AnthropicTool):
tool_schemas.append(tool.to_dict())
if tool.beta_flag:
beta_flag = tool.beta_flag

extra["tools"] = tool_schemas

tool_choice = (
cast(ToolChoice, tool_choice) if is_given(tool_choice) else self._opts.tool_choice
)
Expand Down Expand Up @@ -209,17 +222,27 @@ def chat(
content[-1]["cache_control"] = CACHE_CONTROL_EPHEMERAL # type: ignore
break

stream = self._client.messages.create(
messages=messages,
model=self._opts.model,
stream=True,
timeout=conn_options.timeout,
**extra,
)
if beta_flag:
stream = self._client.beta.messages.create(
betas=[beta_flag],
messages=messages, # type: ignore[arg-type]
model=self._opts.model,
stream=True,
timeout=conn_options.timeout,
**extra,
)
else:
stream = self._client.messages.create(
messages=messages,
model=self._opts.model,
stream=True,
timeout=conn_options.timeout,
**extra,
)

return LLMStream(
self,
anthropic_stream=stream,
anthropic_stream=stream, # type: ignore[arg-type]
chat_ctx=chat_ctx,
tools=tools or [],
conn_options=conn_options,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"claude-3-5-haiku-20241022",
"claude-3-7-sonnet-20250219",
"claude-sonnet-4-20250514",
"claude-sonnet-4-6",
"claude-opus-4-20250514",
"claude-opus-4-1-20250805",
"claude-opus-4-6",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any

from livekit.agents import ProviderTool


class AnthropicTool(ProviderTool, ABC):
    """Abstract provider tool that can describe itself as an Anthropic tool dict."""

    @abstractmethod
    def to_dict(self) -> dict[str, Any]:
        """Return the tool definition as a plain dict."""

    @property
    def beta_flag(self) -> str | None:
        """Beta flag this tool needs, or ``None`` when it needs none.

        The base implementation always returns ``None``; subclasses override it.
        """
        return None


# Maps a computer tool version string (the "type" sent in the tool definition)
# to the beta flag returned by ComputerUse.beta_flag. Versions not listed here
# yield no beta flag.
_TOOL_VERSION_BETA_FLAGS: dict[str, str] = {
"computer_20251124": "computer-use-2025-11-24",
"computer_20250124": "computer-use-2025-01-24",
}


@dataclass
class ComputerUse(AnthropicTool):
    """Definition of Anthropic's ``computer`` provider tool.

    The fields are serialized by ``to_dict``; ``tool_version`` also selects the
    beta flag reported by ``beta_flag``.
    """

    # Display size in pixels, emitted as display_width_px / display_height_px.
    display_width_px: int = 1280
    display_height_px: int = 720
    # Emitted as display_number in the tool definition.
    display_number: int = 1
    # Emitted as the definition's "type"; also the key for the beta flag lookup.
    tool_version: str = "computer_20251124"

    def __post_init__(self) -> None:
        """Initialise the base tool with the fixed id ``"computer"``."""
        super().__init__(id="computer")

    @property
    def beta_flag(self) -> str | None:
        """Beta flag mapped to ``tool_version``, or ``None`` if it has no entry."""
        flag = _TOOL_VERSION_BETA_FLAGS.get(self.tool_version)
        return flag

    def to_dict(self) -> dict[str, Any]:
        """Return the tool definition dict built from this instance's fields."""
        payload: dict[str, Any] = {"type": self.tool_version, "name": "computer"}
        payload["display_width_px"] = self.display_width_px
        payload["display_height_px"] = self.display_height_px
        payload["display_number"] = self.display_number
        return payload
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,19 @@
PaintData,
)

from .browser_agent import BrowserAgent
from .log import logger
from .page_actions import PageActions
from .session import BrowserSession
from .version import __version__

__all__ = [
"AudioData",
"BrowserAgent",
"BrowserContext",
"BrowserPage",
"BrowserSession",
"PageActions",
"PaintData",
]

Expand Down
Loading