## Web Voyager

[WebVoyager](https://arxiv.org/abs/2401.13919) by He et al. is a vision-enabled web-browsing agent capable of controlling the mouse and keyboard.

It works by viewing annotated browser screenshots for each turn, then choosing the next step to take. The agent architecture is a basic reasoning and action (ReAct) loop. 
The unique aspects of this agent are:
- Its use of [Set-of-Marks](https://som-gpt4v.github.io/)-like image annotations to serve as UI affordances for the agent
- Its application in the browser, using tools to control both the mouse and keyboard

The overall design looks like the following:

<!-- ![Voyager Image](./img/web-voyager.excalidraw.png) -->
<img src="./img/web-voyager.excalidraw.png" width="50%">


## Configure environment

We will first set up LangSmith tracing. Though optional, this lets us inspect and debug the agent's trajectory for a given input.

You can sign up at [smith.langchain.com](https://smith.langchain.com/) to get an API key.

In [1]:
// Environment variables set for this notebook session.
// Replace the "key here" placeholders with real keys before running.
const envSettings: Record<string, string> = {
    OPENAI_API_KEY: "key here",
    LANGCHAIN_API_KEY: "key here",
    LANGCHAIN_TRACING_V2: "true",
};

for (const [name, value] of Object.entries(envSettings)) {
    Deno.env.set(name, value);
}

In [2]:
import { BaseMessage, SystemMessage } from '@langchain/core/messages'
import { Page } from 'npm:playwright-core'

/**
 * One annotated element on the page, as returned by the in-page `markPage()` call.
 * Its index in `AgentState.bboxes` is the numeric label the agent refers to.
 */
type BBox = {
    // Page coordinates handed to `page.mouse.click` / `page.mouse.move` by the tools.
    x: number
    y: number
    // Element text; used as the description when `ariaLabel` is blank.
    text: string
    // Element type, rendered as `<type/>` in the bounding-box descriptions.
    type: string
    // ARIA label; preferred over `text` when it is non-blank.
    ariaLabel: string
}

/**
 * The action chosen by the model for the current turn, as produced by `parse`.
 */
type Prediction = {
    // Tool name (e.g. "Click", "Type"), or "ANSWER" / "retry" as handled by `selectTool`.
    action: string
    // Arguments for the action; absent when the action takes none.
    args?: string[]
}

/**
 * State passed between the nodes of the agent graph.
 */
type AgentState = {
    // Browser page the tools act on.
    page: Page
    // The user's question, supplied when the graph is invoked.
    input: string
    // Base64-encoded screenshot of the annotated page.
    img: string
    // Bounding boxes found on the current page, indexed by their label.
    bboxes: BBox[]
    // The model's most recent parsed action.
    prediction: Prediction
    // Running log of previous observations, rebuilt by `updateScratchpad` as a single system message.
    scratchpad: BaseMessage[]
    // Result string of the most recently executed tool.
    observation: string
}

/**
 * Channel definitions passed to `StateGraph`: one entry per `AgentState`
 * field, each starting out as `{ value: null }`.
 */
const agentState = (() => {
    // Every channel has the same initial shape; a factory keeps each one a distinct object.
    const emptyChannel = () => ({ value: null });

    return {
        page: emptyChannel(),
        input: emptyChannel(),
        img: emptyChannel(),
        bboxes: emptyChannel(),
        prediction: emptyChannel(),
        scratchpad: emptyChannel(),
        observation: emptyChannel(),
    };
})();
    

In [3]:
/**
 * Tool: click the element behind a labelled bounding box.
 *
 * Expects `state.prediction.args` to hold exactly one entry, the numeric
 * bounding-box label. Every outcome (success, bad arguments, unknown label,
 * click failure) is reported as a string rather than thrown.
 *
 * @param state Current agent state (page, bboxes and the parsed prediction).
 * @returns A message describing what happened.
 */
async function click(state: AgentState): Promise<string> {
  const args = state.prediction.args;
  if (args?.length !== 1) {
    return `Failed to click bounding box labeled as number ${args}`;
  }

  // The label is the index of the box in `state.bboxes`.
  const label = parseInt(args[0], 10);
  const target = Number.isNaN(label) ? undefined : state.bboxes[label];
  if (!target) {
    return `Error: no bbox for: ${label}`;
  }

  try {
    await state.page.mouse.click(target.x, target.y);
    // TODO: In the paper, they automatically parse any downloaded PDFs
    // We could add something similar here as well and generally
    // improve response format.
    return `Clicked ${label}`;
  } catch (err) {
    return `Failed to click due to an error: ${err}`;
  }
}

/**
 * Tool: scroll either the whole window or the element under a labelled
 * bounding box.
 *
 * Expects `state.prediction.args` to be `[target, direction]`:
 * - `target` is "WINDOW" (case-insensitive) or a numeric bounding-box label;
 * - `direction` is "up" (case-insensitive) to scroll up; anything else scrolls down.
 *
 * Like `click` and `typeText`, every outcome is reported as a string: bad
 * arguments, an unknown bounding-box label and browser errors no longer throw.
 *
 * @param state Current agent state (page, bboxes and the parsed prediction).
 * @returns A message describing what happened.
 */
async function scroll(state: AgentState): Promise<string> {
  const page = state.page;
  const scrollArgs = state.prediction.args;

  if (!scrollArgs || scrollArgs.length !== 2) {
    return "Failed to scroll due to incorrect arguments.";
  }

  const [target, direction] = scrollArgs;
  const isWindow = target.toUpperCase() === "WINDOW";
  // Negative deltas scroll up, positive deltas scroll down.
  const sign = direction.toLowerCase() === "up" ? -1 : 1;

  try {
    if (isWindow) {
      const scrollAmount = 500;
      await page.evaluate(`window.scrollBy(0, ${sign * scrollAmount})`);
    } else {
      const scrollAmount = 200;
      const targetId = parseInt(target, 10);
      // Validate the label before touching the box; an unknown label used to
      // crash on destructuring `undefined`.
      const bbox = Number.isNaN(targetId) ? undefined : state.bboxes[targetId];
      if (!bbox) {
        return `Error: no bbox for: ${targetId}`;
      }
      // Hover the element first so the wheel event is delivered to it.
      await page.mouse.move(bbox.x, bbox.y);
      await page.mouse.wheel(0, sign * scrollAmount);
    }
  } catch (error) {
    return `Failed to scroll due to an error: ${error}`;
  }

  return `Scrolled ${direction} in ${isWindow ? "window" : "element"}`;
}

/**
 * Tool: replace the contents of a labelled input with new text and submit it.
 *
 * Expects `state.prediction.args` to be `[label, text]`. The element is
 * clicked, its current contents are selected and deleted, the text is typed,
 * and Enter is pressed. Every outcome is reported as a string rather than thrown.
 *
 * @param state Current agent state (page, bboxes and the parsed prediction).
 * @returns A message describing what happened.
 */
async function typeText(state: AgentState): Promise<string> {
  const args = state.prediction.args;
  if (args?.length !== 2) {
    return `Failed to type in element from bounding box labeled as number ${args}`;
  }

  const [labelText, content] = args;
  // The label is the index of the box in `state.bboxes`.
  const label = parseInt(labelText, 10);
  const target = Number.isNaN(label) ? undefined : state.bboxes[label];
  if (!target) {
    return `Error: no bbox for: ${label}`;
  }

  try {
    const page = state.page;
    // Focus the element before sending any keystrokes.
    await page.mouse.click(target.x, target.y);

    // Select-all uses the Meta key on macOS ("darwin") and Control elsewhere.
    let selectAll = "Control+A";
    if (Deno.build.os === "darwin") {
      selectAll = "Meta+A";
    }

    // Clear whatever is already in the field, then type and submit.
    for (const key of [selectAll, "Backspace"]) {
      await page.keyboard.press(key);
    }
    await page.keyboard.type(content);
    await page.keyboard.press("Enter");

    return `Typed ${content} and submitted`;
  } catch (err) {
    return `Failed to type due to an error: ${err}`;
  }
}

/**
 * Pause for a fixed five seconds.
 *
 * Used both as the "Wait" tool and as the back-off between `markPage()`
 * retries. It takes no parameters and now resolves to an observation string,
 * so the scratchpad no longer records "undefined" for a wait step; callers
 * that ignore the resolved value are unaffected.
 *
 * @returns A message stating how long the agent waited.
 */
async function wait(): Promise<string> {
  const sleepTime = 5; // seconds

  // setTimeout takes milliseconds.
  await new Promise<void>((resolve) => setTimeout(resolve, sleepTime * 1000));
  return `Waited for ${sleepTime}s.`;
}

/**
 * Tool: navigate the browser back one page.
 *
 * Resolves to an observation naming the URL the page ended up on, so the
 * scratchpad records something meaningful (the previous version computed the
 * URL but returned nothing).
 *
 * @param state Current agent state; only `state.page` is used.
 * @returns A message naming the page navigated back to.
 */
async function goBack(state: AgentState): Promise<string> {
  const page = state.page;
  await page.goBack();
  return `Navigated back a page to ${page.url()}.`;
}

/**
 * Tool: navigate the browser to the Google home page.
 *
 * Resolves to an observation string so the scratchpad records the step
 * (the previous version returned nothing).
 *
 * @param state Current agent state; only `state.page` is used.
 * @returns A message confirming the navigation.
 */
async function toGoogle(state: AgentState): Promise<string> {
  const page = state.page;
  await page.goto("https://www.google.com/");
  return "Navigated to google.com.";
}

In [4]:
import { RunnableLambda } from "@langchain/core/runnables"
import {encode, decode} from 'npm:uint8-to-base64';

// Source text of the annotation script, read from ./annotate.js (relative path).
// `_markPage` evaluates it inside the browser page.
const markPageScript = await Deno.readTextFile("./annotate.js");

/**
 * Annotate the page, capture a screenshot of the annotated view, then remove
 * the annotations again.
 *
 * @param page Browser page to annotate.
 * @returns The base64-encoded screenshot and the bounding boxes reported by
 *          the in-page `markPage()` call (empty if every attempt failed).
 */
const _markPage = async (page: Page): Promise<{ img: string; bboxes: any[] }> => {
    console.log("marking page", page);
    // Evaluate the annotation script in the page; presumably this is what
    // defines `markPage()` / `unmarkPage()` there.
    await page.evaluate(markPageScript);

    // Up to ten attempts: a failure is taken to mean the page is still
    // loading, so pause and try again.
    let boxes = [];
    let attemptsLeft = 10;
    while (attemptsLeft-- > 0) {
        try {
            boxes = await page.evaluate(`markPage()`);
            break;
        } catch {
            await wait();
        }
    }

    // Screenshot while the markings are visible, encoded as base64.
    const screenshot = await page.screenshot();
    const img = encode(screenshot);

    // Leave the page as it was found.
    await page.evaluate(`unmarkPage()`);

    return { img, bboxes: boxes };
};

// Runnable wrapper so callers can use runnable helpers such as `withRetry()`.
const markPage = RunnableLambda.from(_markPage)

We'll also need some code to annotate the webpages:

In [5]:
/**
 * Graph step: annotate the current page and merge the resulting screenshot
 * and bounding boxes into the state.
 *
 * @param state Current agent state; `state.page` is the page to annotate.
 * @returns A copy of `state` with the marked-page fields overriding any
 *          existing fields of the same name.
 */
async function annotate(state: AgentState) {
    const withRetries = markPage.withRetry();
    const marked = await withRetries.invoke(state.page);
    return Object.assign({}, state, marked);
}

/**
 * Build the textual list of bounding boxes shown to the model.
 *
 * Each box is rendered as `<index> (<type/>): "<label>"`, where the label is
 * the box's ARIA label or, when that is missing or blank, its text.
 *
 * The signature is generic over the incoming state because the original
 * annotation referred to an undeclared `State` type and did not mention the
 * added `bbox_descriptions` field. Any state with a `bboxes` array is accepted.
 *
 * @param state State containing the bounding boxes to describe.
 * @returns A copy of `state` with `bbox_descriptions` added.
 */
function formatDescriptions<
  S extends { bboxes: { ariaLabel?: string | null; text?: string | null; type: string }[] },
>(state: S): S & { bbox_descriptions: string } {
  const labels = state.bboxes.map((bbox, i) => {
    const ariaLabel = bbox.ariaLabel ?? "";
    // A whitespace-only ARIA label counts as absent; a missing text falls
    // back to "" instead of the literal "undefined".
    const text = ariaLabel.trim() ? ariaLabel : (bbox.text ?? "");
    return `${i} (<${bbox.type}/>): "${text}"`;
  });
  const bboxDescriptions = `Valid Bounding Boxes:\n${labels.join("\n")}`;
  return { ...state, bbox_descriptions: bboxDescriptions };
}

/**
 * Parse the model's raw reply into an action and its arguments.
 *
 * Only the last line of the reply is inspected. It must look like
 * `Action: <Name>` or `Action: <Name> <arg>; <arg>; ...`. Arguments are
 * separated by ";" and one pair of surrounding square brackets is stripped
 * from each, so `Action: Type [7]; some text` yields
 * `{ action: "Type", args: ["7", "some text"] }`.
 *
 * Fixes over the previous version:
 * - the action name is split off at the FIRST space only. JavaScript's
 *   `split(" ", 2)` discards everything after the second piece, which
 *   silently dropped most of the text to type;
 * - the "retry" result carries its message inside an array, matching the
 *   declared `string[] | null` type;
 * - leftover debug logging is removed.
 *
 * @param text Full text returned by the model.
 * @returns The action name and arguments (`null` when there are none), or
 *          `{ action: "retry", args: [message] }` when the last line is not
 *          an action line.
 */
function parse(text: string): { action: string; args: string[] | null } {
  const actionPrefix = "Action: ";
  const lastLine = text.trim().split("\n").pop() ?? "";

  if (!lastLine.startsWith(actionPrefix)) {
    return { action: "retry", args: [`Could not parse LLM Output: ${text}`] };
  }

  const actionBlock = lastLine.substring(actionPrefix.length);
  const firstSpace = actionBlock.indexOf(" ");
  if (firstSpace === -1) {
    // Bare action with no arguments, e.g. "Wait".
    return { action: actionBlock.trim(), args: null };
  }

  const action = actionBlock.slice(0, firstSpace).trim();
  const args = actionBlock
    .slice(firstSpace + 1)
    .trim()
    .split(";")
    .map((inp) => inp.trim().replace(/^\[|\]$/g, ""));

  return { action, args };
}


In [19]:
// Exploratory variant of `parse`, kept with extra logging to inspect how the
// action line is tokenised. Unlike `parse`, it splits the action block on "; "
// instead of on a space, so the action name keeps its bracketed label
// (e.g. "Type [7]") and only the piece after the first "; " becomes args,
// as the recorded output of the call below shows.
// NOTE(review): the "retry" branch returns `args` as a plain string although
// the declared return type is `string[] | null`.
function parse1(text: string): { action: string; args: string[] | null } {
  console.log("The whole output:", text);
  const actionPrefix = "Action: ";
  const lines = text.trim().split("\n");
  // Only the last line of the reply is inspected.
  const lastLine = lines[lines.length - 1];

  if (!lastLine.startsWith(actionPrefix)) {
    return { action: "retry", args: `Could not parse LLM Output: ${text}` };
  }

  const actionBlock = lastLine.substring(actionPrefix.length);
  const splitOutput = actionBlock.split("; ");

  let action: string;
  let actionInput: string[] | null = null;

  if (splitOutput.length === 1) {
    action = splitOutput[0];
  } else {
    action = splitOutput[0];
    // Only splitOutput[1] is used; any further "; "-separated pieces are ignored.
    actionInput = splitOutput[1].split(";").map(inp => {
        const trimmed = inp.trim()
        // Strip one leading "[" and one trailing "]".
        const replaced = trimmed.replace(/^\[|\]$/g, '')
        console.log('I, T, R', splitOutput, inp, trimmed, replaced);
        return replaced;
    });
  }

  action = action.trim();

  console.log('now:', action, actionInput);

  return { action, args: actionInput };
}

// Sample reply used to exercise parse1; its output is recorded below.
console.log(parse1(`
Thought: The user is asking for an explanation about a specific paper that can be found on arXiv. I need to search for the "WebVoyager paper" on Google.

Action: Type [7]; WebVoyager paper arxiv`))


The whole output: 
Thought: The user is asking for an explanation about a specific paper that can be found on arXiv. I need to search for the "WebVoyager paper" on Google.

Action: Type [7]; WebVoyager paper arxiv
I, T, R [ [32m"Type [7]"[39m, [32m"WebVoyager paper arxiv"[39m ] WebVoyager paper arxiv WebVoyager paper arxiv WebVoyager paper arxiv
now: Type [7] [ [32m"WebVoyager paper arxiv"[39m ]
{ action: [32m"Type [7]"[39m, args: [ [32m"WebVoyager paper arxiv"[39m ] }


In [6]:
import { ChatOpenAI } from "@langchain/openai";
import { pull } from "langchain/hub";
import { RunnablePassthrough, RunnableSequence, RunnableLambda } from "@langchain/core/runnables"
import { StringOutputParser } from "@langchain/core/output_parsers";

// Prompt pulled from the hub under the name "wfh/web-voyager".
const prompt = await pull("wfh/web-voyager")

// Vision-capable chat model that reads the annotated screenshot.
const llm = new ChatOpenAI({ modelName: "gpt-4-vision-preview", maxTokens: 4096 })

// Describe the bounding boxes, prompt the model, and parse its reply into an action.
const predictionChain = RunnableSequence.from([
    formatDescriptions,
    prompt,
    llm,
    new StringOutputParser(),
    parse
])

// One agent turn: annotate the page, then store the parsed action under `prediction`.
const agent = RunnableSequence.from([
    annotate,
    RunnablePassthrough.assign({ prediction: predictionChain })
])


Instead, please import from "@langchain/core/utils/testing".

This will be mandatory after the next "langchain" minor version bump to 0.2.


In [7]:
/**
 * Append the latest tool observation to the scratchpad so the agent can see
 * its previous steps.
 *
 * The scratchpad is kept as a single system message whose text is a numbered
 * list. The next step number is read from the leading digits of the last
 * line of the existing text; an empty scratchpad starts a fresh list at 1.
 *
 * @param state Current agent state (`scratchpad` and `observation` are read).
 * @returns A copy of `state` with the scratchpad replaced.
 * @throws Error if the last line of an existing scratchpad does not start with a number.
 */
function updateScratchpad(state: AgentState): AgentState {
  const previous = state.scratchpad;

  // Defaults for a run with no history yet.
  let text = "Previous action observations:\n";
  let stepNumber = 1;

  if (previous && previous.length > 0) {
    text = previous[0].content as string;
    const finalLine = text.split("\n").pop()!;
    const digits = /^\d+/.exec(finalLine);
    if (digits === null) {
      throw new Error("Failed to parse step number from last line");
    }
    stepNumber = parseInt(digits[0], 10) + 1;
  }

  const updated = `${text}\n${stepNumber}. ${state.observation}`;

  return {
    ...state,
    scratchpad: [new SystemMessage({content: updated})],
  };
}

In [8]:
import { RunnableLambda } from "@langchain/core/runnables"
import { END, StateGraph } from "@langchain/langgraph"

// Build the agent graph over the channels declared in `agentState`.
const graphBuilder = new StateGraph<AgentState>({channels: agentState})

// Every run starts at the agent node.
graphBuilder.addNode("agent", agent)
graphBuilder.setEntryPoint("agent")

// There is deliberately no unconditional "agent" -> END edge here. Routing out
// of "agent" is decided only by the conditional edges driven by `selectTool`,
// which returns END once the model answers. An unconditional edge to END would
// compete with that routing and can finish the run after the first agent step.

// After each tool runs, fold its observation into the scratchpad and hand
// control back to the agent.
graphBuilder.addNode("updateScratchpad", updateScratchpad)
graphBuilder.addEdge("updateScratchpad", "agent")

// Maps each action name the model may emit to the function that performs it.
const tools = {
    Click: click,
    Type: typeText,
    Scroll: scroll,
    Wait: wait,
    GoBack: goBack,
    Google: toGoogle,
}

// Register one graph node per tool. Each node wraps the tool's return value
// as `{ observation }` and then flows into "updateScratchpad".
for (const [name, tool] of Object.entries(tools)) {
    const toolNode = RunnableLambda.from(tool).pipe((obs) => ({"observation": obs}));
    graphBuilder.addNode(name, toolNode);
    graphBuilder.addEdge(name, "updateScratchpad");
}

/**
 * Routing function for the conditional edges out of "agent".
 *
 * @param state Current agent state; only `state.prediction.action` is read.
 * @returns END for "ANSWER", "agent" for "retry", otherwise the action name
 *          itself, which is used as the name of the next node.
 */
function selectTool(state: AgentState): string {
  const chosen = state.prediction.action;

  switch (chosen) {
    case "ANSWER":
      return END;
    case "retry":
      return "agent";
    default:
      return chosen;
  }
}

// After each agent turn, `selectTool` picks the next node (a tool, "agent" again, or END).
graphBuilder.addConditionalEdges("agent", selectTool)

// Compile the builder into the runnable graph that `callAgent` streams from.
const graph = graphBuilder.compile()

In [9]:
import { launch } from "https://deno.land/x/astral/mod.ts";

// Launch a visible (non-headless) browser and open Google as the starting page.
// NOTE(review): the `Page` type used in the annotations above is imported from
// playwright-core, while this page is created by astral's `launch`. Confirm
// that astral's page supports every call the tools make (mouse, keyboard,
// evaluate, screenshot, goBack, goto, url).
const browser = await launch({ headless: false });
const page = await browser.newPage("https://google.com");

In [10]:
import { display } from "https://deno.land/x/display@v0.1.1/mod.ts";

/**
 * Run the agent graph on a question and return its final answer.
 *
 * Streams graph events, and for every "agent" event logs the predicted
 * action, prints the running list of steps and displays the annotated
 * screenshot. It stops at the first action containing "ANSWER".
 *
 * The return type is now `string | null` because `null` was already returned
 * when no answer was produced, and a missing or empty `args` on the ANSWER
 * action no longer throws.
 *
 * @param question The user's question, passed to the graph as `input`.
 * @param page Browser page the agent controls.
 * @param maxSteps Passed to the graph as `recursionLimit` (default 150).
 * @returns The first argument of the ANSWER action, or `null` if the stream
 *          ended without an answer or the answer carried no arguments.
 */
async function callAgent(question: string, page: Page, maxSteps: number = 150): Promise<string | null> {
    const eventStream = await graph.stream({
        page,
        input: question,
        scratchpad: [],
    }, {
        recursionLimit: maxSteps,
    });

    let finalAnswer: string | null = null;
    const steps: string[] = [];
    for await (const event of eventStream) {
        // Only agent turns carry a prediction and a screenshot.
        if (!("agent" in event)) {
            continue;
        }
        const pred = event.agent.prediction ?? {};
        console.log(pred);
        const action = pred.action;
        const actionInput = pred.args;

        steps.push(`${steps.length + 1}. ${action}: ${actionInput}`);
        console.log(steps.join("\n"));

        // Show the annotated screenshot the model saw on this turn.
        await display({
            'image/png': decode(event.agent.img)
        }, {raw: true})
        if (action && action.includes("ANSWER")) {
            // `args` may be null or undefined when the model gave no content.
            finalAnswer = actionInput?.[0] ?? null;
            break;
        }
    }
    return finalAnswer;
}

In [11]:
// Ask the agent a question and print its final answer (null if it never answered).
// `res` is declared here; the original assigned to an undeclared name.
const res = await callAgent("Could you explain the WebVoyager paper (on arxiv)?", page)
console.log(res)

marking page Page {}
object []
allo [
  ChatPromptValue {
    lc_serializable: [33mtrue[39m,
    lc_kwargs: {
      messages: [
        SystemMessage {
          lc_serializable: [33mtrue[39m,
          lc_kwargs: [36m[Object][39m,
          lc_namespace: [36m[Array][39m,
          content: [36m[Array][39m,
          name: [90mundefined[39m,
          additional_kwargs: {}
        },
        HumanMessage {
          lc_serializable: [33mtrue[39m,
          lc_kwargs: [36m[Object][39m,
          lc_namespace: [36m[Array][39m,
          content: [36m[Array][39m,
          name: [90mundefined[39m,
          additional_kwargs: {}
        }
      ]
    },
    lc_namespace: [ [32m"langchain_core"[39m, [32m"prompt_values"[39m ],
    messages: [
      SystemMessage {
        lc_serializable: [33mtrue[39m,
        lc_kwargs: { content: [36m[Array][39m, additional_kwargs: {} },
        lc_namespace: [ [32m"langchain_core"[39m, [32m"messages"[39m ],
        conte

TypeError: Non-error was thrown: "[object Object]". You should only throw errors.