In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Browser Function Calling with the Vertex AI Gemini API & Python SDK

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/function-calling/intro_function_calling.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/function-calling/intro_function_calling.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/function-calling/intro_function_calling.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>


In [None]:
# Upgrade the Vertex AI SDK for Python so the Gemini models are available
# !pip3 install --upgrade --user google-cloud-aiplatform

In [2]:
# !pip install playwright > /dev/null
# !pip install  lxml

# If this is your first time using playwright, you'll have to install a browser executable.
# Running `playwright install` by default installs a chromium browser executable.
# !playwright install
# To also install the operating-system libraries the browsers depend on, run:
# !playwright install-deps

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, it is recommended to restart the runtime. Run the following cell to restart the current kernel.

The restart process might take a minute or so.

In [3]:
import IPython

# Fetch the running IPython application and ask its kernel to shut down with
# restart enabled, so packages installed above become importable.
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

#### We will use the pre-built LangChain tools for the Playwright browser

[Guide here](https://python.langchain.com/docs/integrations/toolkits/playwright) - most of this code is from this example.

In [1]:

import requests
from vertexai.preview.generative_models import (
    Content,
    FunctionDeclaration,
    GenerativeModel,
    Part,
    Tool,
)
from langchain.agents.agent_toolkits import PlayWrightBrowserToolkit
from langchain.tools.playwright.utils import (
    create_async_playwright_browser,  # A synchronous browser is available, though it isn't compatible with jupyter.
)
from langchain.tools import Tool as LangchainTool
# This import is required only for jupyter notebooks, since they have their own eventloop
import nest_asyncio

nest_asyncio.apply()


2023-12-14 23:37:53.529708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-14 23:37:55.289169: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-12-14 23:37:55.289443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or di

In [2]:
# Create the async Playwright browser and wrap it in the LangChain toolkit.
async_browser = create_async_playwright_browser()
toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser)

# Collect the toolkit's tools and index them by name so a function call
# returned by the model can be looked up directly.
langchain_tools = toolkit.get_tools()
tools_by_name = dict((tool.name, tool) for tool in langchain_tools)
tools_by_name

{'click_element': ClickTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/jwortz_google_com/.cache/ms-playwright/chromium-1084/chrome-linux/chrome> version=119.0.6045.9>),
 'navigate_browser': NavigateTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/jwortz_google_com/.cache/ms-playwright/chromium-1084/chrome-linux/chrome> version=119.0.6045.9>),
 'previous_webpage': NavigateBackTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/jwortz_google_com/.cache/ms-playwright/chromium-1084/chrome-linux/chrome> version=119.0.6045.9>),
 'extract_text': ExtractTextTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/jwortz_google_com/.cache/ms-playwright/chromium-1084/chrome-linux/chrome> version=119.0.6045.9>),
 'extract_hyperlinks': ExtractHyperlinksTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/jwortz_google_com/.cache/ms-playwright/chromi

#### Convert the args schema to a `FunctionDeclaration`

```python
langchain_tools[0].args_schema.schema()

{'title': 'ClickToolInput',
 'description': 'Input for ClickTool.',
 'type': 'object',
 'properties': {'selector': {'title': 'Selector',
   'description': 'CSS selector for the element to click',
   'type': 'string'}},
 'required': ['selector']}
 ```

In [4]:
from typing import Dict

def convert_function_declartion(langchain_tool: LangchainTool) -> FunctionDeclaration:
    """Convert a LangChain tool into a Vertex AI ``FunctionDeclaration``.

    The JSON schema of the tool's ``args_schema`` is reused as the function's
    parameter schema after a small clean-up: the top-level ``title`` and
    ``description`` keys are removed, each property's ``title`` and ``default``
    keys are removed, and a property-level ``type_`` key is renamed to ``type``.

    Args:
        langchain_tool (LangchainTool): The LangChain tool to convert.

    Returns:
        FunctionDeclaration: A declaration named after the tool. Its description
        is the schema's ``description`` when present, otherwise the tool name.
    """
    # Local import keeps this cell runnable without touching the imports cell.
    from copy import deepcopy

    args_schema = getattr(langchain_tool, "args_schema", None)
    if args_schema is None:
        # Tool declares no arguments: describe it as an object with no properties.
        parameters = {"type": "object", "properties": {}}
    else:
        # Work on a deep copy. The dict returned by ``schema()`` may be a cached
        # object owned by the model class (pydantic v1 caches it), so popping
        # keys in place would corrupt the schema for every later call and make
        # re-running this cell produce different descriptions.
        parameters = deepcopy(args_schema.schema())

    parameters.pop("title", None)
    description = parameters.pop("description", langchain_tool.name)

    for property_schema in parameters.get("properties", {}).values():
        property_schema.pop("title", None)
        property_schema.pop("default", None)
        if "type_" in property_schema:
            property_schema["type"] = property_schema.pop("type_")

    return FunctionDeclaration(
        name=langchain_tool.name,
        description=description,
        parameters=parameters,
    )


In [5]:
# Convert every LangChain browser tool into a function declaration, then
# bundle the declarations into a single Tool for the model.
gemini_tools = list(map(convert_function_declartion, langchain_tools))
webbrowser_tools = Tool(function_declarations=gemini_tools)

### Use the Gemini Pro model

The Gemini Pro (`gemini-pro`) model is designed to handle natural language tasks, multi-turn text and code chat, and code generation.

In [6]:
# Plain Gemini Pro model; tools are supplied per request in the next cell.
model = GenerativeModel(model_name="gemini-pro")

In [7]:
prompt = "What are the headers on langchain.com?"

# Single request with the browser tools attached and temperature 0.
response = model.generate_content(
    contents=prompt,
    tools=[webbrowser_tools],
    generation_config={"temperature": 0},
)
response

candidates {
  content {
    role: "model"
    parts {
      function_call {
        name: "navigate_browser"
        args {
          fields {
            key: "url"
            value {
              string_value: "https://langchain.com/"
            }
          }
        }
      }
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 10
  total_token_count: 10
}

##### Quick inspection of one function call

In [8]:
# Drill into the first part of the first candidate to show only the function
# call (tool name and arguments) carried by the response.
response.candidates[0].content.parts[0].function_call

name: "navigate_browser"
args {
  fields {
    key: "url"
    value {
      string_value: "https://langchain.com/"
    }
  }
}

In [9]:
from google.cloud.aiplatform_v1beta1.types.tool import FunctionCall
from vertexai.generative_models._generative_models import GenerationResponse
from typing import List

async def use_browser_tools(part: Part, tools_by_name: Dict = tools_by_name):
    """
    Execute the browser tool requested by a model response part.

    Args:
        part: A response part whose ``function_call`` names the tool to run
            and carries its arguments.
        tools_by_name: Mapping of tool name to LangChain tool. Defaults to the
            notebook-level ``tools_by_name`` as it existed when this function
            was defined.

    Returns:
        A function-response Part wrapping the tool output, or None when the
        part's function call has an empty name.
    """
    call = part.function_call
    requested_tool = call.name
    raw_args = call.args

    # Copy the arguments into a plain dict keyed by argument name.
    tool_params = {} if raw_args is None else {key: raw_args[key] for key in raw_args}

    # An empty name presumably means the part carries no function call
    # (e.g. a text part), so there is nothing to run.
    if requested_tool == '':
        return None

    tool_output = await tools_by_name[requested_tool].arun(tool_params)
    return Part.from_function_response(
        name=requested_tool,
        response={"content": {"response": tool_output}},
    )


def get_parts_from_response(response: GenerationResponse) -> Content:
    """
    Get the content of the first candidate in a generation response.

    Despite the function name, this returns the whole content object rather
    than a list; callers read its ``.parts`` attribute to iterate the parts.

    Args:
        response: The generation response.

    Returns:
        The ``content`` of ``response.candidates[0]``.

    Raises:
        IndexError: Presumably raised when the response has no candidates,
            since the first candidate is indexed unconditionally.
    """
    return response.candidates[0].content

#### We will now do a multi-turn chat
An example can be found in the [function calling documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/function-calling#function-calling-one-and-a-half-turn-curl-sample).

In [10]:
# Model configured once with temperature 0 and the browser tools, so every
# message in the chat session can trigger a function call.
model = GenerativeModel(
    model_name="gemini-pro",
    tools=[webbrowser_tools],
    generation_config={"temperature": 0},
)
chat = model.start_chat()

In [16]:
from pprint import pprint
response = chat.send_message("What are the headers (h1 and h2) on langchain.com?")
pprint(response)
function_calls = get_parts_from_response(response)
function_calls = function_calls.parts

# function_calls

tool_commands = [await use_browser_tools(fn_call) for fn_call in function_calls]
print(tool_commands)
if tool_commands[0] is None:
    pass #using this as a generic way to stop when there are no tool responses - looking for better way TODO
else:
    pprint([chat.send_message(command) for command in tool_commands])    

candidates {
  content {
    role: "model"
    parts {
      text: "Yes, that\'s correct. The h1 header on langchain.com is \"LangChain\", and the h2 headers are \"A complete set of powerful building blocks\", \"Smart connections to any source of data or knowledge\", \"Easily-swappable components\", \"A developer platform optimized for production-readiness\", \"One framework.\\nInfinite use-cases.\", \"Harness the power\342\200\223and wrangle the complexity\342\200\223 of LLMs\", \"Thanks to our contributors and partners for making LangChain awesome\", and \"Ready to build?\"."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_