
# Unity Catalog Functions: Web Tools

In [0]:
%pip install -qq unitycatalog-ai[databricks] markdownify==1.2.2

%restart_python

In [0]:
from pprint import pprint

# Notebook parameters: the catalog and schema names used further down in this
# notebook when creating the schema and registering the functions.
# Both widgets are created with an empty-string default.
dbutils.widgets.text("catalog", "")
catalog = dbutils.widgets.get("catalog")

dbutils.widgets.text("schema", "")
schema = dbutils.widgets.get("schema")

# Display the current widget values
pprint(dbutils.widgets.getAll())

In [0]:
def http_request(
    url: str,
    method: str,
    headers: str,
    data: str,
    params: str,
    timeout: int,
) -> dict[str, str]:
    """
    Executes an HTTP request to an API or web service.

    Failures (timeouts, connection errors, invalid JSON in headers/params) are
    never raised; they are reported through the returned dictionary instead.

    Args:
        url (str): The destination URL for the request.
        method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
        headers (str): Additional HTTP headers as a JSON string.
        data (str): Data for the request body. A JSON object or array is sent as a JSON body; anything else is sent as-is.
        params (str): URL query parameters as a JSON string.
        timeout (int): Request timeout in seconds. Specify 30 by default.

    Returns:
        dict[str, str]: A dictionary containing response data with the following keys:
            - success (str): Whether the request was successful.
            - status_code (str | None): HTTP status code (None if no response was received).
            - headers (str): Response headers as a JSON string.
            - content (str): Response content (JSON string or text), or an error message.
            - url (str): Final request URL.
    """
    # Imports are kept function-local, as in the original, so the function body
    # is self-contained.
    import requests
    import json

    try:
        # Empty/missing headers and params fall back to empty mappings. A
        # mapping (not a list) is required for the default headers of
        # `requests` to be merged into the outgoing request.
        _headers = json.loads(headers) if headers else {}
        _params = json.loads(params) if params else {}

        # Decide how to send the body: structured JSON (object/array) goes
        # through `json=` so it is serialized once with the proper
        # Content-Type; everything else is passed through verbatim.
        try:
            parsed_body = json.loads(data)
        except (TypeError, ValueError):
            # Not JSON (or None): treat the payload as raw body data.
            parsed_body = None

        if isinstance(parsed_body, (dict, list)):
            body_kwargs = {"json": parsed_body}
        else:
            body_kwargs = {"data": data}

        response = requests.request(
            method=method.upper(),
            url=url,
            headers=_headers,
            params=_params,
            timeout=timeout,
            **body_kwargs,
        )

        # Decode using the encoding detected from the body rather than the
        # one declared in the response headers.
        response.encoding = response.apparent_encoding

        return {
            "success": str(response.status_code < 400),
            "status_code": str(response.status_code),
            "headers": json.dumps(dict(response.headers)),
            "content": response.text,
            "url": response.url,
        }

    except requests.exceptions.Timeout as e:
        error_message = f"Request timeout error: {str(e)}"
    except requests.exceptions.RequestException as e:
        error_message = f"Request error: {str(e)}"
    except Exception as e:
        # Boundary catch-all: the caller expects a result dict, not an exception.
        error_message = f"Unexpected error: {str(e)}"

    # Shared failure result for every error path above.
    return {
        "success": str(False),
        "status_code": None,
        "headers": "{}",
        "content": error_message,
        "url": url,
    }


def fetch_url(url: str, timeout: int, offset: int, limit: int) -> dict[str, str]:
    """
    Retrieves content from the specified URL and converts HTML to Markdown format.

    This tool fetches the contents of a web page and converts it into readable Markdown text.
    Use the obtained Markdown to generate natural and useful answers to user questions.
    Please retrieve the data by gradually expanding the offset and limit ranges.

    Args:
        url (str): Target URL to fetch (valid HTTP/HTTPS URL).
        timeout (int): Request timeout in seconds. Specify 30 by default.
        offset (int): Offset lines for markdown content. Specify 0 by default.
        limit (int): Limit lines for markdown content. Specify 100 by default.

    Returns:
        dict[str, str]: Returns a dictionary with the following keys:
            - success (str): Whether the request was successful.
            - url (str): Final URL after redirection.
            - markdown_content (str): Page content converted to Markdown format, or an error message.
            - status_code (str | None): HTTP status code (None if the request itself failed).
            - content_length (str): Number of characters in the returned Markdown content.
    """
    # Imports are kept function-local, as in the original, so the function body
    # is self-contained.
    from markdownify import markdownify as md
    import requests

    try:
        response = requests.get(
            url,
            timeout=timeout,
        )

        if response.ok:
            # Decode using the encoding detected from the body.
            response.encoding = response.apparent_encoding
            lines = md(response.text).splitlines()
            # Clamp to non-negative values: a negative offset/limit would
            # otherwise be interpreted as Python negative indexing and return
            # an unrelated (usually empty) window of lines.
            start = max(offset, 0)
            count = max(limit, 0)
            markdown_content = "\n".join(lines[start:start + count])
        else:
            # Error responses carry no Markdown payload.
            markdown_content = ""

        return {
            "success": str(response.ok),
            "status_code": str(response.status_code),
            "url": str(response.url),
            "markdown_content": markdown_content,
            "content_length": str(len(markdown_content)),
        }

    except Exception as e:
        # Boundary catch-all: report the failure in the result dict instead of raising.
        error_message = f"Fetch URL error: {e!s}"
        return {
            "success": str(False),
            "status_code": None,
            "url": str(url),
            "markdown_content": error_message,
            "content_length": str(len(error_message)),
        }

# http_request(
#     "https://docs.databricks.com/aws/ja/generative-ai/agent-framework/create-custom-tool",
#     method="GET",
#     headers="",
#     data="",
#     params="",
#     timeout=30,
# )
# fetch_url(
#     "https://docs.databricks.com/aws/ja/generative-ai/agent-framework/create-custom-tool",
#     timeout=30,
#     offset=0,
#     limit=10
# )

In [0]:
from unitycatalog.ai.core.databricks import DatabricksFunctionClient

# Client used below to register the Python functions defined in this notebook.
client = DatabricksFunctionClient()
# Create the target schema if it does not exist yet, using the widget values.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")

# Register http_request under the target catalog/schema.
# NOTE(review): replace=True presumably overwrites an existing function of the
# same name so the cell can be re-run — confirm against the client docs.
client.create_python_function(
    func=http_request,
    catalog=catalog,
    schema=schema,
    replace=True,
)

# Register fetch_url; it imports markdownify, so the same pinned version that
# is installed via %pip at the top of the notebook is declared as a dependency.
client.create_python_function(
    func=fetch_url,
    catalog=catalog,
    schema=schema,
    replace=True,
    dependencies=["markdownify==1.2.2"],
)

In [0]:
# client.execute_function(
#     f"{catalog}.{schema}.http_request",
#     {
#         "url": "https://docs.databricks.com/",
#         "method": "GET",
#         "headers": "",
#         "data": "",
#         "params": "",
#         "timeout": 30,
#     },
# )

# client.execute_function(
#     f"{catalog}.{schema}.fetch_url",
#     {"url": "https://docs.databricks.com/dfasf", "timeout": 30, "offset": 0, "limit": 100},
# )