From 28d4ac5c14d04e7a2809ee47b78951264692a228 Mon Sep 17 00:00:00 2001 From: Chase Lean Date: Wed, 1 Oct 2025 20:35:39 +0200 Subject: [PATCH 1/2] add langchain-scraperapi --- .../integrations/providers/scraperapi.mdx | 36 ++++ .../python/integrations/tools/scraperapi.mdx | 197 ++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 src/oss/python/integrations/providers/scraperapi.mdx create mode 100644 src/oss/python/integrations/tools/scraperapi.mdx diff --git a/src/oss/python/integrations/providers/scraperapi.mdx b/src/oss/python/integrations/providers/scraperapi.mdx new file mode 100644 index 000000000..722e3eb08 --- /dev/null +++ b/src/oss/python/integrations/providers/scraperapi.mdx @@ -0,0 +1,36 @@ +--- +title: ScraperAPI +--- + +[ScraperAPI](https://www.scraperapi.com/) enables data collection from any public website with its web scraping API, without worrying about proxies, browsers, or CAPTCHA handling. [langchain-scraperapi](https://github.com/scraperapi/langchain-scraperapi) wraps this service, making it easy for AI agents to browse the web and scrape data from it. + +## Installation and Setup + + +```bash pip +pip install langchain-scraperapi +``` + +```bash uv +uv add langchain-scraperapi +``` + + +Get an API key from [ScraperAPI](https://www.scraperapi.com/) and set it as an environment variable: + +```python +import os + +os.environ["SCRAPERAPI_API_KEY"] = "your-api-key" +``` + +## Tools + +The package offers 3 tools to scrape any website, get structured Google search results, and get structured Amazon search results respectively. + +See detailed documentation for each tool: +- [ScraperAPITool](/oss/integrations/tools/scraperapi) - Browse and scrape any website +- ScraperAPIGoogleSearchTool - Get structured Google Search SERP data +- ScraperAPIAmazonSearchTool - Get structured Amazon product search data + +For a more detailed walkthrough, see the [official repository](https://github.com/scraperapi/langchain-scraperapi). 
diff --git a/src/oss/python/integrations/tools/scraperapi.mdx b/src/oss/python/integrations/tools/scraperapi.mdx new file mode 100644 index 000000000..a4b53ead1 --- /dev/null +++ b/src/oss/python/integrations/tools/scraperapi.mdx @@ -0,0 +1,197 @@ +--- +title: ScraperAPI +--- + +Give your AI agent the ability to browse websites and search Google and Amazon in just two lines of code. + +The `langchain-scraperapi` package adds three ready-to-use LangChain tools backed by the [ScraperAPI](https://www.scraperapi.com/) service: + +| Tool class | Use it to | +|------------|------------------| +| `ScraperAPITool` | Grab the HTML/text/markdown of any web page | +| `ScraperAPIGoogleSearchTool` | Get structured Google Search SERP data | +| `ScraperAPIAmazonSearchTool` | Get structured Amazon product-search data | + +## Overview + +### Integration details + +| Package | Serializable | JS support | Package latest | +| :--- | :---: | :---: | :---: | +| [langchain-scraperapi](https://pypi.org/project/langchain-scraperapi/) | ❌ | ❌ | ![PyPI - Version](https://img.shields.io/pypi/v/langchain-scraperapi?style=flat-square&label=%20) | + +## Setup + +Install the `langchain-scraperapi` package: + +```python +%pip install -qU langchain-scraperapi +``` + +### Credentials + +Create an account at [ScraperAPI](https://www.scraperapi.com/) and get an API key: + +```python +import os + +if not os.environ.get("SCRAPERAPI_API_KEY"): + os.environ["SCRAPERAPI_API_KEY"] = "your-api-key" +``` + +## Instantiation + +```python +from langchain_scraperapi.tools import ScraperAPITool + +tool = ScraperAPITool() +``` + +## Invocation + +### Invoke directly with args + +```python +output = tool.invoke( + { + "url": "https://langchain.com", + "output_format": "markdown", + "render": True, + } +) +print(output) +``` + +## Features + +### 1. `ScraperAPITool` — browse any website + +Invoke the raw ScraperAPI endpoint and get HTML, rendered DOM, text, or markdown. 
+ +**Invocation arguments:** + +* **`url`** **(required)** – target page URL +* **Optional (mirror ScraperAPI query params):** + * `output_format`: `"text"` | `"markdown"` (default returns raw HTML) + * `country_code`: e.g. `"us"`, `"de"` + * `device_type`: `"desktop"` | `"mobile"` + * `premium`: `bool` – use premium proxies + * `render`: `bool` – run JS before returning HTML + * `keep_headers`: `bool` – include response headers + +For the complete set of modifiers see the [ScraperAPI request-customisation docs](https://docs.scraperapi.com/python/making-requests/customizing-requests). + +```python +from langchain_scraperapi.tools import ScraperAPITool + +tool = ScraperAPITool() + +html_text = tool.invoke( + { + "url": "https://langchain.com", + "output_format": "markdown", + "render": True, + } +) +print(html_text[:300], "…") +``` + +### 2. `ScraperAPIGoogleSearchTool` — structured Google Search + +Structured SERP data via `/structured/google/search`. + +**Invocation arguments:** + +* **`query`** **(required)** – natural-language search string +* **Optional:** `country_code`, `tld`, `uule`, `hl`, `gl`, `ie`, `oe`, `start`, `num` +* `output_format`: `"json"` (default) or `"csv"` + +```python +from langchain_scraperapi.tools import ScraperAPIGoogleSearchTool + +google_search = ScraperAPIGoogleSearchTool() + +results = google_search.invoke( + { + "query": "what is langchain", + "num": 20, + "output_format": "json", + } +) +print(results) +``` + +### 3. `ScraperAPIAmazonSearchTool` — structured Amazon Search + +Structured product results via `/structured/amazon/search`. 
+ +**Invocation arguments:** + +* **`query`** **(required)** – product search terms +* **Optional:** `country_code`, `tld`, `page` +* `output_format`: `"json"` (default) or `"csv"` + +```python +from langchain_scraperapi.tools import ScraperAPIAmazonSearchTool + +amazon_search = ScraperAPIAmazonSearchTool() + +products = amazon_search.invoke( + { + "query": "noise cancelling headphones", + "tld": "co.uk", + "page": 2, + } +) +print(products) +``` + +## Use within an agent + +Here is an example of using the tools in an AI agent. The `ScraperAPITool` gives the AI the ability to browse any website, summarize articles, and click on links to navigate between pages. + +```python +%pip install -qU langchain-openai +``` + +```python +import os + +from langchain.agents import AgentExecutor, create_tool_calling_agent +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_openai import ChatOpenAI +from langchain_scraperapi.tools import ScraperAPITool + +os.environ["SCRAPERAPI_API_KEY"] = "your-api-key" +os.environ["OPENAI_API_KEY"] = "your-api-key" + +tools = [ScraperAPITool(output_format="markdown")] +llm = ChatOpenAI(model_name="gpt-4o", temperature=0) + +prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a helpful assistant that can browse websites for users. 
When asked to browse a website or a link, do so with the ScraperAPITool, then provide information based on the website based on the user's needs.", + ), + ("human", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] +) + +agent = create_tool_calling_agent(llm, tools, prompt) +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) +response = agent_executor.invoke( + {"input": "can you browse hacker news and summarize the first website"} +) +``` + +## API reference + +Below you can find more information on additional parameters to the tools to customize your requests: + +* [ScraperAPITool](https://docs.scraperapi.com/python/making-requests/customizing-requests) +* [ScraperAPIGoogleSearchTool](https://docs.scraperapi.com/python/make-requests-with-scraperapi-in-python/scraperapi-structured-data-collection-in-python/google-serp-api-structured-data-in-python) +* [ScraperAPIAmazonSearchTool](https://docs.scraperapi.com/python/make-requests-with-scraperapi-in-python/scraperapi-structured-data-collection-in-python/amazon-search-api-structured-data-in-python) + +The LangChain wrappers surface these parameters directly. From bdd4cb19b6a43539e8f092463e1e8833c25501c2 Mon Sep 17 00:00:00 2001 From: Chase Lean Date: Fri, 3 Oct 2025 11:33:28 +0200 Subject: [PATCH 2/2] Updates docs to use new agent format --- .../python/integrations/tools/scraperapi.mdx | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/oss/python/integrations/tools/scraperapi.mdx b/src/oss/python/integrations/tools/scraperapi.mdx index a4b53ead1..0973dabc5 100644 --- a/src/oss/python/integrations/tools/scraperapi.mdx +++ b/src/oss/python/integrations/tools/scraperapi.mdx @@ -151,14 +151,13 @@ print(products) Here is an example of using the tools in an AI agent. The `ScraperAPITool` gives the AI the ability to browse any website, summarize articles, and click on links to navigate between pages. 
```python -%pip install -qU langchain-openai +%pip install -qU langchain-openai langchain ``` ```python import os -from langchain.agents import AgentExecutor, create_tool_calling_agent -from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain.agents import create_agent from langchain_openai import ChatOpenAI from langchain_scraperapi.tools import ScraperAPITool @@ -166,24 +165,18 @@ os.environ["SCRAPERAPI_API_KEY"] = "your-api-key" os.environ["OPENAI_API_KEY"] = "your-api-key" tools = [ScraperAPITool(output_format="markdown")] -llm = ChatOpenAI(model_name="gpt-4o", temperature=0) - -prompt = ChatPromptTemplate.from_messages( - [ - ( - "system", - "You are a helpful assistant that can browse websites for users. When asked to browse a website or a link, do so with the ScraperAPITool, then provide information based on the website based on the user's needs.", - ), - ("human", "{input}"), - MessagesPlaceholder(variable_name="agent_scratchpad"), - ] +llm = ChatOpenAI(model="gpt-4o", temperature=0) + +agent = create_agent( + model=llm, + tools=tools, + prompt="You are a helpful assistant that can browse websites for users. When asked to browse a website or a link, do so with the ScraperAPITool, then provide information based on the website based on the user's needs.", ) -agent = create_tool_calling_agent(llm, tools, prompt) -agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) -response = agent_executor.invoke( - {"input": "can you browse hacker news and summarize the first website"} +response = agent.invoke( + {"messages": [{"role": "user", "content": "can you browse hacker news and summarize the first website"}]} ) +print(response["messages"][-1].content) ``` ## API reference