In [1]:
from typing import Type
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.utilities import WikipediaAPIWrapper
import requests
from bs4 import BeautifulSoup


# Chat model handed to the agent further down; sampling temperature is set to 0.1.
llm = ChatOpenAI(temperature=0.1, model="gpt-4o-mini")


class WikipediaSearchToolArgs(BaseModel):
    # Argument schema used as `args_schema` by WikipediaSearchTool.
    # It has a single required string field, `query`.
    query: str = Field(
        ...,
        description="The topic you want to research on Wikipedia, e.g. 'XZ backdoor'",
    )


class WikipediaSearchTool(BaseTool):
    # Agent tool that looks a topic up on Wikipedia and reports the first hit.
    name = "WikipediaSearchTool"
    description = """
    Use this tool to search for a topic on Wikipedia and return the main article content.
    It will search Wikipedia and return:
    - the title
    - the URL
    - the main text content
    """
    args_schema: Type[WikipediaSearchToolArgs] = WikipediaSearchToolArgs

    def _run(self, query):
        """Return title, URL and text of the first Wikipedia result for ``query``.

        The article text is cut off after 10000 characters. When the lookup
        yields nothing, a fixed "not found" message is returned instead.
        """
        articles = WikipediaAPIWrapper().load(query)

        if articles:
            # Only the first returned document is reported.
            top_hit = articles[0]
            meta = top_hit.metadata
            heading = meta["title"]
            link = meta["source"]
            body = top_hit.page_content[:10000]
            return f"Title: {heading}\nURL: {link}\nCONTENT:\n{body}"

        return "No Wikipedia article found."


class WebScraperToolArgs(BaseModel):
    # Argument schema used as `args_schema` by WebScraperTool.
    # It has a single required string field, `url`.
    url: str = Field(..., description="URL of the web page to scrape text from.")


class WebScraperTool(BaseTool):
    # Agent tool that downloads a page and returns its visible text.
    name = "WebScraperTool"
    description = """
    Use this tool to scrape plain text from a web page given its URL.
    It uses HTTP GET + HTML parsing.
    """
    args_schema: Type[WebScraperToolArgs] = WebScraperToolArgs

    def _run(self, url):
        """Fetch ``url`` and return up to 10000 characters of its plain text.

        Args:
            url: Address of the page to download.

        Returns:
            The page text with ``script``/``style``/``noscript`` content
            removed and whitespace collapsed to single spaces, truncated to
            10000 characters. If the request fails, times out, or the server
            answers with an HTTP error status, a short message describing
            the failure is returned instead of raising.
        """
        try:
            # requests has no default timeout; without one, a stalled server
            # would block the whole agent run indefinitely.
            response = requests.get(
                url,
                headers={"User-Agent": "Mozilla"},
                timeout=15,
            )
            # Treat 4xx/5xx as failures rather than scraping the error page.
            response.raise_for_status()
        except requests.RequestException as exc:
            # Report the failure as text so the caller gets a result it can
            # react to, mirroring how the Wikipedia tool reports "not found".
            return f"Failed to fetch {url}: {exc}"

        soup = BeautifulSoup(response.text, "html.parser")

        # Drop elements whose content is not visible page text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Collapse every run of whitespace into a single space.
        text = " ".join(soup.get_text(" ").split())

        return text[:10000]


class SaveToFileToolArgs(BaseModel):
    # Argument schema used as `args_schema` by SaveToFileTool.
    # Both fields are required strings.
    filename: str = Field(
        ...,
        description="File name for saving research, e.g. 'xz_backdoor_research.txt'",
    )
    content: str = Field(
        ...,
        description="The research notes or extracted content to save.",
    )


class SaveToFileTool(BaseTool):
    # Agent tool that writes text to a file on the local filesystem.
    name = "SaveToFileTool"
    description = """
    Use this tool to save research results into a .txt file.
    """
    args_schema: Type[SaveToFileToolArgs] = SaveToFileToolArgs

    def _run(self, filename, content):
        """Write ``content`` to ``filename`` as UTF-8, replacing any existing file.

        Args:
            filename: Path of the file to create or overwrite.
            content: Text to write.

        Returns:
            A confirmation message naming the file that was written.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        # The context manager closes the handle even when write() raises,
        # which the previous manual open/close sequence did not guarantee.
        with open(filename, "w", encoding="utf-8") as file:
            file.write(content)

        return f"Research saved to {filename}"


# Topic for this run; it is interpolated into the instructions below.
query = "Research about the XZ backdoor"

# Agent wired to the three tools defined above, using the OPENAI_FUNCTIONS
# agent type with verbose logging and parsing-error handling enabled.
agent = initialize_agent(
    tools=[WikipediaSearchTool(), WebScraperTool(), SaveToFileTool()],
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    handle_parsing_errors=True,
    verbose=True,
)

# Step-by-step instructions telling the agent which tool to use at each stage.
prompt = f"""
Research this topic: {query}

Steps:
1. Find the Wikipedia page with WikipediaSearchTool.
2. Get the URL, scrape the page using WebScraperTool.
3. Write a research report.
4. Save it with SaveToFileTool.
"""

# Run the agent on the prompt and keep whatever it returns.
result = agent.invoke(prompt)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `WikipediaSearchTool` with `{'query': 'XZ backdoor'}`


[0m[36;1m[1;3mTitle: XZ Utils backdoor
URL: https://en.wikipedia.org/wiki/XZ_Utils_backdoor
CONTENT:
In February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan". The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution through OpenSSH on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of 10.0, the highest possible score.
While xz is commonly present in most Linux distributions, at the time of discovery the backdoored version had not yet been widely deployed to production systems, but was present in development versions of major distributions. The backdoor was discovered by the software developer A