### Build an Autogen agent that can scrape a website for data and format the output.

#### Step 1: Import required packages and set required variables.

In [0]:
# import required packages
import autogen
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing_extensions import Annotated
from autogen import ConversableAgent, register_function

# The LLM settings are declared inline here rather than loaded from a JSON/text
# file with autogen.config_list_from_json. Every sensitive value is read from
# the "myscope" secret scope so nothing is hard-coded in the notebook.

def _aoai_secret(key):
    """Return the secret stored under `key` in the "myscope" scope."""
    return dbutils.secrets.get(scope="myscope", key=key)


# Single Azure OpenAI entry; lookups happen in the order model, key, endpoint.
_azure_openai_entry = {
    "model": _aoai_secret("aoai-deploymentname"),
    "api_key": _aoai_secret("aoai-api-key"),
    "base_url": _aoai_secret("aoai-endpoint"),
    "api_type": "azure",
    "api_version": "2024-02-15-preview",
}

llm_config = {"config_list": [_azure_openai_entry]}

#### Step 2: Define a function that'll scrape content from a blog post using LangChain's `WebBaseLoader`.

In [0]:
def scrape_blog(url: Annotated[str, "The url of a blog post to scrape"]) -> Annotated[str, "The scraped content"]:
    """
    Fetch a blog post with WebBaseLoader and return its text content.

    Args:
    url (str): The URL of the blog post to scrape. A URL given without a
        scheme (e.g. "example.com/post") is treated as https.

    Returns:
    str: The page content of the first loaded document, or an empty string
        if the loader returned no documents.

    Raises:
    ValueError: If `url` is empty or only whitespace.
    """
    url = url.strip()
    if not url:
        raise ValueError("url must be a non-empty string")

    # The URL may come from an LLM tool call or a chat message and can arrive
    # without a scheme; default to https so the loader gets an absolute URL.
    if "://" not in url:
        url = f"https://{url}"

    documents = WebBaseLoader([url]).load()
    # Guard the index so an empty result does not surface as an IndexError.
    if not documents:
        return ""
    return documents[0].page_content

def chunk_text(text: str, chunk_size: int = 600, chunk_overlap: int = 125) -> list:
    """
    Split text into overlapping chunks with a tiktoken-based recursive splitter.

    Args:
    text (str): The text to split.
    chunk_size (int): Chunk size handed to the splitter. Defaults to 600.
    chunk_overlap (int): Overlap between consecutive chunks handed to the
        splitter. Defaults to 125.

    Returns:
    list: The chunks returned by the splitter's `split_text`.
    """
    # NOTE(review): the deployment name from the secret scope is passed as the
    # tiktoken model name -- confirm it is a model name tiktoken recognises.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=dbutils.secrets.get(scope="myscope", key="aoai-deploymentname"),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Try paragraph breaks first, then lines, then words, then characters.
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_text(text=text)

# Scrape one known blog post directly (outside the agent loop) to confirm the
# scraper works before it is registered as a tool.
url = "https://chinnychukwudozie.com/2024/07/10/enhancing-document-extraction-with-azure-ai-document-intelligence-and-langchain-for-rag-workflows/"
blog_content = scrape_blog(url=url)

# Report the size first, then the raw text under a banner.
print(f"Character count: {len(blog_content)}")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("---\nScraped blog text:\n---")
print(f" {blog_content}")

#### Step 3: Create the agents and register the function.

In [0]:
def is_termination_msg_def(x):
    """
    Tell whether a chat message signals the end of the conversation.

    Args:
    x (dict): A message dictionary; its "content" entry is inspected.

    Returns:
    bool: True when "content" is a string containing "terminate"
        (case-insensitive); False when the key is missing, the value is
        None, or the value is not a string.
    """
    content = x.get("content", "")
    # The isinstance check covers None and any other non-string content, which
    # would otherwise fail on .lower().
    return isinstance(content, str) and "terminate" in content.lower()

In [0]:
# Create web scraper agent
scraper_agent = ConversableAgent(
    name="WebScraper",
    llm_config=llm_config,
    # The two literals are concatenated implicitly, so the first one must end
    # with a space; without it the prompt reads "...provided.Returns ...".
    system_message="You are a web scrapper and you can scrape any web page using the tools provided. "
    "Returns 'TERMINATE', when the scraping is done",
)

# Create user proxy agent: no LLM, no human input, no code execution. It only
# sends the default auto-reply until a termination message arrives.
user_proxy_agent = ConversableAgent(
    name="User",
    llm_config=False,
    human_input_mode="NEVER",
    code_execution_config=False,
    default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
    # Reuse the shared predicate instead of an inline lambda. The lambda read
    # x["content"] after x.get("content", ""), so a message without a
    # "content" key raised KeyError instead of returning False.
    is_termination_msg=is_termination_msg_def,
)

# Expose scrape_blog as a tool shared by the two agents: the scraper agent is
# registered as the caller and the user proxy agent as the executor.
register_function(
    scrape_blog,
    name="scrape_blog",
    description="Scrape a blog post and return the content.",
    caller=scraper_agent,
    executor=user_proxy_agent,
)

#### Step 4: Start the conversation for scraping web data. We use the `reflection_with_llm` option as the summary method to format the output into the desired layout. The summary method is called after the conversation completes and is given the complete conversation history.

In [0]:
# Start the scraping conversation. When it finishes, the "reflection_with_llm"
# summary method is given the summary prompt below to format the result.
chat_result = user_proxy_agent.initiate_chat(
    recipient=scraper_agent,
    # Reuse the `url` defined in Step 2 so the request carries the full
    # https:// address rather than a second, scheme-less copy of it.
    message=f"Can you scrape {url} for me?",
    summary_method="reflection_with_llm",
    summary_args={
        # The template is kept flush-left with no trailing spaces: the prompt
        # asks for this format EXACTLY, so stray whitespace is noise.
        "summary_prompt": """
Summarize the scraped content and format summary EXACTLY as follows:
---
*Company name*:
`Acme Corp`
---
*Website*:
`acmecorp.com`
---
*Description*:
`Company that does things.`
---
*Tags*:
`Manufacturing. Retail. E-commerce.`
---
*Takeaways*:
`Provides shareholders with value by selling products.`
---
*Questions*:
`What products or services do they offer? If they make money, how? What is their market share?`
---"""
    },
)

In [0]:
# Display the summary stored on the chat result returned by initiate_chat.
print(chat_result.summary)