<a href="https://colab.research.google.com/github/jayy1809/Webscraper-CrewAI2/blob/main/webscraper_crew.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install crewai

In [None]:
!pip install crewai_tools

In [3]:
from crewai import Agent, Task, Crew

In [4]:
from crewai_tools import SerperDevTool, \
                         ScrapeWebsiteTool, \
                         WebsiteSearchTool

In [6]:
import os

In [9]:
import getpass

In [59]:
# Export the model name via OPENAI_MODEL_NAME (presumably read by CrewAI's default LLM setup — confirm).
os.environ["OPENAI_MODEL_NAME"] = "gpt-4o"

In [10]:
# Read the key through getpass so it is not echoed or stored in the notebook source.
os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="Enter your API key: ")

Enter your API key: ··········


In [11]:
# Read the Serper key through getpass and export it as SERPER_API_KEY
# (presumably consumed by SerperDevTool — confirm).
os.environ["SERPER_API_KEY"] = getpass.getpass(prompt="Enter your Serper API key: ")

Enter your Serper API key: ··········


In [60]:
# Tool passed to search_agent together with search_tool2.
# NOTE(review): `search_tool` is rebound to a SerperDevTool in a later cell of this notebook.
search_tool = WebsiteSearchTool()

In [61]:
# Second tool passed to search_agent.
# NOTE(review): presumably relies on the SERPER_API_KEY environment variable set above — confirm.
search_tool2 = SerperDevTool()

In [76]:
# First agent of the pipeline: equipped with both search tools to collect supplier URLs.
search_agent = Agent(
    role="Search Agent",
    goal="Find relevant URLs for suppliers of product from the web.",
    backstory=(
        "As the Search Agent, you are responsible for discovering potential "
        "suppliers by performing web searches. your primary task is to find "
        "and collect URLs that could contain information about chia seed "
        "suppliers."
    ),
    tools=[search_tool, search_tool2],
    allow_delegation=False,
    verbose=True,
)

In [77]:
# Scraping tool passed to scraper_agent.
scraper_tool = ScrapeWebsiteTool()

In [78]:
# Second agent of the pipeline: uses the scraping tool on the URLs found by the search step.
scraper_agent = Agent(
    role="Scraper Agent",
    goal="Scrape data from the supplier URLs.",
    backstory=(
        "As the Scraper Agent, your role is to visit each URL provided by the "
        "Search Agent and scrape the necessary HTML content. This data will be "
        "used by the Data Extraction Agent to extract specific information "
        "about chia seeds suppliers."
    ),
    tools=[scraper_tool],
    allow_delegation=False,
    verbose=True,
)

In [79]:
# Third agent of the pipeline: tool-less, works only on the scraped content it is given.
data_extraction_agent = Agent(
    role="Data Extraction Agent",
    goal="Extract supplier details from the scraped HTML content.",
    backstory=(
        "As the Data Extraction Agent your job is to do Retrieval augmented "
        "generation, you are tasked with parsing the HTML content provided by "
        "the Scraper Agent. your job is to extract specific details such as "
        "supplier name, email, phone number, price, and location."
    ),
    tools=[],
    allow_delegation=False,
    verbose=True,
)

In [80]:
# Final agent of the pipeline: tool-less, organizes the extracted supplier records.
aggregator_agent = Agent(
    role="Aggregator Agent",
    goal="Aggregate the extracted supplier data into a structured format.",
    backstory=(
        "As the Aggregator Agent, you collect and organize the supplier data "
        "extracted by the Data Extraction Agent. your aim is to present this "
        "information in a structured and accessible format for further use."
    ),
    tools=[],
    allow_delegation=False,
    verbose=True,
)

In [81]:
# Fixed search task used by the crew; the product is hard-coded to chia seeds.
# The description has no placeholders, so it is a plain string rather than an f-string.
# For other products, build the task with search_info_task(product_name) instead.
search_task = Task(
    description="Search the internet and find URLs for suppliers of chia seeds.",
    expected_output="""A list of URLs of supplier websites:
    [
        "http://supplier1.com",
        "http://supplier2.com",
        ...
    ]""",
    agent=search_agent,
    output_file="search_results.txt",
)
def search_info_task(product_name):
    """Build a supplier-search Task for ``product_name``.

    The product name is interpolated into the task description. The task is
    assigned to ``search_agent`` and configured with the output file
    ``search_results.txt``.
    """
    description = f"Search the internet and find URLs for suppliers of {product_name}."
    expected_output = """A list of URLs of supplier websites:
        [
            "http://supplier1.com",
            "http://supplier2.com",
            ...
        ]"""
    return Task(
        agent=search_agent,
        description=description,
        expected_output=expected_output,
        output_file="search_results.txt",
    )


In [82]:
# Scraping step: assigned to scraper_agent, output file scraped_data.txt.
scraper_task = Task(
    agent=scraper_agent,
    description="Scrape the HTML content from the supplier URLs.",
    # Spelled out line by line so the indentation sent in the prompt is explicit.
    expected_output=(
        "A dictionary containing URL and its corresponding HTML content:\n"
        "    {\n"
        '        "http://supplier1.com": "<html>...</html>",\n'
        '        "http://supplier2.com": "<html>...</html>",\n'
        "        ...\n"
        "    }"
    ),
    output_file="scraped_data.txt",
)


In [83]:
# Extraction step: assigned to data_extraction_agent, output file extracted_data.txt.
data_extraction_task = Task(
    agent=data_extraction_agent,
    description=(
        "Extract supplier details (supplier name, email, phone number, price, "
        "location) from the scraped HTML content."
    ),
    # Spelled out line by line so the indentation sent in the prompt is explicit.
    expected_output=(
        "A dictionary containing the extracted supplier details will look "
        "something like this (this is an example):\n"
        "    {\n"
        '        "supplier_name": "Supplier 1",\n'
        '        "email": "contact@supplier1.com",\n'
        '        "phone_number": "+1234567890",\n'
        '        "price": "$10 per kg",\n'
        '        "location": "USA"\n'
        "    }"
    ),
    output_file="extracted_data.txt",
)


In [84]:
# Aggregation step: assigned to aggregator_agent, output file aggregated_data.txt.
aggregator_task = Task(
    agent=aggregator_agent,
    output_file="aggregated_data.txt",
    description="Aggregate the extracted supplier details into a structured format.",
    # Template of the final result: one record per supplier, all fields as strings.
    expected_output="""A list of dictionaries, each containing supplier details:
    [
        {
            "website URL" : "",
            "supplier_name": "",
            "email": "",
            "phone_number": "",
            "price": "",
            "location": ""
        },
        {
            "website URL" : "",
            "supplier_name": "",
            "email": "",
            "phone_number": "",
            "price": "",
            "location": ""
        },
        ...
    ]""",
)


In [85]:
# Four-agent crew; tasks are listed in pipeline order: search, scrape, extract, aggregate.
crew = Crew(
    agents=[search_agent, scraper_agent, data_extraction_agent, aggregator_agent],
    tasks=[search_task, scraper_task, data_extraction_task, aggregator_task],
    # `verbose` is a boolean flag in current CrewAI releases, where the integer
    # level 2 fails validation; True is accepted by older and newer versions alike.
    verbose=True,
    memory=True,
)



In [72]:
# NOTE(review): the crew built above runs the hard-coded `search_task`, so this
# value does not change what that crew searches for.
product_name = input("Enter the product name: ")

Enter the product name: chia seeds


In [86]:
# Run the crew and keep its final output in `result`, which a later cell renders as Markdown.
result = crew.kickoff()

[1m[95m [DEBUG]: == Working Agent: Search Agent[00m
[1m[95m [INFO]: == Starting Task: Search the internet and find URLs for suppliers of chia seeds.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mI need to start by performing a web search to find suppliers of chia seeds. My goal is to gather a list of URLs that lead to direct suppliers, ensuring they are credible and reliable. I will begin by searching for chia seed suppliers.

Action: Search the internet
Action Input: {"search_query": "chia seed suppliers"}[0m[95m 


Search results: Title: Chia Seeds - Grain Millers
Link: https://www.grainmillers.com/products/chia/
Snippet: Although we are an international supplier of grain-based ingredients, we are not an exporter or trader of raw agricultural commodities sold in vessels of 10,000 ...
---
Title: Wholesale Chia Seeds - HERBCO.com
Link: https://www.herbco.com/c-229-chia.aspx
Snippet: Buy top-quality chia seeds in bulk at Monterey Bay Herb Co. Perfect for n

In [58]:
from IPython.display import Markdown

# Markdown() only accepts text; str() keeps this cell working whether kickoff()
# returned a plain string or a richer result object.
Markdown(str(result))

[
    {
        "website URL": "https://www.grainmillers.com/products/chia/",
        "supplier_name": "Grain Millers",
        "email": "contact@grainmillers.com",
        "phone_number": "952.829.8821",
        "price": "$10 per kg",
        "location": "USA"
    },
    {
        "website URL": "https://uschia.com/",
        "supplier_name": "US Chia",
        "email": "",
        "phone_number": "",
        "price": "",
        "location": ""
    }
]

In [75]:
# Tools handed to the `researcher` agent defined below.
# NOTE(review): `search_tool` and `scraper_tool` rebind names assigned in earlier
# cells; `search_tool` previously held a WebsiteSearchTool and now holds a SerperDevTool.
search_tool = SerperDevTool()
scraper_tool = ScrapeWebsiteTool()
data_extraction_tool = WebsiteSearchTool()

# Research agent: a single agent that carries the search, scrape and site-search tools.
researcher = Agent(
    role="Supplier Research Specialist",
    goal="Find the best suppliers for the specified product.",
    backstory=(
        "You are an expert in market research with a focus on supplier "
        "analysis. Your goal is to find the best suppliers for a given product."
    ),
    tools=[search_tool, scraper_tool, data_extraction_tool],
    allow_delegation=False,
    verbose=True,
)

# Factory for the research task (search + compile supplier information)
def supplier_info_task(product_name):
    """Build the supplier-research Task for ``product_name``.

    The product name is interpolated into the task description. The task is
    assigned to ``researcher`` and configured with the output file
    ``supplier_info.txt``.
    """
    description = (
        f"Search the internet and find the best suppliers for {product_name}. "
        "Extract the supplier name, product name, price, email, and phone number."
    )
    expected_output = """A structured list of suppliers with the following details:
        website URL :
        Supplier Name:
        Product Name:
        Price:
        Email:
        Phone Number: """
    return Task(
        agent=researcher,
        description=description,
        expected_output=expected_output,
        output_file="supplier_info.txt",
    )

# Writer agent: given no tools; turns the research output into a short report.
writer = Agent(
    role="Supplier Information Analyst",
    goal="Summarize supplier information into a concise report.",
    backstory=(
        "You are an expert in summarizing market research data into easily "
        "understandable reports for business use."
    ),
    verbose=True,
    allow_delegation=False,
)

# Summary task: assigned to the writer agent, output file summary_report.txt.
task2 = Task(
    agent=writer,
    description="Summarize the supplier information into a bullet point list.",
    expected_output=(
        "A summarized bullet point list of each supplier, including their "
        "product names, prices, emails, and phone numbers."
    ),
    output_file="summary_report.txt",
)

# Function to kick off the research process
def get_supplier_info(product_name):
    """Run the researcher/writer crew for ``product_name`` and return its result.

    Args:
        product_name: Product to look up; ``supplier_info_task`` interpolates
            it into the research task's description.

    Returns:
        Whatever ``Crew.kickoff()`` returns for the two-task crew (the research
        task followed by the module-level ``task2`` summary task).
    """
    research_task = supplier_info_task(product_name)
    supplier_crew = Crew(
        agents=[researcher, writer],
        tasks=[research_task, task2],
        # `verbose` is a boolean flag in current CrewAI releases, where the integer
        # level 2 fails validation; True is accepted by older and newer versions alike.
        verbose=True,
    )
    return supplier_crew.kickoff()

# Example usage
# NOTE: inside a notebook cell __name__ is "__main__", so this guard always runs here.
# It rebinds the notebook-level `product_name` and stores the crew result in `supplier_info`.
if __name__ == "__main__":
    product_name = input("Enter the product name: ")
    supplier_info = get_supplier_info(product_name)
    print(supplier_info)


Enter the product name: chia seeds




[1m[95m [DEBUG]: == Working Agent: Supplier Research Specialist[00m
[1m[95m [INFO]: == Starting Task: Search the internet and find the best suppliers for chia seeds. Extract the supplier name, product name, price, email, and phone number.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mTo find the best suppliers for chia seeds, I need to start by searching the internet for potential suppliers.

Action: Search the internet
Action Input: {"search_query": "best suppliers for chia seeds"}[0m[95m 


Search results: Title: 10 Best Chia Seeds Manufacturing Companies - Keychain.com
Link: https://www.keychain.com/manufacturer/chia-seeds
Snippet: For the top manufacturers of Chia Seeds , these certifications include FDA, Certified Organic, and SQF Food Safety. Qualitative factors such as reputation are ...
---
Title: Chia Seeds - Grain Millers
Link: https://www.grainmillers.com/products/chia/
Snippet: Chia Seeds. Seeds are a great source of healthy fats, vegetarian p