# HTML PARSER USING OLLAMA 

- https://github.com/InsightEdge01/ScrapegraphAIOllamallama3/blob/main/app.py


- https://www.youtube.com/watch?v=2BTI3KIiGHU


- https://scrapegraph-doc.onrender.com/docs/Graphs/smart_scraper_graph

!pip install scrapegraphai

In [1]:
from scrapegraphai.graphs import SmartScraperGraph
import nest_asyncio  # Patches asyncio so event loops can be nested (per the nest_asyncio docs)
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the scraper starts its own -- confirm against scrapegraphai.
nest_asyncio.apply()

# Settings handed to SmartScraperGraph. Both the chat model and the embedding
# model are served by the Ollama instance at localhost:11434.
graph_config = dict(
    llm=dict(
        model="ollama/llama3",  # model used to answer the prompt
        temperature=0,  # sampling temperature for the llm
        format="json",  # ask Ollama for JSON output
        base_url="http://localhost:11434",  # Ollama endpoint
    ),
    embeddings=dict(
        model="ollama/nomic-embed-text",  # model used for embeddings
        base_url="http://localhost:11434",  # Ollama endpoint
    ),
    verbose=True,  # verbose mode on, for debugging
)

# Build a scraper graph for the loan-calculator page.
# Alternatives left over in the original cell (kept for reference):
#   prompt="List all the content"
#   source="https://github.com/InsightEdge01"
#   source="https://perinim.github.io/projects"
smart_scraper_graph = SmartScraperGraph(
    config=graph_config,  # graph configuration defined above
    source="https://jpinzon.pyscriptapps.com/loan-calculator/latest/",  # URL to scrape
    prompt="DESCRIBE THE PAGE",  # instruction given to the model
)

In [2]:
# FIRST CASE - SIMPLE PAGE
# Ask for a description of the loan-calculator page.
source1 = "https://jpinzon.pyscriptapps.com/loan-calculator/latest/"
prompt1 = "DESCRIBE THE PAGE"

smart_scraper_graph = SmartScraperGraph(
    prompt=prompt1,
    source=source1,
    config=graph_config,
)

result = smart_scraper_graph.run()
result  # last expression in the cell, so the notebook displays it

--- Executing Fetch Node ---
--- (Fetching HTML from: https://jpinzon.pyscriptapps.com/loan-calculator/latest/) ---
--- Executing Parse Node ---
--- Executing RAG Node ---
--- (updated chunks metadata) ---
--- (tokens compressed and vector stored) ---
--- Executing GenerateAnswer Node ---
Processing chunks: 100%|██████████| 1/1 [01:03<00:00, 63.07s/it]


{'page_description': 'A loan calculator page with a menu to calculate loan amortization. The page allows users to enter information such as interest rate, loan amount, and monthly payment. It also displays results including the number of payments, years, months, total paid (principal and interest), and an amortization schedule.'}

In [3]:
# SECOND CASE - MORE DATA IN THE URL
# This one takes about 6 minutes, possibly because the Wikipedia page
# contains a lot of information.
source2 = "https://en.wikipedia.org/wiki/Copa_Am%C3%A9rica"
prompt2 = "what country has hosted the Copa America More times?"

smart_scraper_graph = SmartScraperGraph(
    prompt=prompt2,
    source=source2,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(result)

--- Executing Fetch Node ---
--- (Fetching HTML from: https://en.wikipedia.org/wiki/Copa_Am%C3%A9rica) ---
--- Executing Parse Node ---
--- Executing RAG Node ---
--- (updated chunks metadata) ---
--- (tokens compressed and vector stored) ---
--- Executing GenerateAnswer Node ---
Processing chunks: 100%|██████████| 2/2 [00:00<00:00, 2937.19it/s]


{'answer': 'Chile'}


In [4]:
def html_llm_parser(prompt, source, config):
    """Run a single SmartScraperGraph query and return its result.

    Args:
        prompt: Passed to SmartScraperGraph as ``prompt``.
        source: Passed to SmartScraperGraph as ``source``.
        config: Passed to SmartScraperGraph as ``config``.

    Returns:
        Whatever ``SmartScraperGraph.run()`` returns.
    """
    return SmartScraperGraph(prompt=prompt, source=source, config=config).run()

In [5]:
# Configuration for the recycling-centre query: same Ollama models as the
# earlier cells, but with verbose output switched off and "headless" enabled.
graph_config = dict(
    llm=dict(
        model="ollama/llama3",  # model used to answer the prompt
        temperature=0,  # sampling temperature for the llm
        format="json",  # ask Ollama for JSON output
        base_url="http://localhost:11434",  # Ollama endpoint
    ),
    embeddings=dict(
        model="ollama/nomic-embed-text",  # model used for embeddings
        base_url="http://localhost:11434",  # Ollama endpoint
    ),
    verbose=False,  # verbose mode is OFF in this cell
    # NOTE(review): presumably makes the page fetch run without a visible
    # browser window -- confirm against the scrapegraphai docs.
    headless=True,
)

# Ask the model for the recycling location's details on the City of Arlington page.
url = "https://www.arlingtontx.gov/city_hall/departments/garbage_recycling/household_hazardous_waste/environmental_collection_center"
# The prompt must be a plain str. A trailing comma after the literal would turn
# q into a one-element tuple, and a tuple would be passed as the prompt.
q = "What is the name, address and schedule of the recycling location in the page provided?"
rec_center = html_llm_parser(q, url, graph_config)
rec_center  # last expression in the cell, so the notebook displays it

{}