In [1]:
#| default_exp routes.crawler

# Crawl4AI Routes
> Routes for scraping web pages with Crawl4AI's `AsyncWebCrawler`.

## Post Installation

After installing `crawl4ai`, run the post-installation setup in a terminal:

```terminal
crawl4ai-setup
```

Then verify your installation:

```terminal
crawl4ai-doctor
```


In [2]:
#| export
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

from agent_mafia.client.ResponseGetData import ResponseGetDataCrawler

In [3]:
#| exporti
from typing import Callable
from agent_mafia.client import MafiaError as amme

In [4]:
#| hide
import nbdev
from functools import partial
from agent_mafia.routes.storage import save_chunk_to_disk

In [5]:
#| export
# Shared browser settings, used by `scrape_url` whenever the caller does not
# supply a `browser_config` of their own.
default_browser_config = BrowserConfig(
    headless=True,
    verbose=True,
    browser_type="chromium",
    # Extra command-line switches handed to the browser.
    extra_args=[
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-sandbox",
    ],
)

In [6]:
#| export
class Crawler_Route_NotSuccess(amme.MafiaError):
    """Error raised by the crawler routes when a crawl does not succeed."""

    def __init__(self, message, exception=None):
        # Forward both values by keyword to the base MafiaError initialiser.
        super().__init__(exception=exception, message=message)

In [7]:
#| export
async def scrape_url(
    url: str,
    session_id: str,
    browser_config: BrowserConfig = None,
    storage_fn: Callable = None,
):
    """Crawl a single URL with Crawl4AI and return the wrapped result.

    Args:
        url: address of the page to crawl.
        session_id: forwarded to `crawler.arun`; also stored as the
            `source` of the chunk handed to `storage_fn`.
        browser_config: browser settings for the crawler. Falls back to
            `default_browser_config` when omitted.
        storage_fn: optional callable invoked as `storage_fn(data=...)`
            with a dict holding the keys 'content' (the result's markdown),
            'source' (the session id) and 'url'.

    Returns:
        The object produced by `ResponseGetDataCrawler.from_res` for the
        crawl result.

    Raises:
        Crawler_Route_NotSuccess: if the crawler raises `NotImplementedError`
            (the message hints at a missing post-installation setup), or if
            the crawl result reports `success` as falsy.
    """

    browser_config = browser_config or default_browser_config

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Always fetch a fresh copy instead of serving from the cache.
            crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

            # NOTE(review): session_id and timeout are passed as extra keyword
            # arguments to arun rather than through CrawlerRunConfig - confirm
            # the installed Crawl4AI version honours them here.
            res = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,
                timeout=15,
            )

    except NotImplementedError as e:
        # Point the user at the real setup commands and keep the original
        # traceback attached via exception chaining.
        raise Crawler_Route_NotSuccess(
            message="have you run crawl4ai-setup and crawl4ai-doctor in terminal?",
            exception=e,
        ) from e

    if not res.success:
        raise Crawler_Route_NotSuccess(
            message=f"error crawling {url} - {res.error_message}"
        )

    rgd = ResponseGetDataCrawler.from_res(res)

    # Persist the scraped markdown only when the caller supplied a storage hook.
    if storage_fn:
        storage_fn(
            data={
                "content": res.markdown,
                "source": session_id,
                "url": url,
            }
        )

    return rgd

In [None]:

res = await scrape_url(
    url = 'https://docs.tavily.com/sdk/python/quick-start',
    session_id = 'tavily_docs',
    browser_config = default_browser_config,
    storage_fn = partial(save_chunk_to_disk, output_path = '../../TEST/crawler_routes/scrape_tavily.md')
)
res

[INIT].... → Crawl4AI 0.5.0.post4


In [None]:
from pprint import pprint

# Dump the attribute dictionary of the first element of the response.
pprint(vars(res[0]))

{'cleaned_html': '<div><main><div><div><div><div><div><div><div><div><a '
                 'href="https://tavily.com/"><span>Tavily Docs home '
                 'page</span><img alt="light logo" class="w-auto h-7 relative '
                 'object-contain block dark:hidden" '
                 'src="https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/light.svg"/><img '
                 'alt="dark logo" class="w-auto h-7 relative object-contain '
                 'hidden dark:block" '
                 'src="https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/dark.svg"/></a></div><button><div><div>Search '
                 'or ask...</div></div><span>Ctrl '
                 'K</span></button><div><nav><ul><li><a '
                 'href="mailto:support@tavily.com">Support</a></li><li><a '
                 'href="https://app.tavily.com">Get an API key</a></li><li><a '
                 'href="https://app.tavily.com"><div><span>Get an API '
                 'key</span></div></a>

In [None]:
#| hide
# Run nbdev's export on this notebook file.
nbdev.nbdev_export('./crawler.ipynb')