In [1]:
#| default_exp routes.crawler

# Crawl4AI Routes
> Default description (change me)

## Post Installation

Run post-installation setup
in terminal make sure to run

```terminal
crawl4ai-setup
```

Followed by Verifying your installation

```terminal
crawl4ai-doctor
```


In [2]:
#| export
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, DomainFilter
from agent_mafia.client.ResponseGetData import ResponseGetDataCrawler

In [3]:
#| exporti
import os
from typing import Callable
from agent_mafia.client import MafiaError as amme
from agent_mafia.utils import convert as amcv

In [4]:
#| hide
import nbdev
from functools import partial
from agent_mafia.routes.storage import save_chunk_to_disk


In [5]:
#| export
default_browser_config = BrowserConfig(
    browser_type="chromium",
    headless=True,
    verbose=True,
    extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
)

In [6]:
#| export
class Crawler_Route_NotSuccess(amme.MafiaError):
    def __init__(self, message = None, exception = None):
        super().__init__(message = message, exception = exception)

In [7]:
# | export
async def scrape_url(
    url: str,
    session_id: str,
    browser_config: BrowserConfig = None,
    crawler_config: CrawlerRunConfig = None,
    storage_fn: Callable = None,
):

    browser_config = browser_config or default_browser_config

    res = None
    content = None

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = crawler_config or CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                
            )

            res = await crawler.arun(
                url=url,
                config=crawler_config,
                session_id=session_id,
                timeout=15,
            )

            print(res)

    except NotImplementedError as e:
        raise Crawler_Route_NotSuccess(
            message="have you run create4ai-create and create4ai-doctor? in terminal",
            exception=e,
        )
    
    except Exception as e:
        raise Crawler_Route_NotSuccess(
            exception=e,
        ) from e

    if not res.success:
        raise Crawler_Route_NotSuccess(
            message=f"error crawling {url} - {res.error_message}"
        )

    rgd = ResponseGetDataCrawler.from_res(res)

    if storage_fn:
        storage_fn(
            data={
                "content": rgd.markdown or rgd.response,
                "source": session_id,
                "url": res.url,
            }
        )

    return rgd

In [8]:
# default_browser_config = BrowserConfig(
#     browser_type="chromium",
#     headless=True,
#     verbose=True,
#     extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
# )

res = await scrape_url(
    url = 'https://docs.tavily.com/sdk/python/quick-start',
    session_id = 'tavily_docs',
    browser_config = default_browser_config,
    storage_fn = partial(save_chunk_to_disk, output_path = '../../TEST/crawler_routes/scrape_tavily.md'),
)
res

[INIT].... → Crawl4AI 0.5.0.post8
[FETCH]... ↓ https://docs.tavily.com/sdk/python/quick-start... | Status: True | Time: 6.17s
[SCRAPE].. ◆ https://docs.tavily.com/sdk/python/quick-start... | Time: 0.075s
[COMPLETE] ● https://docs.tavily.com/sdk/python/quick-start... | Status: True | Total: 6.26s
CrawlResultContainer([CrawlResult(url='https://docs.tavily.com/sdk/python/quick-start', html='<!DOCTYPE html><html lang="en" class="js-focus-visible lg:[--scroll-mt:9.5rem]" data-js-focus-visible=""><head><meta charset="utf-8"><meta name="viewport" content="width=device-width"><link rel="apple-touch-icon" type="image/png" sizes="180x180" href="https://mintlify.s3-us-west-1.amazonaws.com/tavilyai/_generated/favicon/apple-touch-icon.png?v=3"><link rel="icon" type="image/png" sizes="32x32" href="https://mintlify.s3-us-west-1.amazonaws.com/tavilyai/_generated/favicon/favicon-32x32.png?v=3"><link rel="icon" type="image/png" sizes="16x16" href="https://mintlify.s3-us-west-1.amazonaws.com/tavilyai/_ge

ResponseGetDataCrawler(is_success=True, status=200, response='<div><main><div><div><div><div><div><div><div><div><a href="https://tavily.com/"><span>Tavily Docs home page</span><img alt="light logo" class="w-auto h-7 relative object-contain block dark:hidden" src="https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/light.svg"/><img alt="dark logo" class="w-auto h-7 relative object-contain hidden dark:block" src="https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/dark.svg"/></a></div><button><div><div>Search or ask...</div></div><span>Ctrl K</span></button><div><nav><ul><li><a href="mailto:support@tavily.com">Support</a></li><li><a href="https://app.tavily.com">Get an API key</a></li><li><a href="https://app.tavily.com"><div><span>Get an API key</span></div></a></li></ul></nav></div><div><button><span>Search...</span></button></div></div></div><div><button><span>Navigation</span></button><div><div><span>Python</span></div><div>Quickstart</div></div></div></div><div><div><a 

In [17]:
res.raw[0].markdown

'[Tavily Docs home page![light logo](https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/light.svg)![dark logo](https://mintlify.s3.us-west-1.amazonaws.com/tavilyai/logo/dark.svg)](https://tavily.com/)\nSearch or ask...\nCtrl K\n  * Support\n  * [Get an API key](https://app.tavily.com)\n  * [Get an API key](https://app.tavily.com)\n\n\nSearch...\nNavigation\nPython\nQuickstart\n[Home](https://docs.tavily.com/welcome)[Documentation](https://docs.tavily.com/documentation/about)[SDKs](https://docs.tavily.com/sdk/python/quick-start)[Examples](https://docs.tavily.com/examples/use-cases/data-enrichment)\n* [API Playground](https://app.tavily.com/playground)\n* [Community](https://community.tavily.com)\n* [Blog](https://blog.tavily.com)\n##### Python\n  * [Quickstart](https://docs.tavily.com/sdk/python/quick-start)\n  * [SDK Reference](https://docs.tavily.com/sdk/python/reference)\n\n\n##### JavaScript\n  * [Quickstart](https://docs.tavily.com/sdk/javascript/quick-start)\n  * [SDK Refer

In [None]:
# | export


async def crawl_urls(
    starting_url: str,
    session_id: str,
    output_folder: str,
    crawler_config: CrawlerRunConfig = None,
    browser_config: BrowserConfig = None,
    storage_fn: Callable = None,
    process_fn: Callable = None,
    delay_before_return_html: int = 3,
):
    browser_config = browser_config or default_browser_config
    try:

        results = []
        async with AsyncWebCrawler(config=browser_config) as crawler:
            async for res in await crawler.arun(
                starting_url,
                config=crawler_config,
                # timeout=15,
                magic = True,
                delay_before_return_html=delay_before_return_html,
                session_id=session_id,
            ):
                
                rgd = ResponseGetDataCrawler.from_res(res)

                output_path=f"{os.path.join(
                    output_folder, amcv.convert_url_file_name(rgd.url))}.md"
                
                print(output_path)

                if storage_fn:
                    storage_fn(
                        output_path = output_path,
                        data={
                            "content": rgd.markdown or rgd.response,
                            "source": session_id,
                            "url": rgd.url,
                        },
                    )

                if process_fn:
                    await process_fn(
                            rgd=rgd,
                            export_folder=output_folder,
                            source=session_id
                        )

                results.append(rgd)

        return results

    except NotImplementedError as e:
        raise Crawler_Route_NotSuccess(
            message="have you run create4ai-create and create4ai-doctor? in terminal",
            exception=e,
        )

    except Exception as e:
        raise Crawler_Route_NotSuccess(
            exception=e,
        ) from e

In [10]:
# domain_filter = DomainFilter(
#         allowed_domains=["api.slack.com"]
#     )


# config = CrawlerRunConfig(
#     cache_mode=CacheMode.ENABLED,
#     deep_crawl_strategy=BFSDeepCrawlStrategy(
#         max_depth=1,
#         filter_chain=FilterChain([domain_filter]),
#         include_external=False,
#     ),
#     stream=True,
#     verbose=True,
# )

# await crawl_urls(
#     starting_url = "https://api.slack.com/apis",
#     session_id = "slack_api_docs",
#     crawler_config= config,
#     storage_fn = partial(save_chunk_to_disk),
#     output_path = '../../TEST/crawler_routes/crawl/')

In [11]:
#| hide
nbdev.nbdev_export('./crawler.ipynb')