# Crawl4AI

In [70]:
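# One-time setup (uncomment to run). The outputs in this notebook were produced with Crawl4AI 0.4.248.
# %pip install crawl4ai==0.4.248
# !crawl4ai-setup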

## Crawling a single URL

In [71]:
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def crawl4ai_crawl(url: str):
    """Crawl one page with Crawl4AI and hand back its raw markdown.

    Args:
        url: Address of the page to crawl.

    Returns:
        str: ``markdown_v2.raw_markdown`` of the crawl result when the result
        reports success; otherwise the text ``"Error: <error_message>"`` taken
        from the failed result. Failures are returned, not raised.
    """
    # Build both configs up front, before any crawler is opened.
    headless_browser = BrowserConfig(headless=True)
    bypass_cache_run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        page = await crawler.arun(url=url, config=bypass_cache_run)

        # Guard clause: report the failure as a string and stop here.
        if not page.success:
            return f"Error: {page.error_message}"

        return page.markdown_v2.raw_markdown

In [72]:
# Page fetched by the single-URL demo.
url_to_crawl = "https://www.utwente.nl/en/eemcs/dacs/news/2025/1/98282/jair-santanna-appointed-to-europols-european-cybercrime-centre-advisory-board"
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the thread, which is normally the case inside a Jupyter kernel.
# Presumably an earlier cell or the environment makes this work (e.g. nest_asyncio);
# confirm, or use IPython's top-level `await crawl4ai_crawl(url_to_crawl)` instead.
crawled_text = asyncio.run(crawl4ai_crawl(url_to_crawl))

# Uncomment to inspect the result: the page markdown, or an "Error: ..." string on failure.
# print(crawled_text)

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Status: True | Time: 0.58s
[SCRAPE].. ◆ Processed https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Time: 11ms
[COMPLETE] ● https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Status: True | Total: 0.60s


## Crawl in parallel!

In [73]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def crawl4ai_parallel(urls: list[str]):
    """Crawl several URLs through one ``arun_many`` call and collect the outcomes.

    Args:
        urls: URLs to crawl. An empty list returns an empty dict immediately,
            without opening a crawler session.

    Returns:
        dict[str, str]: Maps each result's ``url`` to its
        ``markdown_v2.raw_markdown`` on success, or to
        ``"Error: <error_message>"`` on failure. Failures are stored, not
        raised. Results sharing the same ``url`` collapse into a single entry
        (the last one wins).
    """
    # Nothing to crawl: do not start a crawler session just to get no results.
    if not urls:
        return {}

    run_conf = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=False,  # Get all results at once
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=run_conf)

        # One entry per result: markdown on success, error text otherwise.
        return {
            res.url: (
                res.markdown_v2.raw_markdown
                if res.success
                else f"Error: {res.error_message}"
            )
            for res in results
        }

In [74]:
# Pages fetched by the multi-URL demo.
urls_to_crawl = [
    "https://www.utwente.nl/en/eemcs/dacs/news/2025/1/98282/jair-santanna-appointed-to-europols-european-cybercrime-centre-advisory-board",
    "https://isaca.nl/people/jair-santanna/",
    "https://scholar.google.com/citations?user=TxcQNxUAAAAJ&hl=en"
]

# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the thread, which is normally the case inside a Jupyter kernel.
# Presumably an earlier cell or the environment makes this work (e.g. nest_asyncio);
# confirm, or use IPython's top-level `await crawl4ai_parallel(urls_to_crawl)` instead.
# The result is a dict keyed by result URL (markdown text or "Error: ..." per entry).
crawled_results = asyncio.run(crawl4ai_parallel(urls_to_crawl))

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Status: True | Time: 0.63s
[SCRAPE].. ◆ Processed https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Time: 11ms
[COMPLETE] ● https://www.utwente.nl/en/eemcs/dacs/news/2025/1/9... | Status: True | Total: 0.65s
[FETCH]... ↓ https://isaca.nl/people/jair-santanna/... | Status: True | Time: 0.79s
[SCRAPE].. ◆ Processed https://isaca.nl/people/jair-santanna/... | Time: 32ms
[COMPLETE] ● https://isaca.nl/people/jair-santanna/... | Status: True | Total: 0.83s
[FETCH]... ↓ https://scholar.google.com/citations?user=TxcQNxUA... | Status: True | Time: 0.85s
[SCRAPE].. ◆ Processed https://scholar.google.com/citations?user=TxcQNxUA... | Time: 19ms
[COMPLETE] ● https://scholar.google.com/citations?user=TxcQNxUA... | Status: True | Total: 0.88s


In [76]:
# print(crawled_results)