In [1]:

%load_ext autoreload
%autoreload 2
from IPython.display import display

import crawl4ai
print(crawl4ai.__version__.__version__)

0.4.248


In [3]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [4]:
import asyncio
from playwright.async_api import async_playwright

async def test_browser():
    """Smoke-test Playwright: load example.com in headless Chromium and print its title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto('https://example.com')
        page_title = await tab.title()
        print(f'Title: {page_title}')
        await chromium.close()

asyncio.run(test_browser())

Title: Example Domain


In [5]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, CacheMode

async def simple_crawl():
    """Crawl one page with ``CacheMode.BYPASS`` and print the full raw markdown."""
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    target_url = "https://www.naturalgasintel.com/"
    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(url=target_url, config=run_cfg)
        raw_md = outcome.markdown_v2.raw_markdown
        print(raw_md)


asyncio.run(simple_crawl())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.naturalgasintel.com/... | Status: True | Time: 4.22s
[SCRAPE].. ◆ Processed https://www.naturalgasintel.com/... | Time: 509ms
[COMPLETE] ● https://www.naturalgasintel.com/... | Status: True | Total: 4.75s
[![Header top ad placement](https://servedbyadbutler.com/adserve/;ID=176144;size=728x90;setID=427594;type=img)](https://www.naturalgasintel.com/<https:/servedbyadbutler.com/go2/;ID=176144;size=728x90;setID=427594>)
## Daily Prices
Updated Feb 13, 2025
2/13/25
[ ANR SE 0.26  ](https://www.naturalgasintel.com/</data-snapshot/daily-gpi/SLAANRSE/>) [ ANR SW 0.18  ](https://www.naturalgasintel.com/</data-snapshot/daily-gpi/MCWANR/>) [ Agua Dulce 0.135  ](https://www.naturalgasintel.com/</data-snapshot/daily-gpi/STXAGUAD/>) [ Algonquin Citygate -2.575  ](https://www.naturalgasintel.com/</data-snapshot/daily-gpi/NEAALGCG/>) [ Algonquin Citygate (non-G) -2.575  ](https://www.naturalgasintel.com/</data-snapshot/daily-gpi/NEALGNG/>) [ Allia

In [6]:
async def crawl_dynamic_content():
    """Run a "Load More" click script on the page, then print the raw markdown.

    A ``wait_for`` option can be added to the run config to wait for a
    condition before the result is returned. It may be a JS predicate, e.g.
    ``() => document.querySelectorAll('article.tease-card').length > 10``,
    or just a CSS selector such as ``"article.tease-card:nth-child(10)"``.
    It is not set here.
    """
    # Finds the first <button> whose text contains 'Load More' and clicks it if present.
    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"

    async with AsyncWebCrawler() as crawler:
        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            js_code=[click_load_more],
        )
        outcome = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_cfg,
        )
        raw_md = outcome.markdown_v2.raw_markdown
        print(raw_md)

asyncio.run(crawl_dynamic_content())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.nbcnews.com/business... | Status: True | Time: 4.16s
[SCRAPE].. ◆ Processed https://www.nbcnews.com/business... | Time: 345ms
[COMPLETE] ● https://www.nbcnews.com/business... | Status: True | Total: 4.65s
IE 11 is not supported. For an optimal experience visit our site on another browser.
Skip to Content
[NBC News Logo](https://www.nbcnews.com/<https:/www.nbcnews.com>)
  * [Trump admin](https://www.nbcnews.com/<https:/www.nbcnews.com/politics/donald-trump/live-blog/trump-linda-mcmahon-senate-hearing-live-updates-rcna191173>)
  * [Politics](https://www.nbcnews.com/<https:/www.nbcnews.com/politics>)
  * Local
  * [New York](https://www.nbcnews.com/<https:/www.nbcnews.com/new-york>)
  * [Los Angeles](https://www.nbcnews.com/<https:/www.nbcnews.com/los-angeles>)
  * [Chicago](https://www.nbcnews.com/<https:/www.nbcnews.com/chicago>)
  * [Dallas-Fort Worth](https://www.nbcnews.com/<https:/www.nbcnews.com/dallas-fort-worth>)
  * [Philade

In [7]:
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def clean_content():
    """Crawl a Wikipedia article with a pruning content filter and report markdown sizes.

    Prints the length of the raw markdown and of the filtered ("fit") markdown,
    then prints the raw markdown itself.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        pruning_filter = PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=0,
        )
        md_generator = DefaultMarkdownGenerator(
            content_filter=pruning_filter,
            options={"ignore_links": True},
        )
        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            excluded_tags=['nav', 'footer', 'aside'],
            remove_overlay_elements=True,
            markdown_generator=md_generator,
        )
        outcome = await crawler.arun(url="https://en.wikipedia.org/wiki/Apple", config=run_cfg)

        raw_len = len(outcome.markdown_v2.raw_markdown)
        fit_len = len(outcome.markdown_v2.fit_markdown)
        print(f"Full Markdown Length: {raw_len}")
        print(f"Fit Markdown Length: {fit_len}")

        print(outcome.markdown_v2.raw_markdown)

asyncio.run(clean_content())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://en.wikipedia.org/wiki/Apple... | Status: True | Time: 2.79s
[SCRAPE].. ◆ Processed https://en.wikipedia.org/wiki/Apple... | Time: 3257ms
[COMPLETE] ● https://en.wikipedia.org/wiki/Apple... | Status: True | Total: 6.15s
Full Markdown Length: 89369
Fit Markdown Length: 73113
Jump to content
![](https://en.wikipedia.org/static/images/icons/wikipedia.png) ![Wikipedia](https://en.wikipedia.org/static/images/mobile/copyright/wikipedia-wordmark-en.svg) ![The Free Encyclopedia](https://en.wikipedia.org/static/images/mobile/copyright/wikipedia-tagline-en.svg)
Search
Search
Photograph your local culture, help Wikipedia and win!
![Hide](https://upload.wikimedia.org/wikipedia/commons/2/20/CloseWindow19x19.png)
# Apple
192 languages
  * Afrikaans
  * Alemannisch
  * አማርኛ
  * Ænglisc
  * العربية
  * Aragonés
  * ܐܪܡܝܐ
  * Արեւմտահայերէն
  * Armãneashti
  * অসমীয়া
  * Asturianu
  * Atikamekw
  * अवधी
  * Avañe'ẽ
  * Azərbaycanca
  * تۆرکجه
  * Basa 

In [8]:
async def link_analysis():
    """Crawl a page and print its internal/external link counts and the first five internal links."""
    async with AsyncWebCrawler() as crawler:
        # An explicit exclude_domains=[...] list (e.g. "facebook.com", "twitter.com")
        # could be passed here as well; it is not used in this example.
        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_links=True,
            exclude_social_media_links=True,
        )
        outcome = await crawler.arun(url="https://www.nbcnews.com/business", config=run_cfg)

        internal_links = outcome.links['internal']
        print(f"Found {len(internal_links)} internal links")
        external_links = outcome.links['external']
        print(f"Found {len(external_links)} external links")

        for entry in internal_links[:5]:
            print(f"Href: {entry['href']}\nText: {entry['text']}\n")


asyncio.run(link_analysis())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.nbcnews.com/business... | Status: True | Time: 2.08s
[COMPLETE] ● https://www.nbcnews.com/business... | Status: True | Total: 2.10s
Found 144 internal links
Found 24 external links
Href: https://www.nbcnews.com
Text: NBC News Logo

Href: https://www.nbcnews.com/politics/donald-trump/live-blog/trump-linda-mcmahon-senate-hearing-live-updates-rcna191173
Text: Trump admin

Href: https://www.nbcnews.com/politics
Text: Politics

Href: https://www.nbcnews.com/new-york
Text: New York

Href: https://www.nbcnews.com/los-angeles
Text: Los Angeles



In [9]:
async def media_handling():
    """Crawl a page and print URL, alt text and score for its first five images."""
    async with AsyncWebCrawler() as crawler:
        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_images=False,
            # Add screenshot=True here if a screenshot is wanted as well.
        )
        outcome = await crawler.arun(url="https://www.nbcnews.com/business", config=run_cfg)

        first_images = outcome.media['images'][:5]
        for image in first_images:
            src, alt, score = image['src'], image['alt'], image['score']
            print(f"Image URL: {src}, Alt: {alt}, Score: {score}")

asyncio.run(media_handling())

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.nbcnews.com/business... | Status: True | Time: 0.19s
[COMPLETE] ● https://www.nbcnews.com/business... | Status: True | Total: 0.22s
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-762x508,f_auto,q_auto:best/rockcms/2025-02/250213-egg-prices-mn-1500-eec9cb.jpg, Alt: A customer shops for eggs , Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-02/250213-Trump-aa-1259-11749a.jpg, Alt: President Donald Trump speaks in the Oval Office at the White House on Feb.12, 2025., Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-02/250213-credit-card-stock-aa-401-b35ebc.jpg, Alt: A person pays with a credit card., Score: 6
Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2025-02/250213-eggs-texas-ew-355p-6e184d.jpg, Alt: Image: First Inflation Report 

In [12]:
from playwright.async_api import Page, BrowserContext
async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
    """Hook called before navigating to each URL.

    Logs the URL about to be visited, sets one custom HTTP header on the page
    (as an example of per-request customisation) and returns the page.
    """
    print(f"[HOOK] before_goto - About to visit: {url}")
    extra_headers = {"Custom-Header": "my-value"}
    await page.set_extra_http_headers(extra_headers)
    return page

async def custom_hook_workflow(verbose=True):
    """Register ``before_goto`` as a hook, crawl crawl4ai.com and print the raw markdown.

    Args:
        verbose: Forwarded to ``BrowserConfig(verbose=...)``.
    """
    browser_cfg = BrowserConfig(verbose=verbose)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Install the custom code that runs just before navigation.
        crawler.crawler_strategy.set_hook("before_goto", before_goto)

        run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        outcome = await crawler.arun(url="https://crawl4ai.com", config=run_cfg)
        print(outcome.markdown_v2.raw_markdown)

asyncio.run(custom_hook_workflow())

[INIT].... → Crawl4AI 0.4.248
[HOOK] before_goto - About to visit: https://crawl4ai.com
[FETCH]... ↓ https://crawl4ai.com... | Status: True | Time: 3.34s
[SCRAPE].. ◆ Processed https://crawl4ai.com... | Time: 72ms
[COMPLETE] ● https://crawl4ai.com... | Status: True | Total: 3.45s
[Crawl4AI Documentation (v0.4.3bx)](https://crawl4ai.com/<https:/docs.crawl4ai.com/>)
  * [ Home ](https://crawl4ai.com/<.>)
  * [ Quick Start ](https://crawl4ai.com/<core/quickstart/>)
  * [ Search ](https://crawl4ai.com/<#>)


  * Home
  * Setup & Installation
    * [Installation](https://crawl4ai.com/<core/installation/>)
    * [Docker Deployment](https://crawl4ai.com/<core/docker-deploymeny/>)
  * [Quick Start](https://crawl4ai.com/<core/quickstart/>)
  * Blog & Changelog
    * [Blog Home](https://crawl4ai.com/<blog/>)
    * [Changelog](https://crawl4ai.com/<https:/github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md>)
  * Core
    * [Simple Crawling](https://crawl4ai.com/<core/simple-crawling/>)
    * [Cr

In [13]:
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
)
import json

async def crawl_dynamic_content_pages_method_2(num_pages: int = 2):
    """Crawl consecutive pages of the TypeScript commit list within one browser session.

    The first iteration loads the URL normally. Every later iteration reuses the
    same session (``session_id``) with ``js_only=True`` and runs a script that
    clicks the "next" pagination link and waits for the first commit title to
    change. Commit titles are extracted with a CSS-based JSON schema.

    Args:
        num_pages: How many pages to crawl (defaults to 2, the original behaviour).

    Raises:
        AssertionError: If any page reports ``result.success`` as false.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    url = "https://github.com/microsoft/TypeScript/commits/main"
    session_id = "typescript_commits_session"
    all_commits = []

    # Clicks "next" and polls until the first commit title changes. The wait is
    # bounded (~10 s) and skipped entirely when there is no next button, so the
    # script can no longer spin forever on the last page or on a markup change.
    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };

        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (!button) return;
        button.click();

        // Poll for changes, giving up after 10 seconds
        const deadline = Date.now() + 10000;
        while (Date.now() < deadline) {
            await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """

    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [
            {
                "name": "title",
                "selector": "h4.markdown-title",
                "type": "text",
                "transform": "strip",
            },
        ],
    }
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler() as crawler:
        try:
            for page in range(num_pages):
                is_followup = page > 0
                config = CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    session_id=session_id,
                    css_selector="li.Box-sc-g0xbh4-0",
                    extraction_strategy=extraction_strategy,
                    # Only follow-up pages paginate via JS on the already-open page.
                    js_code=js_next_page_and_wait if is_followup else None,
                    js_only=is_followup,
                )
                result = await crawler.arun(url=url, config=config)

                assert result.success, f"Failed to crawl page {page + 1}"

                # extracted_content may be empty when nothing matched the schema.
                commits = json.loads(result.extracted_content or "[]")
                all_commits.extend(commits)

                print(f"Page {page + 1}: Found {len(commits)} commits")
        finally:
            # Release the shared session even if a crawl or the assertion failed.
            await crawler.crawler_strategy.kill_session(session_id)

        print(f"Successfully crawled {len(all_commits)} commits across {num_pages} pages")

asyncio.run(crawl_dynamic_content_pages_method_2())


--- Advanced Multi-Page Crawling with JavaScript Execution ---
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/microsoft/TypeScript/commits/ma... | Status: True | Time: 2.34s
[SCRAPE].. ◆ Processed https://github.com/microsoft/TypeScript/commits/ma... | Time: 231ms
[EXTRACT]. ■ Completed for https://github.com/microsoft/TypeScript/commits/ma... | Time: 0.16967160000058357s
[COMPLETE] ● https://github.com/microsoft/TypeScript/commits/ma... | Status: True | Total: 2.79s
Page 1: Found 0 commits
[ERROR]... × Error updating image dimensions: Page.evaluate: Execution context was destroyed, most likely because of a navigation
[FETCH]... ↓ https://github.com/microsoft/TypeScript/commits/ma... | Status: True | Time: 1.31s
[SCRAPE].. ◆ Processed https://github.com/microsoft/TypeScript/commits/ma... | Time: 138ms
[EXTRACT]. ■ Completed for https://github.com/microsoft/TypeScript/commits/ma... | Time: 0.17855520000011893s
[COMPLETE] ● https://github.com/microsoft/TypeScript/commits/

In [14]:
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
)
import json
async def extract():
    """Extract KidoCode course cards with a CSS schema after clicking through every tab.

    Prints the extracted JSON, the number of extracted items and the length of
    ``result.markdown``.
    """

    def text_field(name, selector):
        # Schema entry for a field read as plain text.
        return {"name": name, "selector": selector, "type": "text"}

    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .div-block-214.p-extraxx",
        "fields": [
            text_field("section_title", "h3.heading-50"),
            text_field("section_description", ".charge-content"),
            text_field("course_name", ".text-block-93"),
            text_field("course_description", ".course-content-text"),
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler() as crawler:
        # Script that clicks each tab in turn, pausing 500 ms after every click.
        tab_click_script = """
        (async () => {
            const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");

            for(let tab of tabs) {
                // scroll to the tab
                tab.scrollIntoView();
                tab.click();
                // Wait for content to load and animations to complete
                await new Promise(r => setTimeout(r, 500));
            }
        })();
        """

        run_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=css_strategy,
            js_code=[tab_click_script],
        )
        outcome = await crawler.arun(url="https://www.kidocode.com/degrees/technology", config=run_cfg)

        extracted_json = outcome.extracted_content
        courses = json.loads(extracted_json)
        print(extracted_json)
        print(f"Successfully extracted {len(courses)} courses")
        print(len(outcome.markdown))

await extract()

[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://www.kidocode.com/degrees/technology... | Status: True | Time: 2.90s
[SCRAPE].. ◆ Processed https://www.kidocode.com/degrees/technology... | Time: 126ms
[EXTRACT]. ■ Completed for https://www.kidocode.com/degrees/technology... | Time: 0.17904020000059973s
[COMPLETE] ● https://www.kidocode.com/degrees/technology... | Status: True | Total: 3.27s
[
    {
        "section_title": "Coding",
        "section_description": "Introducing students to the world of programming through Python, emphasizing foundational coding concepts and creativity in solving problems through generative art and geometric patterns.",
        "course_name": "Coding with Python",
        "course_description": "Explore the exciting intersection of art and technology in Creative Coding with Python. This engaging course introduces young learners to Python programming, teaching them foundational coding concepts through the creation of generative art and geometric patterns.

In [None]:
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
import os, json
# Resolve the OpenAI key. On Google Colab it is read from the notebook's secret
# store; anywhere else (e.g. local Jupyter) `google.colab` is not importable,
# so whatever OPENAI_API_KEY is already exported in the environment is kept.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Schema of one pricing row handed to the LLM extraction strategy.
# Deliberately documented with comments rather than a class docstring.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )

async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):
    """Extract OpenAI pricing rows from the pricing page with an LLM extraction strategy.

    Args:
        provider: Provider/model identifier, e.g. ``"openai/gpt-4o-mini"`` or
            ``"ollama/llama3.2"``.
        api_token: API token for the provider. May be omitted only for Ollama
            providers; for any other provider the example is skipped.
        extra_headers: Optional headers, forwarded to ``LLMExtractionStrategy``
            as the ``extra_headers`` keyword when given.

    Returns:
        None. Prints the first five extracted items, or a message when the
        example is skipped or nothing was extracted.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Skip if API token is missing (for providers that require it).
    # Prefix match so that "ollama/<model>" identifiers are accepted as well as
    # the bare "ollama" string.
    if api_token is None and not provider.startswith("ollama"):
        print(f"API token is required for {provider}. Skipping this example.")
        return

    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    # Two adjacent literals joined by the parser; the sentence and the example
    # row are separated by a single space, with no stray quotes in the prompt.
    instruction = (
        "Extract all model names along with fees for input and output tokens. "
        "{model_name: 'GPT-4', input_fee: 'US$10.00 / 1M tokens', output_fee: 'US$30.00 / 1M tokens'}."
    )

    llm_strategy = LLMExtractionStrategy(
        provider=provider,
        api_token=api_token,
        schema=OpenAIModelFee.schema(),
        extraction_type="schema",
        instruction=instruction,
        **extra_args
    )

    # Same options as before, passed through a run config like the other cells.
    run_config = CrawlerRunConfig(
        word_count_threshold=1,
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.ENABLED,
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=run_config,
        )
        if not result.extracted_content:
            # Nothing to parse; json.loads(None) would raise a TypeError.
            print(f"No structured content was extracted with {provider}.")
            return
        print(json.loads(result.extracted_content)[:5])

# Usage:
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
# await extract_structured_data_using_llm("ollama/llama3.2")
await extract_structured_data_using_llm("openai/gpt-4o-mini", os.getenv("OPENAI_API_KEY"))

In [28]:
from crawl4ai.extraction_strategy import CosineStrategy

async def cosine_similarity_extraction():
    """Cluster an NBC News article with ``CosineStrategy`` and print every extracted item.

    Each item of the extracted JSON is a mapping; all of its key/value pairs are
    printed one per line.
    """
    strategy = CosineStrategy(
        word_count_threshold=10,
        max_dist=0.2, # Maximum distance between two words
        linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
        top_k=3, # Number of top keywords to extract
        sim_threshold=0.3, # Similarity threshold for clustering
        semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
        verbose=True
    )

    # `cache_mode` was previously misspelled as `cach_mode`, so the setting never
    # reached the crawler under its real name. It is now passed through a run
    # config like the other cells.
    run_config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.ENABLED,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
            config=run_config,
        )
        if not result.extracted_content:
            # Nothing to parse; json.loads(None) would raise a TypeError.
            print("No content was extracted.")
            return

        # Named `clusters` rather than `all` to avoid shadowing the builtin.
        clusters = json.loads(result.extracted_content)
        for cluster in clusters:
            for key, value in cluster.items():
                print(f"{key}: {value}")

asyncio.run(cosine_similarity_extraction())

[INIT].... → Crawl4AI 0.4.248
[LOG] Loading Extraction Model for cuda device.
[LOG] Loading Multilabel Classifier for cuda device.
[LOG] Model loaded sentence-transformers/all-MiniLM-L6-v2, models/reuters, took 0.0950007438659668 seconds
[FETCH]... ↓ https://www.nbcnews.com/business/consumer/how-mcdo... | Status: True | Time: 0.04s
[COMPLETE] ● https://www.nbcnews.com/business/consumer/how-mcdo... | Status: True | Total: 0.09s
index: 1
tags: ['news_&_social_concern']
content: 
  * [Latest Stories](https://www.nbcnews.com/business/consumer/</latest-stories>)
  * [Trump admin](https://www.nbcnews.com/business/consumer/<https:/www.nbcnews.com/politics/donald-trump/live-blog/trump-linda-mcmahon-senate-hearing-live-updates-rcna191173>)
  * [Politics](https://www.nbcnews.com/business/consumer/<https:/www.nbcnews.com/politics>)
  * [U.S. News](https://www.nbcnews.com/business/consumer/<https:/www.nbcnews.com/us-news>)
  * [World](https://www.nbcnews.com/business/consumer/<https:/www.nbcnews.c