In [1]:
from IPython.display import clear_output

In [2]:
import sys
# Make the local crawl_lib directory importable for the cells below.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only resolves it on the original machine — consider a relative or
# configurable location.
sys.path.append(r'C:/IT/crawl4AI/crawl_lib')

In [5]:
import asyncio
import json
import os
from pathlib import Path
from typing import List

from crawl4ai import *
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

# Demo_basic_crawl

In [6]:
async def demo_basic_crawl():
    """Crawl a single article and print a short preview of its markdown.

    Opens a headless browser with a 1200x800 viewport, fetches one article
    and, for every result returned by ``arun``, prints whether the crawl
    succeeded, the length of the raw markdown and its first 100 characters.
    """
    print("\n=== 1. Basic Web Crawling ===")

    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url='https://tuoitre.vn/ba-truong-my-lan-bi-tuyen-y-an-tu-hinh-20241203120659551.htm'
        )

        for position, result in enumerate(results, start=1):
            print(f"Result {position}")
            print(f"Success: {result.success}")
            if result.success:
                raw_markdown = result.markdown.raw_markdown
                print(f"Markdown length: {len(raw_markdown)} chars")
                # Slice to 100 characters so the output matches its label
                # instead of dumping the whole document.
                print(f"100 first chars: {raw_markdown[:100]}...")
            else:
                print("Failed to crawl the URL")

# Demo_parallel_crawl

In [7]:
async def demo_parallel_crawl():
    """Crawl several articles with ``arun_many`` and print one status line each.

    Uses a headless browser with a 1200x800 viewport. Each output line shows
    a 1-based position, the crawled URL and either ``Success`` or ``Failed``.
    """
    print("\n=== 2. Parallel Crawling ===")

    urls = [
        'https://tuoitre.vn/ba-truong-my-lan-bi-tuyen-y-an-tu-hinh-20241203120659551.htm',
        'https://tuoitre.vn/lanh-dao-tp-hcm-dang-huong-tuong-niem-cac-anh-hung-liet-si-tai-con-dao-20250727105240659.htm',
        'https://tuoitre.vn/chung-cu-36-tang-tai-tp-hcm-nut-ho-o-tang-16-2025072710311524.htm',
    ]

    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun_many(urls=urls)

        print(f"Crawled {len(results)} URLs in parallel:")
        # Number from 1; the previous `i + i` produced 0, 2, 4, ...
        for position, result in enumerate(results, start=1):
            status = 'Success' if result.success else 'Failed'
            print(f" {position}. {result.url} - {status} ")

# Demo_fit_markdown

In [8]:
async def demo_fit_markdown():
    """Compare raw markdown with the pruned ("fit") markdown of one page.

    The page is crawled with a ``DefaultMarkdownGenerator`` whose content
    filter is a ``PruningContentFilter``; the sizes of both markdown variants
    and the full fit markdown are printed.
    """
    print("\n=== 3. Fit MarkDown with LLM content Filter ===")

    run_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter()
        )
    )

    async with AsyncWebCrawler() as crawler:
        # This block uses the return value as a single result, not a list.
        result: CrawlResult = await crawler.arun(
            url='https://ofac.treasury.gov/media/933656/download?inline',
            config=run_config,
        )

        # Bail out early on failure instead of dereferencing result.markdown.
        if not result.success:
            print(f"Failed to crawl the URL: {result.error_message}")
            return

        fit_markdown = result.markdown.fit_markdown or ""
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(fit_markdown)} chars")

        print(f"Fit markdown: {fit_markdown}")

# Demo_media_and_links

In [9]:
async def demo_media_and_links():
    """Crawl one article and summarise the images and links found on it.

    For each result the number of images, internal links and external links
    is printed, followed by up to three samples of each kind.
    """
    print("\n=== 4. Media and links extraction ===")

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url='https://tuoitre.vn/ba-truong-my-lan-bi-tuyen-y-an-tu-hinh-20241203120659551.htm'
        )

        # Distinct names for the iterable and the item: the loop variable
        # used to shadow the collection it was iterating over.
        for result in results:
            if not result.success:
                print("Failed to crawl the URL")
                continue

            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Optional: persist the extracted data for later inspection.
            # with open("images.json", 'w') as f:
            #     json.dump(images, f, indent = 2)

            # with open("links.json", 'w') as f:
            #     json.dump(
            #         {"internal": internal_links, "external": external_links},
            #         f,
            #         indent = 2
            #     )

            # Show only a small sample to keep the cell output readable.
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal links: {link['href']}")
            for link in external_links[:3]:
                print(f"External links: {link['href']}")

# Demo_pdf_crawl

In [6]:
async def demo_pdf_crawl():
    """Download a remote PDF and print its title and extracted markdown.

    ``PDFCrawlerStrategy`` is passed to the crawler and
    ``PDFContentScrapingStrategy`` to the run configuration, so the PDF is
    processed by those two strategies.
    """
    # Strategy handed to the crawler itself.
    pdf_crawler_strategy = PDFCrawlerStrategy()

    # PDFCrawlerStrategy is typically used in conjunction with PDFContentScrapingStrategy
    # The scraping strategy handles the actual PDF content extraction
    pdf_scraping_strategy = PDFContentScrapingStrategy()
    run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)

    async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
        # Remote PDF served from ofac.treasury.gov.
        pdf_url = "https://ofac.treasury.gov/media/933901/download?inline"

        print(f"Attempting to process PDF: {pdf_url}")
        result = await crawler.arun(url=pdf_url, config=run_config)

        if result.success:
            print(f"Successfully processed PDF: {result.url}")
            # `.get('title', 'N/A')` still yields None when the key exists with
            # a None value, so fall back explicitly.
            title = (result.metadata or {}).get('title') or 'N/A'
            print(f"Metadata Title: {title}")
            # Further processing of result.markdown, result.media, etc.
            # would be done here, based on what PDFContentScrapingStrategy extracts.
            if result.markdown and hasattr(result.markdown, 'raw_markdown'):
                print(f"Extracted text: {result.markdown.raw_markdown}...")
            else:
                print("No markdown (text) content extracted.")
        else:
            print(f"Failed to process PDF: {result.error_message}")

In [13]:
# Run the PDF demo; the recorded output follows this cell.
await demo_pdf_crawl()

Attempting to process PDF: https://ofac.treasury.gov/media/933901/download?inline


Successfully processed PDF: https://ofac.treasury.gov/media/933901/download?inline
Metadata Title: None
Extracted text: **DEPARTMENT OF THE TREASURY**
WASHINGTON, D.C.
* * *
**OFFICE OF FOREIGN ASSETS CONTROL**
DETERMINATION PURSUANT TO SECTION 1(a)(ii) OF EXECUTIVE ORDER 140 71
* * *
Prohibition on Petroleum Services
* * *
Pursuant to sections 1(a)(ii), 1(b), and 5 of Execu tive Order (E.O.) 14071 of April 6, 2022 (“Prohibiting New Investment in and Certain Services to the Russian Federation in Response to Continued Russian Federation Aggression”) and 31 CFR § 587.802, and in consultation with the Department of State, I hereby determine that the prohibitions in section 1(a)(ii) of E.O. 14071 shall apply to the following categor y of services : petroleum services.
* * *
As a result, the following activities are prohibited, except to the extent provided by law, or unless licensed or otherwise authorized by the Office of Foreign Assets Control:
* * *
The exportation, reexportation, sale,

# Demo raw html and file

In [11]:
async def demo_raw_html_and_file():
    """Process an in-memory HTML snippet and the same snippet saved to disk.

    The snippet is written to ``docs/examples/tmp/sample.html`` (parent
    directories are created if needed), then crawled twice: once through the
    ``raw:`` URL prefix and once through a ``file://`` URL. The first 50
    characters of each resulting markdown are printed.
    """
    print("\n=== 6. Raw HTML and local_files")

    raw_html = """
    <html> <body>
        <h1> Sample Article </h1>
        <p> This is sample content for testing Crawl4AI's raw HTML processing. </p>
    </body></html>
    """

    file_path = Path("docs/examples/tmp/sample.html").absolute()
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the file identical across platforms.
    file_path.write_text(raw_html, encoding="utf-8")

    # Bypass the cache so both inputs are processed fresh on every run.
    no_cache = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="raw: " + raw_html, config=no_cache)
        print("Raw HTML processing:")
        print(f"Markdown: {result.markdown.raw_markdown[:50]}")

        # The saved file was previously written but never crawled.
        file_result = await crawler.arun(url=f"file://{file_path}", config=no_cache)
        print("Local file processing:")
        print(f"Markdown: {file_result.markdown.raw_markdown[:50]}")

# Demo LLM Structured Extraction No Schema

In [12]:
# Read the Gemini API key from the notebook's user secrets rather than
# embedding it in the code. `kaggle_secrets` is presumably only importable
# inside a Kaggle kernel — confirm before running elsewhere.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Used as the LLM api_token by demo_llm_structured_extraction_no_schema.
secret_value_0 = user_secrets.get_secret("GEMINI_API_KEY_2")

In [13]:
async def demo_llm_structured_extraction_no_schema():
    """Extract news items (title, url) from a front page with an LLM.

    Builds an ``LLMExtractionStrategy`` backed by Gemini, using the API key
    held in the module-level ``secret_value_0``, crawls vnexpress.net and
    pretty-prints the extracted JSON for every successful result.
    """
    print("\n=== 6. Extract Structured data with LLM ===")
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="gemini/gemini-2.5-flash",
            api_token=secret_value_0
        ),
        instruction=(
            "This is link "
            "Extract all news items you can find. I want: title, source url."
        ),
        # Documented keyword names are `extraction_type` and `extra_args`;
        # the earlier `extract_type` / `extract_args` spellings meant the
        # sampling settings below never took effect.
        extraction_type="schema",
        schema="{title: string, url: string}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True
    )

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url='https://vnexpress.net/',
            config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if not result.success:
                print("Failed to extract structured data")
                continue
            try:
                data = json.loads(result.extracted_content)
            except (json.JSONDecodeError, TypeError) as e:
                # TypeError covers extracted_content being None.
                print("Failed to parse extracted content:", e)
            else:
                # ensure_ascii=False keeps Vietnamese text readable.
                print(json.dumps(data, indent=2, ensure_ascii=False))



# Demo js interaction

In [14]:
async def demo_js_interaction():
    """Extract items from a page, click "more" via JavaScript, extract again.

    Both crawls share one browser session (``session_id``), so the second
    run executes its JavaScript on the already-open page (``js_only=True``).
    Items from both runs are accumulated and the running totals are printed.
    """
    print("\n=== 7. JavaScript Interaction === ")

    page_url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions'
    session_id = "hn_session"

    # NOTE(review): `tr.athing`, `span.titleline` and `a.morelink` look like
    # Hacker News markup, while the URL above is an OFAC page — these
    # selectors will presumably match nothing there; confirm the target.
    news_schema = {
        "name": "news",
        "baseSelector": "tr.athing",
        "fields": [
            {
                "name": "title",
                "selector": "span.titleline",
                "type": "text",
            }
        ],
    }

    news = []

    def collect_items(crawl_results):
        """Append the extracted items of each successful result to `news`."""
        for crawl_result in crawl_results:
            if crawl_result.success:
                data = json.loads(crawl_result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        first_results: List[CrawlResult] = await crawler.arun(
            url=page_url,
            config=CrawlerRunConfig(
                session_id=session_id,
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )
        collect_items(first_results)
        print(f"Initial items: {len(news)}")

        more_config = CrawlerRunConfig(
            # Statement terminator is `;` — the previous trailing `:` made
            # the snippet a JavaScript syntax error.
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,
            session_id=session_id,
            extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
        )

        more_results: List[CrawlResult] = await crawler.arun(
            url=page_url, config=more_config
        )
        # Process the second crawl's results; the old loop re-iterated the
        # first crawl and double-counted its items.
        collect_items(more_results)
        print(f"Total items: {len(news)}")

# Demo deep crawl

In [16]:
async def demo_deep_crawl():
    """Breadth-first deep crawl of one OFAC page, restricted to its own domain.

    The crawl follows links one level deep (up to 1000 pages), keeps only
    URLs on ofac.treasury.gov, and prints every visited URL with the depth
    recorded in its metadata.
    """
    print("\n=== Deep Crawling")

    start_url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions'

    # Only follow links that stay on the OFAC domain.
    domain_only = FilterChain([DomainFilter(allowed_domains=["ofac.treasury.gov"])])
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=1,
        max_pages=1000,
        filter_chain=domain_only,
    )
    run_config = CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)

    async with AsyncWebCrawler() as crawler:
        pages = await crawler.arun(url=start_url, config=run_config)

        print(f"Deep crawl returned {len(pages)} pages:")
        for position, page in enumerate(pages, start=1):
            # Pages without a recorded depth are reported as "unknown".
            page_depth = page.metadata.get("depth", "unknown")
            print(f"{position}. {page.url} (Depth: {page_depth})")

In [17]:
# Run the deep-crawl demo.
# NOTE(review): the NotImplementedError recorded below presumably comes from
# launching the browser on an event loop without subprocess support (e.g. a
# Windows selector loop under Jupyter), not from the demo's own logic — confirm.
await demo_deep_crawl()


=== Deep Crawling


NotImplementedError: 

In [None]:
async def crawl_and_extract(use_llm: bool = False):
    """Deep-crawl vnexpress.net with pruned markdown and optional LLM extraction.

    Args:
        use_llm: When True, attach a Gemini-backed ``LLMExtractionStrategy``
            so each page yields structured ``{title, url}`` items. Defaults
            to False, which matches the previous behaviour where the
            extraction strategy was left out of the run configuration.

    Raises:
        RuntimeError: If ``use_llm`` is True and the ``GEMINI_API_KEY``
            environment variable is not set.
    """
    filter_chain = FilterChain([DomainFilter(allowed_domains=["vnexpress.net"])])

    # Deep crawling: breadth-first, one level below the start page.
    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=1000, filter_chain=filter_chain
    )

    # Fit markdown: prune boilerplate from the generated markdown.
    markdown_gen = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter()
    )

    run_cfg_kwargs = {
        "deep_crawl_strategy": deep_crawl_strategy,
        "markdown_generator": markdown_gen,
    }

    if use_llm:
        # SECURITY: the API key must never live in the notebook source. The
        # key that was previously embedded here should be treated as leaked
        # and revoked; supply a fresh one through the environment instead.
        api_token = os.getenv("GEMINI_API_KEY")
        if not api_token:
            raise RuntimeError(
                "GEMINI_API_KEY environment variable is required when use_llm=True"
            )

        run_cfg_kwargs["extraction_strategy"] = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider="gemini/gemini-2.5-flash",
                api_token=api_token,
            ),
            instruction=(
                "This is link "
                "Extract all news items you can find. I want: title, source url."
            ),
            # Documented keyword names: `extraction_type` and `extra_args`.
            extraction_type="schema",
            schema="{title: string, url: string}",
            extra_args={
                "temperature": 0.0,
                "max_tokens": 4096,
            },
            verbose=True,
        )

    # Combine the individual strategies into one run configuration.
    run_cfg = CrawlerRunConfig(**run_cfg_kwargs)

    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url='https://vnexpress.net/',
            config=run_cfg,
        )

        for res in results:
            print("URL:", res.url)
            print("Depth:", res.metadata.get("depth"))
            print("Extracted ok:", res.success)
            if res.success:
                # With use_llm=True, res.extracted_content is a JSON string
                # following the schema above; otherwise nothing was extracted.
                print(res.extracted_content)
            else:
                print("  — No data extracted")

In [15]:
# Demo runner: uncomment exactly the demo(s) to execute. Only
# crawl_and_extract() is currently enabled.
# await demo_basic_crawl()
# await demo_parallel_crawl()
# await demo_fit_markdown()
# await demo_media_and_links()
# await demo_pdf_crawl()
# await demo_llm_structured_extraction_no_schema()
# await demo_raw_html_and_file()
# await demo_js_interaction()
# await demo_deep_crawl()
await crawl_and_extract()

NotImplementedError: 