In [6]:
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMConfig,
)
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from typing import List, TypedDict
from pydantic import BaseModel, Field, create_model
import json
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import nest_asyncio
import asyncio
import os

nest_asyncio.apply()
load_dotenv(dotenv_path=".env")

True

In [None]:
class Product(BaseModel):
    product_name: str = Field("..", description="제품의 이름")
    price: str = Field("..", description="제품의 가격")


async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")
        ),
        schema=Product.model_json_schema(),  # Or use model_json_schema()
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
        chunk_token_threshold=1000,
        overlap_rate=0.0,
        apply_chunking=True,
        input_format="markdown",  # or "html", "fit_markdown"
        extra_args={"temperature": 0.0, "max_tokens": 800},
    )

    # 2. Build the crawler config
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS
    )

    # 3. Create a browser config if needed
    browser_cfg = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # 4. Let's say we want to crawl a single page
        result = await crawler.arun(
            url="https://www.coupang.com/np/categories/420186", config=crawl_config
        )

        if result.success:
            # 5. The extracted content is presumably JSON
            data = json.loads(result.extracted_content)
            print("Extracted items:", data)

            # 6. Show usage stats
            llm_strategy.show_usage()  # prints token usage
        else:
            print("Error:", result.error_message)


if __name__ == "__main__":
    asyncio.run(main())

/var/folders/3g/rzddny917kq4pymbw_ts10f00000gn/T/ipykernel_2829/2490078773.py:12: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  schema=Product.schema_json(),  # Or use model_json_schema()


[INIT].... → Crawl4AI 0.5.0.post4
[FETCH]... ↓ https://www.coupang.com/np/categories/420186... | Status: True | Time: 3.95s
[SCRAPE].. ◆ https://www.coupang.com/np/categories/420186... | Time: 0.377s
[EXTRACT]. ■ Completed for https://www.coupang.com/np/categories/420186... | Time: 32.43773270800011s
[COMPLETE] ● https://www.coupang.com/np/categories/420186... | Status: True | Total: 36.77s
Extracted items: [{'product_name': '보수용품', 'price': '..'}, {'product_name': '공구/자재', 'price': '..'}, {'product_name': '인테리어소품', 'price': '..'}, {'product_name': '인테리어도서', 'price': '..'}, {'product_name': '건전지/멀티탭', 'price': '..'}, {'product_name': '공구/철물/DIY', 'price': '..'}, {'product_name': '전동/정밀공구', 'price': '..'}, {'product_name': '수공구/절단도구', 'price': '..'}, {'product_name': '공구함', 'price': '..'}, {'product_name': '측정/측량도구', 'price': '..'}, {'product_name': '배관/건축자재', 'price': '..'}, {'product_name': '사다리/운반용품', 'price': '..'}, {'product_name': '모터/에어공구', 'price': '..'}, {'product_name': '나사/못/

In [None]:
# 필요한 플로우
# 1. 사용자가 URL을 넣는다.
# 2. 사용자가 URL을 넣으면, 관련 URL을의 html tag들을 가져온다. 가져온 URL을 document에 넣나?
# 3. 사용자가 특정 부분을 요청하면 관련 부분을 가져오는 코드를 작성한다.
# 4. 코드를 작성하면 그것을 실행시켜서 관련 부분을 가져온다.