# ClaudeCrawl example

- Claude 를 이용하여 [Firecrawl](https://firecrawl.dev) 와 비슷하게 동작하도록 하는 ClaudeCrawl 구현
- DOM 구조를 몰라도 의미적으로 필요한 정보를 가져올 수 있다.

## Install Deps

In [69]:
!pip -q install -U requests langchain langchain_aws

In [1]:
import os
import re

import requests
from bs4 import BeautifulSoup, Tag
from pydantic import BaseModel, Field

from langchain_core.prompts import ChatPromptTemplate
from langchain_aws.chat_models import ChatBedrockConverse

In [105]:
# Model selection and generation settings consumed by the ChatBedrockConverse
# constructor in the next cell.
# Alternative model: "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
model_id = "us.anthropic.claude-3-5-haiku-20241022-v1:0"
aws_profile_name = None
temperature = 0.1
max_tokens = 2 * 1024

In [106]:
# Chat model client built from the model id, AWS profile, temperature and
# token limit configured above.
llm = ChatBedrockConverse(
    model=model_id,
    credentials_profile_name=aws_profile_name,
    temperature=temperature,
    max_tokens=max_tokens,
)

In [107]:
class Article(BaseModel):
    """Tactic article schema, contains the details of the how to play to win with a specific LOL champion"""

    # NOTE: the first positional argument of pydantic's Field() is the field's
    # *default value*, not its description. The descriptions are therefore
    # passed by keyword, which keeps every field required and puts the text
    # into the generated schema.
    title: str = Field(description="the title of the article")
    url: str = Field(description="the url link to the article")
    season: int = Field(description="the LOL season number for the article")
    published_at: str = Field(description="published date of the article, RFC 3339 format")

class ExtractSchema(BaseModel):
    """Schema to extract the articles for the tactics from the page"""

    # Field()'s first positional argument is the default value; pass the text
    # as `description=` so the field is required and does not silently default
    # to a plain string instead of a list.
    articles: list[Article] = Field(description="list of the Tactic Article objects in the page")

In [108]:
# Model wrapper bound to ExtractSchema as its structured output type; this is
# the object that extract() invokes.
llm_with_output = llm.with_structured_output(ExtractSchema)

In [109]:
# Page to scrape. There is nothing to interpolate, so a plain string literal is used.
url = "https://lol.inven.co.kr/dataninfo/champion/manualTool.php?confirm=2"
url

'https://lol.inven.co.kr/dataninfo/champion/manualTool.php?confirm=2'

In [110]:
# System prompt used by extract(): tells the model to analyze the page HTML,
# group links by similarity, and never invent links that are not in the page.
# The text is sent to the model verbatim (after .strip()), so any edit to it
# changes model behavior.
# NOTE(review): the last "HTML Analysis" bullet ends with ", remeber." -- looks
# like a typo / truncated sentence; confirm the intended wording before editing.
SYSTEM_PROMPT = """
You are tasked with scraping information from a web page and extracting specific details based on a given output schema. 
Your task is to carefully read and analyze the content of this web page, and then extract information according to the provided output schema.

## Instruction

1. To begin, thoroughly read and analyze the entire web page. \
Pay attention to all sections, including headers, paragraphs, lists, tables, and any other relevant elements. \
Take note of the overall structure and organization of the content.
2. As you analyze the page, identify information that matches the fields specified in the output schema. \
Be thorough and precise in your extraction.

## HTML Analysis
- Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field.
- Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
- Note any nested structures or relationships between elements that are relevant to the data extraction task.
- Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
- Recommend the specific strategy to use for scraping the content, remeber.

## Data Analysis
- List out all the links in the page, to make a group by their similarity.
- Meaningful data has a tendency to be around a link url, such as `a` tag.
- Article links tends to have similar link url, `href` prop, which out numbers the most of the links in the page.

## Link Extraction
- Do not create any of links, if the content has no link for the schema. \
In that case, just respond with empty string.

Begin your scraping process now, and provide the extracted information in the format specified above.
Let's think step by step.
""".strip()

In [114]:
def flatten_html(soup):
    """
    Collapse empty and needlessly nested tags in a parsed HTML tree.

    Whitespace-only text nodes are dropped first. Then, bottom-up, every tag
    that is empty, or that merely wraps a single child tag of the same name,
    is unwrapped (the tag itself is removed, its children stay in place).
    The tree is modified in place.

    Args:
        soup: Parsed BeautifulSoup tree (for example the return value of
            clean_html), not an HTML string.

    Returns:
        str: The flattened tree serialized back to an HTML string.
    """
    def should_remove_tag(tag):
        # A tag is removable when:
        # 1. it has no contents at all,
        # 2. its only child is a tag with the same name, or
        # 3. it holds neither visible text nor any descendant tag.
        if not tag.contents:
            return True

        if len(tag.contents) == 1:
            child = tag.contents[0]
            if isinstance(child, type(tag)) and child.name == tag.name:
                return True

        return not tag.get_text(strip=True) and not tag.find_all()

    def clean_tag(tag):
        # Post-order walk: clean the children first so that the decision for
        # `tag` is made on its already-flattened contents.
        for child in tag.find_all(recursive=False):
            clean_tag(child)

        # The root document object has no parent and cannot be unwrapped
        # (unwrapping it fails), which used to crash on an empty page.
        if tag.parent is not None and should_remove_tag(tag):
            tag.unwrap()

    # Drop whitespace-only text nodes. `string=` is the current name of the
    # deprecated `text=` filter argument.
    for element in soup.find_all(string=True):
        if not element.strip():
            element.extract()

    # Flatten the nested tags.
    clean_tag(soup)

    # Return the flattened HTML.
    return str(soup)


def extract(html: str):
    """
    Ask the structured-output model to pull the tactic articles out of a page.

    Args:
        html: HTML text of the page; it is substituted into the human message.

    Returns:
        Whatever llm_with_output.invoke returns for the rendered prompt.
    """
    print("extract outputs...")
    human_message = (
        "Here is the web page content.\n```html\n{html}\n```\n\n"
        "Please extract all the champion tactic articles from the page, 30 articles are placed in the table tag."
    )
    template = ChatPromptTemplate([
        ("system", SYSTEM_PROMPT),
        ("human", human_message),
    ])
    rendered_prompt = template.invoke({"html": html})
    return llm_with_output.invoke(rendered_prompt)


def clean_html(html_content):
    """
    Parse HTML and strip everything that is irrelevant for content extraction.

    Non-content tags (scripts, styles, media, ...) are removed entirely, all
    attributes are dropped except a small allow-list per tag, and whitespace
    inside text-only elements is collapsed to single spaces.

    Args:
        html_content: Raw HTML markup to parse.

    Returns:
        BeautifulSoup: The cleaned tree (not a string).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Tags that are removed together with their contents.
    # (A missing comma after 'iframe' used to merge it with 'link' into the
    # bogus name 'iframelink', so neither tag was actually removed.)
    tags_to_remove = [
        'head',
        'script',
        'style',
        'noscript',
        'svg',
        'meta',
        'iframe',
        'link',
        'video',
        'audio',
    ]

    # Attributes to keep, per tag name; every other attribute is dropped.
    keep_attributes = {
        'a': ['href', 'title'],
        'div': ['id'],
        'span': ['id'],
    }

    # Remove the unwanted tags.
    for tag in tags_to_remove:
        for element in soup.find_all(tag):
            element.decompose()

    # Walk every remaining element and strip the attributes not allow-listed.
    for element in soup.find_all():
        allowed_attrs = keep_attributes.get(element.name, ())
        element.attrs = {k: v for k, v in element.attrs.items() if k in allowed_attrs}

        # Collapse whitespace, but only when the element's single child is a
        # text node. `element.string` also resolves through a lone child *tag*,
        # so assigning to it on e.g. <td><a href=...>x</a></td> would replace
        # the <a> (and its href) with plain text.
        if len(element.contents) == 1 and not isinstance(element.contents[0], Tag):
            text = element.contents[0]
            if text:
                element.string = ' '.join(text.split())

    return soup


def scrape_url(url):
    """
    Download a page, clean and flatten its HTML, and extract the articles.

    Args:
        url: Address of the page to scrape.

    Returns:
        The result of extract() for the flattened page HTML.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    print(f"scrape url: {url}...")
    resp = requests.get(url, timeout=5)
    # Fail fast on HTTP errors instead of sending an error page to the LLM.
    resp.raise_for_status()
    downloaded = resp.content.decode('utf-8')
    cleaned_soup = clean_html(downloaded)
    flat_html = flatten_html(cleaned_soup)
    return extract(flat_html)

In [115]:
%%time

# Run the whole pipeline (download, clean, flatten, LLM extraction) on the
# target URL and keep the structured result for inspection below.
response = scrape_url(url)

scrape url: https://lol.inven.co.kr/dataninfo/champion/manualTool.php?confirm=2...
extract outputs...
CPU times: user 232 ms, sys: 8.58 ms, total: 241 ms
Wall time: 38.7 s


In [116]:
# Print how many articles were extracted, then display the extracted Article
# objects themselves as the cell result.
print(len(response.articles))
response.articles

29


[Article(title='[GM]AP 샤코 서폿 설명 길게 안함', url='manualToolView.php?idx=146545', season=14, published_at='2023-09-22'),
 Article(title='★(마스터) 시즌 완벽 적응 개사기 ..', url='manualToolView.php?idx=148044', season=14, published_at='2023-07-26'),
 Article(title='[GM1]프로 1군원딜들 피셜 근본 원딜..', url='', season=14, published_at='2024-03-16'),
 Article(title='M)시즌 5부터 딩거 한 유저의 공략', url='manualToolView.php?idx=148020', season=14, published_at='2024-02-05'),
 Article(title="'장인초대석 2회 출연 및 가렌 평점..", url='manualToolView.php?idx=148003', season=13, published_at='2024-01-09'),
 Article(title='[M1] 30대중반도 마스터가는잭스공략', url='manualToolView.php?idx=147904', season=13, published_at='2024-01-07'),
 Article(title='M300) 트페 공략', url='', season=13, published_at='2023-12-30'),
 Article(title='M1 ) 시즌말에 쓰는 트린다미어 공략', url='', season=13, published_at='2023-10-21'),
 Article(title='케이틀린', url='manualToolView.php?idx=147527', season=13, published_at='2023-10-06'),
 Article(title='선제공격 (원형낫 카직스)', url='manualToolView.php?idx=14662