# Crawl web page and extract information

- 웹페이지 주소를 입력해서 텍스트 데이터만 추출
- 텍스트 데이터에서 LLM 을 이용하여 정보를 추출
- 다른 작업에 사용할 수 있도록 jsonl 형태로 내보내기

---

# 0. Setup

In [3]:
!pip -q install -U boto3 awscli requests beautifulsoup4 trafilatura langchain

In [7]:
import json
import textwrap
from pathlib import Path
from datetime import datetime

# web scraping
import requests
import trafilatura
from bs4 import BeautifulSoup

# bedrock
import json
import boto3

# splitter
import tiktoken
from langchain.text_splitter import TokenTextSplitter

In [8]:
urls = [
#    'https://www.imdb.com/title/tt0111161/plotsummary/',
#    'https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/acl.html',
    'https://en.wikipedia.org/wiki/Lee_Byung-hun',
]
len(urls)

1

In [9]:
def fallback_parse(response_content):
    soup = BeautifulSoup(response_content, 'html.parser')
    text = soup.find_all(string=True)
    cleaned_text = ''
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head', 
        'input',
        'script',
        'style',]

    for item in text:
        if item.parent.name not in blacklist:
            cleaned_text += '{} '.format(item)
            
    cleaned_text = cleaned_text.replace('\t', '')
    return cleaned_text.strip()

In [10]:
def parse(url):    
    print(f'parse url: {url}...')
    downloaded = trafilatura.fetch_url(url)

    contents = trafilatura.extract(
        downloaded, output_format="json",
        include_comments=False, include_links=False, with_metadata=True,
        date_extraction_params={'extensive_search': True, 'original_date': True},
    )
    
    if contents:
        json_output = json.loads(contents)
        return json_output['text']
    else:
        try:
            resp = requests.get(url)
            if resp.status_code == 200:
                return fallback_parse(resp.content)
            else:
                return None
        except Exception as e:
            print(e)
            raise e

# 1. Crawl url and extract texts

- trafilatura 를 이용하여 url 에서 텍스트만 추출

In [11]:
%%time

docs = []
for url in urls:
    doc = parse(url)
    print(f'len: {len(doc)}\n')
    if doc is None:
        print(f'failed to parse url: {url}')
        continue
    docs.append(doc)
    
print(f'num docs: {len(docs)}')

parse url: https://en.wikipedia.org/wiki/Lee_Byung-hun...
len: 37667

num docs: 1
CPU times: user 328 ms, sys: 21.9 ms, total: 349 ms
Wall time: 811 ms


In [12]:
for i, doc in enumerate(docs):
    text = textwrap.shorten(doc, width=70, placeholder=' ...\n')
    print(f'doc {i}:\n{text}')

doc 0:
Lee Byung-hun Lee Byung-hun |Born |July 12, 1970 Seoul, South ...



In [13]:
docs

['Lee Byung-hun\nLee Byung-hun\n|Born\n|July 12, 1970\nSeoul, South Korea\n|Education\n|Hanyang University\n|Occupation\n|Actor\n|Years active\n|1991–present\n|Agents\n|Spouse\n|\n(m. 2013)\n|Children\n|1\n|Korean name\n|Hangul\n|Hanja\n|Revised Romanization\n|I Byeong(-)heon\n|McCune–Reischauer\n|Yi Pyŏnghŏn\nLee Byung-hun (Korean: 이병헌; born July 12, 1970[1]) is a South Korean actor. He has received critical acclaim for his work in a wide range of genres, most notably Joint Security Area (2000); A Bittersweet Life (2005); The Good, the Bad, the Weird (2008); I Saw the Devil (2010); Masquerade (2012); and the television series Iris (2009) and Mr. Sunshine (2018). His critically acclaimed film Inside Men (2015) won him the Best Actor prize in three prestigious award ceremonies: 52nd Baeksang Art Awards, 37th Blue Dragon Awards and 53rd Grand Bell Awards. Lee has seven films—Joint Security Area, The Good, the Bad, the Weird, Masquerade, Inside Men, Master, Ashfall and The Man Standing Ne

# 2. Extract information
- Bedrock 을 통해 Claude2 모델을 us-east-1 리전에서 호출한다.
- 해당 호출은 사용당 과금이 된다.

In [20]:
profile_name = None
region = 'us-east-1'

In [23]:
session = boto3.Session(
    profile_name=profile_name,
    region_name=region,
)
bedrock = session.client(service_name='bedrock-runtime')

In [24]:
modelId = 'anthropic.claude-v2'
accept = 'application/json'
contentType = 'application/json'

In [25]:
instruction_prompt = """
Extract relevant information from the user text to build a topic model. \
The user text is enclosed in text tags, <text></text>.
Only output the JSON object, with nothing else. Make sure JSON output is enclosed a braces ({}).

Follow these steps to extract information from the user text.

Step 1: List informative keywords that helps to understand the text.

Step 2: If the text contains informative name of entities, List them.

Step 3: Provide summary of the text in about 50 words. \
The summary should containing as many keywords and entities in the previous step as possible. \
The information must not contain any code. Do not provide any sample code in the information.

Provide response as a JSON object with the following schema:
{"keywords": <step 1 reasoning>, "entities": <step 2 reasoning>, "summary": <step 3 reasoning>}
""".strip()

def _build_prompt(chunk):
    user_prompt = f"""
Here is the user's text.

<text>{chunk}</text>

Please extract the informations.
    """.strip()
    return f"""\n\nHuman: {instruction_prompt}\n\nHuman: {user_prompt}\n\n Assistant: """

def extract_info(chunk):
    body = json.dumps({
        "prompt": _build_prompt(chunk),
        "max_tokens_to_sample": 2048,
        "top_p": 0.9,
        "temperature": 0.2,
    })
    resp = bedrock.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)
    resp_text = resp.get('body').read()
    try:
        return json.loads(resp_text).get('completion')
    except Exception as e:
        print('error occured', e)
        print(resp_text)
        return None

In [26]:
%%time

model_name = 'gpt-3.5-turbo'

doc_infos = []
for doc_id, doc in enumerate(docs):
    # try to split into ~6 chunks
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(doc))
    chunk_size = num_tokens//8 if num_tokens > 512 else num_tokens
    print(f'start extracting info from doc_id: {doc_id}, length: {len(doc)}, num_tokens: {num_tokens}, chunk_size: {chunk_size}')
    
    splitter = TokenTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=20,
    )
    each_info = []
    for chunk_idx, chunk in enumerate(splitter.split_text(doc)):
        print(f'chunk-{chunk_idx}({len(chunk)}), ', end='')
        info = extract_info(chunk)
        if info is None:
            print(f'invalid output. ', end='')
        else:
            each_info.append(info)
    print(f'chunks: {len(each_info)}...')
    
    doc_infos.append(each_info)

start extracting info from doc_id: 0, length: 37667, num_tokens: 11572, chunk_size: 1446
chunk-0(5726), chunk-1(6024), chunk-2(5458), chunk-3(4603), chunk-4(4768), chunk-5(3939), chunk-6(3530), chunk-7(3380), chunk-8(695), chunks: 9...
CPU times: user 133 ms, sys: 21.4 ms, total: 154 ms
Wall time: 1min 41s


In [27]:
doc_infos[0][-1]

' Here is the extracted information from the user text in JSON format:\n\n{"keywords": ["Best Actor", "Paeksang Arts Award", "Blue Dragon Film Awards", "Grand Bell Awards", "New Actor", "1970 births", "Hanyang University alumni", "Living people", "Male actors from Seoul", "South Korean male film actors", "South Korean male television actors", "South Korean expatriates in the United States", "21st-century South Korean male actors","20th-century South Korean male actors", "Best Actor Asian Film Award winners", "South Korean male taekwondo practitioners", "Asia Pacific Screen Award winners", "Grand Prize Paeksang Arts Award (Film) winners"], "entities": ["Paeksang Arts Award", "Blue Dragon Film Awards", "Grand Bell Awards", "Hanyang University", "Seoul", "United States"], "summary": "The text discusses awards won by South Korean male actors in film and television, especially Best Actor awards from major Korean awards ceremonies like the Paeksang Arts Awards, Blue Dragon Film Awards, and G

# 3. Save to file

- 각 청크단위의 정보를 하나로 합쳐셔 저장

In [28]:
now = datetime.now().strftime('%Y%m%dT%H%M')
with open(f'./dataset-{now}.jsonl', 'w') as fp:
    for idx, doc_info in enumerate(doc_infos):
        keywords = []
        entities = []
        summary = []
        for el in doc_info:
            keywords.extend(el['keywords'])
            entities.extend(el['entities'])
            summary.append(el['summary'])
        D = {
            'keywords': list(set(keywords)),
            'entities': list(set(entities)),
            'summary': '\n'.join(summary),
            'url': urls[idx],
        }
        fp.write(f'{json.dumps(D)}\n')

TypeError: string indices must be integers, not 'str'