# Crawl web page and extract information

- 웹페이지 주소를 입력해서 텍스트 데이터만 추출
- 텍스트 데이터에서 LLM 을 이용하여 정보를 추출
- 다른 작업에 사용할 수 있도록 jsonl 형태로 내보내기

---

# 0. Setup

In [1]:
!pip -q install -U requests beautifulsoup4 trafilatura tiktoken langchain

In [17]:
import json
import textwrap
from pathlib import Path
from datetime import datetime

# web scraping
import requests
import trafilatura
from bs4 import BeautifulSoup

# sagemaker inference
import boto3
from sagemaker.session import Session
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# splitter
import tiktoken
from langchain.text_splitter import TokenTextSplitter

In [18]:
# Pages to crawl. Commented-out entries are skipped; uncomment them to
# include those pages in the run.
urls = [
#    'https://www.imdb.com/title/tt0111161/plotsummary/',
#    'https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/acl.html',
#    'https://starwars.fandom.com/wiki/Andor_(television_series)',
    'https://en.wikipedia.org/wiki/Lee_Byung-hun',
]
# Last expression of the cell: shows how many URLs are active.
len(urls)

1

In [19]:
def fallback_parse(response_content):
    """Extract the visible text of an HTML document with BeautifulSoup.

    Every text node is kept except those whose direct parent tag is
    non-content markup (scripts, styles, metadata and similar).

    Args:
        response_content: The HTML document to parse.

    Returns:
        The kept text nodes, each followed by a single space, with tab
        characters removed and leading/trailing whitespace stripped.
    """
    # Parent tag names whose text is never page content. A set gives O(1)
    # membership tests inside the loop below.
    skipped_parents = frozenset({
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    })

    soup = BeautifulSoup(response_content, 'html.parser')
    # join() assembles the text in one pass instead of repeated `+=`.
    cleaned_text = ''.join(
        f'{node} '
        for node in soup.find_all(string=True)
        if node.parent.name not in skipped_parents
    )
    return cleaned_text.replace('\t', '').strip()

In [20]:
def parse(url, timeout=30):
    """Download ``url`` and return its text content.

    trafilatura is tried first. When it yields nothing, the page is fetched
    with ``requests`` and handed to ``fallback_parse``.

    Args:
        url: Address of the page to crawl.
        timeout: Seconds to wait for the fallback HTTP request before
            giving up, so an unresponsive server cannot block the run.

    Returns:
        The extracted text, or ``None`` when the fallback request does not
        answer with HTTP 200.

    Raises:
        requests.RequestException: If the fallback HTTP request fails; the
            error is printed before being re-raised.
    """
    print(f'parse url: {url}...')
    downloaded = trafilatura.fetch_url(url)

    contents = None
    # Only run the extractor when the download actually produced something.
    if downloaded:
        contents = trafilatura.extract(
            downloaded, output_format="json",
            include_comments=False, include_links=False, with_metadata=True,
            date_extraction_params={'extensive_search': True, 'original_date': True},
        )

    if contents:
        return json.loads(contents)['text']

    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        raise  # bare raise keeps the original traceback

    if resp.status_code != 200:
        return None
    return fallback_parse(resp.content)

# 1. Crawl url and extract texts

- trafilatura 를 이용하여 url 에서 텍스트만 추출

In [21]:
%%time

# Crawl every URL and keep the text of those that could be parsed.
docs = []
for url in urls:
    doc = parse(url)
    # parse() can return None; check before calling len() on the result.
    if doc is None:
        print(f'failed to parse url: {url}')
        continue
    print(f'len: {len(doc)}\n')
    docs.append(doc)
    # NOTE(review): a skipped URL makes `docs` shorter than `urls`, so
    # positions in `docs` stop matching positions in `urls`.

print(f'num docs: {len(docs)}')

parse url: https://en.wikipedia.org/wiki/Lee_Byung-hun...
len: 37411

num docs: 1
CPU times: user 261 ms, sys: 11.4 ms, total: 272 ms
Wall time: 388 ms


In [22]:
# Print a one-line preview (at most 70 characters) of each crawled document.
for doc_index, document in enumerate(docs):
    preview = textwrap.shorten(document, width=70, placeholder=' ...\n')
    print(f'doc {doc_index}:\n{preview}')

doc 0:
Lee Byung-hun Lee Byung-hun |Born||July 12, 1970| Seoul, South ...



In [23]:
docs

['Lee Byung-hun\nLee Byung-hun\n|Born||July 12, 1970|\nSeoul, South Korea\n|Education||Hanyang University|\n|Occupation||Actor|\n|Years active||1991–present|\n|Agents|\n|Spouse||\n|\n(m. 2013)\n|Children||1|\n|Korean name|\n|Hangul|\n|Hanja|\n|Revised Romanization||I Byeong(-)heon|\n|McCune–Reischauer||Yi Pyŏnghŏn|\nLee Byung-hun (Korean: 이병헌; born July 12, 1970[1]) is a South Korean actor. He has received critical acclaim for his work in a wide range of genres, most notably Joint Security Area (2000); A Bittersweet Life (2005); The Good, the Bad, the Weird (2008); I Saw the Devil (2010); Masquerade (2012); and the television series Iris (2009) and Mr. Sunshine (2018). His critically acclaimed film Inside Men (2015) won him the Best Actor prize in three prestigious award ceremonies: 52nd Baeksang Art Awards, 37th Blue Dragon Awards and 53rd Grand Bell Awards. Lee has seven films—Joint Security Area, The Good, the Bad, the Weird, Masquerade, Inside Men, Master, Ashfall and The Man Stand

# 2. Extract information

- [Sagemaker Jumpstart](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/) 를 이용하여 Llama2 배포
- Llama2 를 이용하여 정보를 추출한다

---

## Sagemaker Endpoint 설정

- Llama2 70B chat 모델을 us-east-1 리전에 배포했다고 가정한다.

In [24]:
# AWS profile and region handed to boto3.Session in the next cell.
profile_name = None
region = 'us-east-1'

In [25]:
# One boto3 session carries the configured profile and region.
boto_session = boto3.Session(
    profile_name=profile_name,
    region_name=region,
)
# Create the SageMaker client from that session; the module-level
# boto3.client() would use the default session and ignore the settings above.
smclient = boto_session.client('sagemaker')
smsession = Session(boto_session=boto_session, sagemaker_client=smclient)

In [26]:
# Name of the SageMaker endpoint the Predictor below is bound to.
endpoint_name = 'jumpstart-dft-meta-textgeneration-llama-2-70b-f'

In [27]:
# Endpoint client that sends and receives JSON.
json_serializer = JSONSerializer(content_type='application/json')
json_deserializer = JSONDeserializer(accept='application/json')
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=smsession,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

- 청크로 나누고 정보를 추출한다.
- 프롬프트에서 JSON 형태로 결과를 출력하도록 유도한다.
- 누락되면 안되는 정보를 누락하지 않도록, Chain of thought 방식으로 표현할 정보를 조정한다.

In [31]:
# System prompt sent with every chunk. It asks the model for a JSON object
# with "keywords", "entities" and "summary". A trailing backslash inside the
# literal joins that line with the next one; .strip() removes the leading and
# trailing newlines.
system_prompt = """
Extract relevant information from the user text to build a topic model. \
The user text is enclosed in tripple backticks (```).
Only output the JSON object, with nothing else. Make sure JSON output is enclosed a braces ({}).

Follow these steps to extract information from the user text.

Step 1: List informative keywords that helps to understand the text.

Step 2: If the text contains informative name of entities, List them.

Step 3: Provide summary of the text in about 50 words. \
The summary should containing as many keywords and entities in the previous step as possible. \
The information must not contain any code. Do not provide any sample code in the information.

Provide response as a JSON object with the following schema:
{"keywords": <step 1 reasoning>, "entities": <step 2 reasoning>, "summary": <step 3 reasoning>}
""".strip()

def _chunk_inputs(chunk):
    """Build the chat input for one chunk.

    Returns a list holding a single dialog: the system prompt followed by a
    user turn with the chunk wrapped in triple backticks.
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": f'```{chunk}```'}
    dialog = [system_turn, user_turn]
    return [dialog]

def extract_info(chunk):
    """Send one text chunk to the endpoint and parse the reply as JSON.

    Args:
        chunk: Text to extract information from.

    Returns:
        The value decoded from the model's reply, or ``None`` when the reply
        is not valid JSON (the raw reply is printed in that case).
    """
    payload = {
        "inputs": _chunk_inputs(chunk),
        "parameters": {
            "max_new_tokens": 2048,
            "top_p": 0.9,
            "temperature": 0.01,
        }
    }
    response = predictor.predict(payload, custom_attributes='accept_eula=true')
    resp_text = response[0]['generation']['content']
    try:
        return json.loads(resp_text)
    except (json.JSONDecodeError, TypeError):
        # Catch only parsing failures; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        print(resp_text)
        return None

In [32]:
%%time

# Tokenizer used only to measure documents and size the chunks.
model_name = 'gpt-3.5-turbo'
# Loop-invariant: the encoding depends only on model_name.
encoding = tiktoken.encoding_for_model(model_name)

doc_infos = []
for doc_id, doc in enumerate(docs):
    # Documents over 512 tokens are split into roughly 8 chunks (the
    # remainder and overlap can add one more); shorter ones stay whole.
    num_tokens = len(encoding.encode(doc))
    chunk_size = num_tokens // 8 if num_tokens > 512 else max(num_tokens, 1)
    # The splitter rejects an overlap larger than the chunk size, which a
    # fixed 20 would be for very short documents.
    chunk_overlap = min(20, chunk_size // 2)
    print(f'start extracting info from doc_id: {doc_id}, length: {len(doc)}, num_tokens: {num_tokens}, chunk_size: {chunk_size}')

    splitter = TokenTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    each_info = []
    for chunk_idx, chunk in enumerate(splitter.split_text(doc)):
        print(f'chunk-{chunk_idx}({len(chunk)}), ', end='')
        info = extract_info(chunk)
        if info is None:
            print('invalid output. ', end='')
        else:
            each_info.append(info)
    print(f'chunks: {len(each_info)}...')

    # Appended even when empty so doc_infos stays index-aligned with docs.
    doc_infos.append(each_info)

start extracting info from doc_id: 0, length: 37411, num_tokens: 11492, chunk_size: 1436
chunk-0(5734), chunk-1(5984), chunk-2(5447), chunk-3(4579), chunk-4(4741), chunk-5(3894), chunk-6(3493), chunk-7(3367), chunk-8(695), chunks: 9...
CPU times: user 144 ms, sys: 19.2 ms, total: 163 ms
Wall time: 2min 56s


In [33]:
doc_infos[0][-1]

{'keywords': ['Paeksang Arts Award',
  'Best Actor',
  'Blue Dragon Film Awards',
  'Grand Bell Awards',
  'Hanyang University',
  'Living people',
  'Male actors',
  'Seoul',
  'South Korean',
  'film actors',
  'television actors',
  'expatriates',
  'United States',
  '21st-century',
  '20th-century',
  'Asian Film Award',
  'taekwondo practitioners',
  'Asia Pacific Screen Award',
  'Grand Prize Paeksang Arts Award'],
 'entities': ['HanCinema', 'South Korea'],
 'summary': 'HanCinema is a South Korean actor who has won various awards for his performances in film and television, including Best Actor at the Paeksang Arts Awards, Blue Dragon Film Awards, and Grand Bell Awards. He has also won awards for his work in the 21st and 20th centuries and is known for his taekwondo skills.'}

# 3. Save to file

- 각 청크단위의 정보를 하나로 합쳐서 저장

In [34]:
def _as_list(value):
    """Normalise a model-produced field to a list (None -> [], scalar -> [scalar])."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


# Write one JSON line per document, merging the per-chunk results.
now = datetime.now().strftime('%Y%m%dT%H%M')
# utf-8 together with ensure_ascii=False keeps non-ASCII text readable in the
# file instead of \uXXXX escapes.
with open(f'./dataset-{now}.jsonl', 'w', encoding='utf-8') as fp:
    for idx, doc_info in enumerate(doc_infos):
        keywords = []
        entities = []
        summary = []
        for el in doc_info:
            # Model output is not validated upstream, so tolerate replies
            # that are not objects or that miss a field.
            if not isinstance(el, dict):
                continue
            keywords.extend(_as_list(el.get('keywords')))
            entities.extend(_as_list(el.get('entities')))
            if el.get('summary'):
                summary.append(str(el['summary']))
        D = {
            # dict.fromkeys removes duplicates while keeping first-seen
            # order, so the output is the same on every run.
            'keywords': list(dict.fromkeys(keywords)),
            'entities': list(dict.fromkeys(entities)),
            'summary': '\n'.join(summary),
            # NOTE(review): assumes doc_infos[idx] belongs to urls[idx]; that
            # breaks if any URL was skipped while crawling.
            'url': urls[idx],
        }
        fp.write(f'{json.dumps(D, ensure_ascii=False)}\n')