# Crawl web page and extract information

- 웹페이지 주소를 입력해서 텍스트 데이터만 추출
- 텍스트 데이터에서 LLM 을 이용하여 정보를 추출
- 다른 작업에 사용할 수 있도록 jsonl 형태로 내보내기

---

# 0. Setup

In [1]:
!pip -q install -U requests beautifulsoup4 trafilatura tiktoken langchain

In [46]:
import json
import textwrap
from pathlib import Path
from datetime import datetime

# web scraping
import requests
import trafilatura
from bs4 import BeautifulSoup

# sagemaker inference
import boto3
from sagemaker.session import Session
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# splitter
import tiktoken
from langchain.text_splitter import TokenTextSplitter

In [139]:
# Pages to crawl; the crawl cell further down calls parse() on each entry in order.
urls = [
    'https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/acl.html',
    'https://www.imdb.com/title/tt3748528/plotsummary/',
    'https://starwars.fandom.com/wiki/Andor_(television_series)',
]
# Bare expression: shown as the cell output (the number of URLs).
len(urls)

3

In [96]:
def fallback_parse(response_content):
    """Pull the text nodes out of raw HTML with BeautifulSoup.

    Every text node in the document is kept unless its direct parent tag is
    one of the names in ``skipped_parents``. Each kept node is followed by a
    single space; tab characters are then removed and the result is stripped
    of leading/trailing whitespace.

    Args:
        response_content: HTML markup (as accepted by ``BeautifulSoup``).

    Returns:
        The concatenated text as a single string.
    """
    # Parent tag names whose text is dropped.
    skipped_parents = (
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    )

    document = BeautifulSoup(response_content, 'html.parser')
    pieces = [
        '{} '.format(node)
        for node in document.find_all(string=True)
        if node.parent.name not in skipped_parents
    ]

    joined = ''.join(pieces)
    return joined.replace('\t', '').strip()

In [97]:
def parse(url, timeout=30):
    """Download a web page and return its main text.

    The page is first fetched and extracted with trafilatura. When that
    yields nothing, the page is requested again with ``requests`` and the
    text is taken from the raw HTML via ``fallback_parse``.

    Args:
        url: Address of the page to crawl.
        timeout: Seconds to wait for the fallback HTTP request before giving
            up. Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        The extracted text, or ``None`` when the fallback request answers
        with a status code other than 200.

    Raises:
        Exception: Any error raised by the fallback request or fallback
            parsing is printed and re-raised unchanged.
    """
    print(f'parse url: {url}...')
    downloaded = trafilatura.fetch_url(url)

    # Only run the extractor when the download produced something; an empty
    # download goes straight to the fallback path below.
    contents = None
    if downloaded:
        contents = trafilatura.extract(
            downloaded, output_format="json",
            include_comments=False, include_links=False, with_metadata=True,
            date_extraction_params={'extensive_search': True, 'original_date': True},
        )

    if contents:
        json_output = json.loads(contents)
        return json_output['text']

    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 200:
            return fallback_parse(resp.content)
        return None
    except Exception as e:
        print(e)
        # Bare raise keeps the original traceback intact.
        raise

# 1. Crawl url and extract texts

- trafilatura 를 이용하여 url 에서 텍스트만 추출

In [7]:
%%time

# Crawl every URL and collect the extracted text of the pages that parsed.
docs = []
for url in urls:
    doc = parse(url)
    # Test for failure before touching the result: parse() can return None,
    # and calling len() on None raises TypeError.
    if doc is None:
        print(f'failed to parse url: {url}')
        continue
    print(f'len: {len(doc)}\n')
    docs.append(doc)

# NOTE: a skipped URL makes docs shorter than urls, so docs[i] is then no
# longer guaranteed to come from urls[i].
print(f'num docs: {len(docs)}')

parse url: https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/acl.html...
len: 9100

parse url: https://www.imdb.com/title/tt3748528/plotsummary/...
len: 16040

parse url: https://starwars.fandom.com/wiki/Andor_(television_series)...
len: 4801

num docs: 3
CPU times: user 1.55 s, sys: 63.1 ms, total: 1.61 s
Wall time: 4.32 s


In [24]:
# Print a one-line preview of every crawled document (at most 70 characters,
# whitespace collapsed by textwrap.shorten).
for position, body in enumerate(docs):
    preview = textwrap.shorten(body, placeholder=' ...\n', width=70)
    print(f'doc {position}:\n{preview}')

doc 0:
Anti-corruption layer pattern Intent The anti-corruption layer ...

doc 1:
- In a time of conflict, a group of unlikely heroes band together ...

doc 2:
- "We've all done terrible things on behalf of the Rebellion. ...



# 2. Extract information

- [Sagemaker Jumpstart](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/) 를 이용하여 Llama2 배포
- Llama2 를 이용하여 정보를 추출한다

---

## Sagemaker Endpoint 설정

- Llama2 70B chat 모델을 us-east-1 리전에 배포했다고 가정한다.

In [98]:
# AWS profile name handed to boto3.Session below; None is passed through as-is
# (presumably selecting boto3's default credential lookup — confirm).
profile_name = None
# AWS region handed to boto3.Session below; the notes above assume the
# Llama 2 endpoint is deployed in us-east-1.
region = 'us-east-1'

In [99]:
# One boto3 session carries the profile and region chosen above.
boto_session = boto3.Session(
    profile_name=profile_name,
    region_name=region,
)
# Create the SageMaker client from that session. A module-level
# boto3.client('sagemaker') uses boto3's default session instead, so it would
# ignore profile_name and region.
smclient = boto_session.client('sagemaker')
smsession = Session(boto_session=boto_session, sagemaker_client=smclient)

In [100]:
# Name of the SageMaker endpoint that the Predictor below sends requests to.
endpoint_name = 'jumpstart-dft-meta-textgeneration-llama-2-70b-f'

In [101]:
# Client for the endpoint: requests are serialized to JSON and responses are
# deserialized from JSON.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=smsession,
    serializer=JSONSerializer(content_type='application/json'),
    deserializer=JSONDeserializer(accept='application/json'),
)

- 청크로 나누고 정보를 추출한다.
- 프롬프트에서 JSON 형태로 결과를 출력하도록 유도한다.
- 누락되면 안 되는 정보를 누락하지 않도록, Chain of thought 방식으로 표현할 정보를 조정한다.

In [130]:
# System message sent with every chunk (see _chunk_inputs). It asks the model
# for keywords, entity names and a short summary, returned as one JSON object
# with the keys "keywords", "entities" and "summary".
# A trailing backslash inside the literal joins that line with the next one;
# .strip() removes the leading/trailing newlines of the triple-quoted string.
system_prompt = """
Extract relevant information from the user text to build a topic model. \
The user text is enclosed in tripple backticks (```).
Only output the JSON object, with nothing else.

Follow these steps to extract information from the user text.

Step 1: List informative keywords that helps to understand the text.

Step 2: If the text contains informative name of entities, List them.

Step 3: Provide summary of the text in 50 words. \
The summary should containing as many extracted informations in the previous step as possible. \
The information must not contain any code. \
Do not provide any sample code in the information.

Provide response as a JSON object with the following schema:
{"keywords": <step 1 reasoning>, "entities": <step 2 reasoning>, "summary": <step 3 reasoning>}
""".strip()

def _chunk_inputs(chunk):
    """Wrap one text chunk as a batch holding a single two-message dialog.

    The first message is the system message built from the module-level
    ``system_prompt``; the second is the user message, i.e. the chunk
    enclosed in triple backticks.

    Args:
        chunk: Text to be analysed.

    Returns:
        A list containing one dialog, itself a list of two message dicts.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": f'```{chunk}```'}
    dialog = [system_message, user_message]
    return [dialog]

def extract_info(chunk):
    """Send one text chunk to the endpoint and decode the JSON it replies with.

    Args:
        chunk: Text to extract information from.

    Returns:
        The decoded JSON object as a dict, or ``None`` when the reply is not
        valid JSON or is valid JSON but not an object. In both failure cases
        the raw reply is printed so it can be inspected.
    """
    payload = {
        "inputs": _chunk_inputs(chunk),
        "parameters": {
            "max_new_tokens": 1024,
            "top_p": 0.9,
            "temperature": 0.01,
        }
    }
    response = predictor.predict(payload, custom_attributes='accept_eula=true')
    resp_text = response[0]['generation']['content']

    # Catch only decoding problems; a bare except would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    try:
        info = json.loads(resp_text)
    except (json.JSONDecodeError, TypeError):
        print(resp_text)
        return None

    # Callers index the result by key, so a JSON list/string/number reply is
    # treated as invalid output too.
    if not isinstance(info, dict):
        print(resp_text)
        return None
    return info

In [133]:
%%time

# tiktoken model name used for counting tokens and splitting text.
# NOTE(review): requests go to a Llama 2 endpoint, so these counts are
# presumably only an approximation of what that model sees — confirm.
model_name = 'gpt-3.5-turbo'
# The encoding depends only on model_name, so look it up once instead of once
# per document.
encoding = tiktoken.encoding_for_model(model_name)

doc_infos = []
for doc_id, doc in enumerate(docs):
    # Aim for roughly 6 chunks per document, but never go below 512 tokens
    # per chunk.
    num_tokens = len(encoding.encode(doc))
    chunk_size = max(512, num_tokens // 6)
    print(f'start extracting info from doc_id: {doc_id}, length: {len(doc)}, num_tokens: {num_tokens}, chunk_size: {chunk_size}')

    # The splitter is rebuilt per document because chunk_size varies.
    splitter = TokenTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=20,
    )
    # Results for this document, one entry per successfully parsed chunk.
    each_info = []
    for chunk_idx, chunk in enumerate(splitter.split_text(doc)):
        print(f'chunk-{chunk_idx}({len(chunk)}), ', end='')
        info = extract_info(chunk)
        if info is None:
            # extract_info() already printed the raw reply.
            print('invalid output. ', end='')
        else:
            each_info.append(info)
    print(f'chunks: {len(each_info)}...')

    doc_infos.append(each_info)

start extracting info from doc_id: 0, length: 9100, num_tokens: 1723, chunk_size: 512
chunk-0(2810), chunk-1(2810), chunk-2(2514), chunk-3(1257), chunks: 4...
start extracting info from doc_id: 1, length: 16040, num_tokens: 3695, chunk_size: 615
chunk-0(2610), chunk-1(2538), chunk-2(2590), chunk-3(2759), chunk-4(2756), chunk-5(2777), chunk-6(530), chunks: 7...
start extracting info from doc_id: 2, length: 4801, num_tokens: 1098, chunk_size: 512
chunk-0(2334), chunk-1(2205), chunk-2(412), chunks: 3...
CPU times: user 189 ms, sys: 25.3 ms, total: 215 ms
Wall time: 3min 21s


# 3. Save to file

- 각 청크 단위의 정보를 하나로 합쳐서 저장

In [142]:
def _as_list(value):
    """Return a model-produced field as a list (None -> [], scalar -> [scalar])."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare string must not be passed to list.extend(): that would add it
    # character by character.
    return [value]


def _unique(items):
    """Drop duplicates, keeping first-seen order; also works for unhashable items."""
    seen = set()
    unique_items = []
    for item in items:
        # The JSON text is used as the identity key, so dict/list items can
        # be compared without being hashable.
        key = json.dumps(item, sort_keys=True, default=str)
        if key not in seen:
            seen.add(key)
            unique_items.append(item)
    return unique_items


# Merge the per-chunk results of each document into one record and write the
# records as JSON Lines (one JSON object per line).
now = datetime.now().strftime('%Y%m%dT%H%M')

# Records take their url from urls[idx]; that pairing is only right when no
# URL was dropped between crawling and extraction.
if len(doc_infos) != len(urls):
    print(
        f'warning: {len(doc_infos)} documents but {len(urls)} urls; '
        'the url written to each record may not match its document'
    )

# Explicit encoding so the file does not depend on the platform default.
with open(f'./dataset-{now}.jsonl', 'w', encoding='utf-8') as fp:
    for idx, doc_info in enumerate(doc_infos):
        keywords = []
        entities = []
        summary = []
        for el in doc_info:
            if not isinstance(el, dict):
                continue
            # .get(): the model may omit a field for some chunk.
            keywords.extend(_as_list(el.get('keywords')))
            entities.extend(_as_list(el.get('entities')))
            if el.get('summary'):
                summary.append(str(el['summary']))
        record = {
            'keywords': _unique(keywords),
            'entities': _unique(entities),
            'summary': '\n'.join(summary),
            'url': urls[idx],
        }
        fp.write(f'{json.dumps(record)}\n')