# Baseline Scraper
- 웹페이지 주소를 입력해서 텍스트만 추출한다.

In [11]:
!pip -q install -U requests playwright

In [12]:
# Download the browser binaries that Playwright drives.
# NOTE(review): only Chromium is launched in the visible cells; if no other
# browser is used elsewhere, `playwright install chromium` would be enough.
!playwright install

In [11]:
import os
import json

# web scraping
import requests
from playwright.async_api import async_playwright

from bs4 import BeautifulSoup

In [12]:
# Resource types to block: a request whose resource_type is listed here is aborted.
# NOTE(review): confirm every entry is a value Playwright actually reports
# for `request.resource_type`; entries that never occur are simply inert.
BLOCK_RESOURCE_TYPES = [
    'beacon', 'csp_report', 'font', 'image',
    'imageset', 'media', 'object', 'texttrack',
]

# Tracker / ad name fragments to block: a request is aborted when its URL
# contains any of these substrings.
BLOCK_RESOURCE_NAMES = [
    'adzerk', 'analytics', 'cdn.api.twitter', 'doubleclick',
    'exelator', 'facebook', 'fontawesome', 'google',
    'google-analytics', 'googletagmanager', 'adform',
]


def _is_top_level_navigation(request):
    """Return True when `request` is the main frame's own document navigation."""
    if not request.is_navigation_request():
        return False
    try:
        return request.frame.parent_frame is None
    except Exception:
        # Playwright documents that `request.frame` can raise when no frame is
        # available for the request; fall back to the normal block rules then.
        return False


async def intercept_route(route):
    """Route handler: abort blocked requests and let everything else through.

    Args:
        route: the Playwright route being handled; its `request` is inspected.

    Behaviour:
        * The main frame's own navigation is always continued. Without this,
          scraping a page whose URL contains one of BLOCK_RESOURCE_NAMES
          (e.g. "google") would abort the page itself and make `page.goto` fail.
        * Requests whose resource type is in BLOCK_RESOURCE_TYPES are aborted.
        * Requests whose URL contains any BLOCK_RESOURCE_NAMES entry are aborted.
        * All remaining requests are continued unchanged.

    Each aborted request is logged with print().
    """
    request = route.request

    # Never block the page we were asked to load.
    if _is_top_level_navigation(request):
        return await route.continue_()

    if request.resource_type in BLOCK_RESOURCE_TYPES:
        print(f'리소스 차단: {request.url} (타입: {request.resource_type})')
        return await route.abort()

    # Substring match against the full URL, not only the host name.
    if any(tracker in request.url for tracker in BLOCK_RESOURCE_NAMES):
        print(f'트래커 차단: {request.url}')
        return await route.abort()

    return await route.continue_()

In [18]:
async def scrape(url):
    """Load `url` in headless Chromium and print the resulting page HTML.

    Every request issued by the page is passed through `intercept_route`.
    Nothing is returned; the HTML is written to stdout.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await (await chromium.new_context()).new_page()
        # Register the filter before navigating so it sees every request.
        await tab.route("**/*", intercept_route)
        await tab.goto(url)
        html = await tab.content()
        print(html)
        await chromium.close()

In [19]:
# Target page handed to scrape() in the next cell.
url = "https://lol.inven.co.kr/dataninfo/champion/manualTool.php?confirm=2"

In [20]:
# Run the scraper; top-level `await` relies on the notebook's async cell support.
await scrape(url)

리소스 차단: https://upload3.inven.co.kr/upload/2023/08/18/bbs/i8268349154.png (타입: image)
리소스 차단: https://static.inven.co.kr/image_2011/cafe/common/btn_and01.png (타입: image)
리소스 차단: https://static.inven.co.kr/image_2011/cafe/common/btn_ios01.png (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2017/06/29/bbs/i16069371144.jpg (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2019/03/07/bbs/i13588389197.jpg (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2019/03/07/bbs/i14021823717.jpg (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2017/07/11/bbs/i15602285282.jpg (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2017/06/29/bbs/i14060662332.jpg (타입: image)
리소스 차단: https://upload3.inven.co.kr/upload/2023/04/04/bbs/i8206687393.png (타입: image)
리소스 차단: https://upload2.inven.co.kr/upload/2018/07/11/bbs/i14426205952.jpg (타입: image)
리소스 차단: https://static.inven.co.kr/image_2011/site_image/lol/dataninfo/icon/champions/Garen_Square_0_2.jpg?v=240805a (타입: image)
리소스 차단: h

  if any(filename.endswith(s) for s in all_bytecode_suffixes):


리소스 차단: https://gold.contentsfeed.com/lb/sd/nw?c=vis&id=b8c9e9af8214f8998dbf8d1bd30672d&lb_data=eyJ1dSI6IiIsImFvaWQiOiIiLCJ1YSI6Ik1vemlsbGEvNS4wIChNYWNpbnRvc2g7IEludGVsIE1hYyBPUyBYIDEwXzE1XzcpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIEhlYWRsZXNzQ2hyb21lLzEzMS4wLjY3NzguMzMgU2FmYXJpLzUzNy4zNiIsImFpZCI6IjE4IiwiZG8iOiIiLCJhZG54cyI6IiIsImR0IjoiTG9MIOyxlO2UvOyWuCDqs7XrnrUgLSDrpqzqt7jsmKTruIzroIjsoITrk5wg7J2467KkIiwiY2F0ZSI6ImxvbF7rpqzqt7jsmKTruIzroIjsoITrk5wiLCJhcmlkIjoiIiwiYXJkdCI6IiIsIm9ndSI6IiIsImwiOiJodHRwczovL2xvbC5pbnZlbi5jby5rci9kYXRhbmluZm8vY2hhbXBpb24vbWFudWFsVG9vbC5waHA_Y29uZmlybT0yIiwiciI6IiIsInZ0IjoiIiwiZXRjIjoiIiwiYWRpZCI6IiIsImNvdW50cnlDb2RlIjoiS1IiLCJzS2V5d29yZCI6IuuhpOyduOuypCzroaQg7KCE7KCB6rKA7IOJLOuhpCDquLDroZ3si6QsZeyKpO2PrOy4oCDribTsiqQs7LGU7ZS87Ja4IOqzteuetSztlITroZzruYzrk5wsbG9s7J2467KkLOumrOq3uOyYpOu4jOugiOyghOuTnCxsZWFndWUgb2YgbGVnZW5kcyzroaQsbG9sLOuhpOqzteuetSIsIm1jYXRlIjoiIiwiYTFnaWQiOiIiLCJpbmYiOiIiLCJoYXNodCI6IiIsImNpZCI6IiIsInNpZCI6IiIsImx0IjoiMCJ9&ver