# Web scraping tests

A large share of total program time is spent scraping web pages sequentially, so parallelizing the scraping would help tremendously. A fair amount of time is also spent on content extraction.

To test:
- Compare the content extractor I'm currently using against Scrapfly's content extractor on some web pages
- Compare the rough time it would take to scrape in parallel "manually" versus using Scrapfly at maximum concurrency
- Estimate costs with Scrapfly

In [1]:
from core import Seed, init

# Comment out to reduce caching
# init()

In [2]:
# Build the Seed for the search term "98point6"; it is the target of the news search below.
target = Seed.init("98point6")

from data_sources.news.search import find_news_articles
# Ask for 10 news results; later cells reuse `search_results` as the input to every scrape test.
search_results = find_news_articles(target, num_results=10)
# Bare expression so the notebook displays the results.
search_results

[32m2024-11-18 11:33:42.642[0m | [34m[1mDEBUG   [0m | [36mutils.google_search[0m:[36msearch[0m:[36m62[0m - [34m[1mGoogle search results: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - "98point6" related:None news -site:reddit.com -site

[SearchResult(title='Members eliminate medications and see significant, sustained ...', link='https://www.banneraetna.com/en/about-us/news/type-2-diabetes-reversal-program-through-virta-health.html', snippet='Aug 28, 2024 ... CVS Health is the leading health solutions company, delivering care like no one else can. ... 98point6® is a registered trademark of 98point6 Inc. Access to\xa0...', formattedUrl='https://www.banneraetna.com/.../news/type-2-diabetes-reversal-program-thr...'),
 SearchResult(title='2023-esg-report.pdf', link='https://www.walgreensbootsalliance.com/sites/www/files/asset/2023-esg-report.pdf', snippet='Jan 24, 2024 ... be related to climate change, sustainability and other ESG-related matters. ... by Transcarent 98point6 and expands access to virtual primary care doctors.', formattedUrl='https://www.walgreensbootsalliance.com/sites/www/.../2023-esg-report.pdf'),
 SearchResult(title='Joanne Mizell Named Among 2024 Outstanding Women in ...', link='https://www.banneraetna

In [3]:
# efficiency test harness
import time

def test_scrape(urls, scrape_fn):
    """Time a synchronous scrape function and print its throughput.

    Args:
        urls: Sized collection of items to scrape; passed to ``scrape_fn`` unchanged.
        scrape_fn: Callable that accepts ``urls`` and returns the scrape result.

    Returns:
        Whatever ``scrape_fn(urls)`` returned.
    """
    # perf_counter is monotonic and high-resolution, so very fast runs are
    # measured more reliably than with wall-clock time.time().
    start = time.perf_counter()
    result = scrape_fn(urls)
    duration_sec = time.perf_counter() - start

    # Guard both ratios: an empty batch or a run faster than the timer's
    # resolution would otherwise raise ZeroDivisionError.
    num_urls = len(urls)
    sec_per_url = duration_sec / num_urls if num_urls else 0.0
    if duration_sec > 0:
        urls_per_sec = num_urls / duration_sec
    else:
        urls_per_sec = float("inf") if num_urls else 0.0

    # Callables such as functools.partial objects have no __name__.
    fn_name = getattr(scrape_fn, "__name__", repr(scrape_fn))
    print(f"{fn_name}: {sec_per_url:.1f} sec per URL, {urls_per_sec:,.0f} URLs/s ({duration_sec:.1f} sec total)")

    return result


# The "test_" prefix would make pytest collect this harness as a test case;
# opt out explicitly.
test_scrape.__test__ = False

async def test_ascrape(urls, scrape_fn):
    """Time an asynchronous scrape function and print its throughput.

    Args:
        urls: Sized collection of items to scrape; passed to ``scrape_fn`` unchanged.
        scrape_fn: Callable that accepts ``urls`` and returns an awaitable
            resolving to the scrape result.

    Returns:
        Whatever awaiting ``scrape_fn(urls)`` produced.
    """
    # perf_counter is monotonic and high-resolution, so very fast runs are
    # measured more reliably than with wall-clock time.time().
    start = time.perf_counter()
    result = await scrape_fn(urls)
    duration_sec = time.perf_counter() - start

    # Guard both ratios: an empty batch or a run faster than the timer's
    # resolution would otherwise raise ZeroDivisionError.
    num_urls = len(urls)
    sec_per_url = duration_sec / num_urls if num_urls else 0.0
    if duration_sec > 0:
        urls_per_sec = num_urls / duration_sec
    else:
        urls_per_sec = float("inf") if num_urls else 0.0

    # Callables such as functools.partial objects have no __name__.
    fn_name = getattr(scrape_fn, "__name__", repr(scrape_fn))
    print(f"{fn_name}: {sec_per_url:.1f} sec per URL, {urls_per_sec:,.0f} URLs/s ({duration_sec:.1f} sec total)")

    return result


# The "test_" prefix would make pytest collect this harness as a test case;
# opt out explicitly.
test_ascrape.__test__ = False

def dummy_fn(urls):
    """No-op synchronous scrape function: hand the input straight back."""
    return urls

async def async_dummy_fn(urls):
    """No-op asynchronous scrape function: resolve to the input unchanged."""
    return urls


# Smoke-test both timing harnesses with the no-op scrape functions.
dummy_result = test_scrape(search_results, dummy_fn)
# Top-level await: only valid where an event loop is already running, as in a notebook cell.
dummy_async_result = await test_ascrape(search_results, async_dummy_fn)


dummy_fn: 0.0 sec per URL, 41,943,040 URLs/s (0.0 sec total)
async_dummy_fn: 0.0 sec per URL, 41,943,040 URLs/s (0.0 sec total)


In [4]:
from utils.scrape import request_article

def baseline_scrape(urls):
    """Fetch every item's ``link`` one after another with ``request_article``.

    Despite the parameter name, each element of ``urls`` must expose a
    ``link`` attribute. All requests are issued first; only then is one line
    per response printed (URL, status code, response type).

    Returns:
        The list of responses, in input order.
    """
    fetched = [request_article(hit.link) for hit in urls]

    # Report what came back for each request.
    for resp in fetched:
        print(resp.url, resp.status_code, type(resp))

    return fetched

# baseline_responses = test_scrape(search_results, baseline_scrape)


In [5]:
# import asyncio
# from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient, ScrapflyScrapeError, ScrapflyError
# from scrapfly.scrape_config import Format, FormatOption
# import os

# config = {
#     # "asp": True,
#     "country": "us",
#     # "proxy_pool": "public_residential_pool",
#     "retry": False,
#     "cache": True,
#     "format": Format.MARKDOWN,
#     "format_options": [FormatOption.NO_IMAGES, FormatOption.NO_LINKS],
#     "raise_on_upstream_error": False,
#     # TO CONSIDER
#     # session = value (this will reuse the same machine for subsequent requests due to sticky_proxy, but disables caching)
#     # render_js = True (this will render the page with a headless browser and might help sometimes)
# }

# async def safe_scrape(client, url):
#     try:
#         return await client.async_scrape(ScrapeConfig(url=url, **config))
#     except Exception as e:
#         # If we don't do this, the error propagates to the Jupyter kernel and crashes the whole thing
#         return e

# async def scrapfly_scrape(urls):
#     client = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"], max_concurrency=4)

#     futures = [safe_scrape(client, result.link) for result in urls]
#     responses = await asyncio.gather(*futures)

#     return responses

# scrapfly_responses = await test_ascrape(search_results, scrapfly_scrape)

# # pick an example one to show
# scrapfly_responses[0].content

In [6]:
from typing import List, Dict
from collections import defaultdict
from urllib.parse import urlparse

def group_urls_by_domain(urls: List[str]) -> Dict[str, List[str]]:
    """Bucket URLs by their network location (the ``netloc`` of ``urlparse``).

    Domains appear in first-seen order and each bucket keeps the input order
    of its URLs. The returned mapping is a ``defaultdict(list)``.
    """
    by_domain = defaultdict(list)
    for link in urls:
        host = urlparse(link).netloc
        by_domain[host].append(link)
    return by_domain

# Display how the search-result links split up by domain.
group_urls_by_domain([sr.link for sr in search_results])

defaultdict(list,
            {'www.banneraetna.com': ['https://www.banneraetna.com/en/about-us/news/type-2-diabetes-reversal-program-through-virta-health.html',
              'https://www.banneraetna.com/en/about-us/news/joanne-mizell-2024-outstanding-women-in-business.html'],
             'www.walgreensbootsalliance.com': ['https://www.walgreensbootsalliance.com/sites/www/files/asset/2023-esg-report.pdf'],
             'www.businesswire.com': ['https://www.businesswire.com/news/home/20240515660756/en/Transcarent-Introduces-WayFinding-an-AI-Powered-Consumer-Health-Experience',
              'https://www.businesswire.com/news/home/20240502320693/en/Transcarent-Raises-126-Million-Series-D'],
             'telehealthandmedicinetoday.com': ['https://telehealthandmedicinetoday.com/index.php/journal/article/view/515/1183'],
             'www.mobihealthnews.com': ['https://www.mobihealthnews.com/news/counsel-health-secures-11m-seed-funding-led-a16z',
              'https://www.mobihealthnews

In [8]:
from itertools import chain
import asyncio
from typing import List, Dict, Tuple
import aiohttp
import newspaper
from utils.scrape import remove_img_tags, article_to_markdown

# async def response_to_article(
#     response,
# ) -> newspaper.Article:
#     """Parse the response from a URL into a newspaper Article"""
#     article = newspaper.article(
#         response.url.absolute,
#         language="en",
#         # Remove images to prevent downloading them (the downloads sometimes crash, and they slow things down)
#         input_html=remove_img_tags(await response.text()),
#         fetch_images=False,
#     )
#     article.parse()
#     return article

# async def fetch_articles(urls: List[str], headers: Dict[str, str]) -> List[str]:
#     articles = []
#     async with aiohttp.ClientSession(headers=headers) as session:
#         for url in urls:
#             async with session.get(url) as response:
#                 if response.ok:
#                     article = newspaper.article(
#                         url,
#                         language="en",
#                         # Remove images to prevent downloading them (the downloads sometimes crash, and they slow things down)
#                         input_html=remove_img_tags(await response.text()),
#                         fetch_images=False,
#                     )
#                     article.parse()
#                                     # article = await response_to_article(response)
#                     markdown = article_to_markdown(article)
#                     articles.append(markdown)

#     return articles

async def aio_get_urls(session: "aiohttp.ClientSession", urls: List[str]) -> List[Tuple]:
    """Fetch ``urls`` one at a time over ``session`` and collect the outcomes.

    URLs are requested sequentially in sorted order, without following
    redirects. Progress is printed before and after each request.

    Args:
        session: Client session used to issue the GET requests.
        urls: URLs to fetch.

    Returns:
        One ``(url, status, body)`` tuple per URL. ``body`` is the response
        text for an OK response and ``None`` when the response was not OK or
        its text could not be decoded. A timed-out request is recorded as
        ``(url, 504, None)``.
    """
    responses = []
    # NOTE: a short sleep between consecutive requests was tried here as a
    # politeness delay and is currently disabled.
    for url in sorted(urls):
        print(f"Fetching {url}")

        try:
            async with session.get(url, allow_redirects=False) as response:
                print(f"...got {url}")
                body = None
                if response.ok:
                    try:
                        body = await response.text()
                    except UnicodeDecodeError:
                        # Undecodable body: keep the status, drop the text.
                        body = None
                responses.append((url, response.status, body))
        except (asyncio.TimeoutError, TimeoutError):
            # Before Python 3.11 asyncio.TimeoutError is not the builtin
            # TimeoutError, so both are listed to catch timeouts everywhere.
            responses.append((url, 504, None))

    return responses

# Example request headers; aio_scrape passes them to its aiohttp ClientSession,
# so every request carries this desktop-Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

async def aio_scrape(search_results: List) -> List[Tuple]:
    """Fetch every search result's link, running the domains concurrently.

    Links are grouped by domain. Each domain's links are fetched sequentially
    by ``aio_get_urls``, while the per-domain coroutines run concurrently
    under ``asyncio.gather``.

    Args:
        search_results: Items exposing a ``link`` attribute that holds the URL.

    Returns:
        Flat list of the tuples produced by ``aio_get_urls``, concatenated
        domain by domain in the order the domains were first seen.
    """
    grouped_urls = group_urls_by_domain([sr.link for sr in search_results])

    # At most 10 connections overall and one per host.
    connector = aiohttp.TCPConnector(limit=10, limit_per_host=1)
    # Each request gets a 2-second total time budget.
    timeout = aiohttp.ClientTimeout(total=2)

    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
        domain_responses = await asyncio.gather(
            *(aio_get_urls(session, urls) for urls in grouped_urls.values())
        )

    # from_iterable flattens without unpacking the per-domain lists into call arguments.
    return list(chain.from_iterable(domain_responses))

# Timed variant through the async harness, currently disabled:
# aio_responses = await test_ascrape(search_results, aio_scrape)

# Untimed run over at most the first 10 results; top-level await needs an already-running event loop.
aio_responses = await aio_scrape(search_results[:10])

# Dump the collected response tuples.
print(aio_responses)


Fetching https://www.banneraetna.com/en/about-us/news/joanne-mizell-2024-outstanding-women-in-business.html
Fetching https://www.walgreensbootsalliance.com/sites/www/files/asset/2023-esg-report.pdf
Fetching https://www.businesswire.com/news/home/20240502320693/en/Transcarent-Raises-126-Million-Series-D
Fetching https://telehealthandmedicinetoday.com/index.php/journal/article/view/515/1183
Fetching https://www.mobihealthnews.com/news/counsel-health-secures-11m-seed-funding-led-a16z
Fetching https://www.lcatterton.com/Press.html#!/LC-LotusTech
Fetching https://telecareaware.com/2024/05/
...got https://www.mobihealthnews.com/news/counsel-health-secures-11m-seed-funding-led-a16z
Fetching https://www.mobihealthnews.com/news/transcarent-unveils-ai-enabled-consumer-platform-care-navigation
...got https://www.mobihealthnews.com/news/transcarent-unveils-ai-enabled-consumer-platform-care-navigation
...got https://www.banneraetna.com/en/about-us/news/joanne-mizell-2024-outstanding-women-in-busine

In [None]:
# Bare expression so the notebook displays the search results again for reference.
search_results

[SearchResult(title='Members eliminate medications and see significant, sustained ...', link='https://www.banneraetna.com/en/about-us/news/type-2-diabetes-reversal-program-through-virta-health.html', snippet='Aug 28, 2024 ... CVS Health is the leading health solutions company, delivering care like no one else can. ... 98point6® is a registered trademark of 98point6 Inc. Access to\xa0...', formattedUrl='https://www.banneraetna.com/.../news/type-2-diabetes-reversal-program-thr...'),
 SearchResult(title='2023-esg-report.pdf', link='https://www.walgreensbootsalliance.com/sites/www/files/asset/2023-esg-report.pdf', snippet='Jan 24, 2024 ... be related to climate change, sustainability and other ESG-related matters. ... by Transcarent 98point6 and expands access to virtual primary care doctors.', formattedUrl='https://www.walgreensbootsalliance.com/sites/www/.../2023-esg-report.pdf'),
 SearchResult(title='Joanne Mizell Named Among 2024 Outstanding Women in ...', link='https://www.banneraetna