# Web scraping tests

A large share of overall program time is spent on sequential web scraping. If I could parallelize it, that would help tremendously. Similarly, a fair amount of time is spent on content extraction.

To test:
- Compare the content extractor I'm currently using vs Scrapfly's content extractor on some webpages
- Compare the rough time it'd take to scrape in parallel "manually" vs Scrapfly at max concurrency
- Estimate costs with Scrapfly

In [None]:
from core import Seed, init

# Comment out to reduce caching
# init()

In [1]:
from stored_config import Company

# Look up the stored "98point6" company record and convert it to the core
# representation; the bare expression below displays it as the cell output.
target = Company.find_first("98point6").to_core_company()
target

Seed(company='98point6', deprecated_product='98point6', domain='98point6.com', keywords=None, deprecated_require_news_backlinks=False, deprecated_require_reddit_backlinks=False, primary_product=None, feature_flags=FeatureFlags(require_news_backlinks=False, require_reddit_backlinks=False))

In [None]:
from data_sources.news.search import find_news_articles
# Search news for the target company, asking for 40 results.
search_results = find_news_articles(target, num_results=40)
# Keep only each result's link; these URLs are what the scrapers below receive.
urls = [result.link for result in search_results]
search_results

In [None]:
# efficiency test harness
import time

def test_scrape(urls, scrape_fn):
    """Time one synchronous call of ``scrape_fn(urls)`` and print throughput.

    Args:
        urls: Sized collection passed to ``scrape_fn`` in a single call; only
            its ``len()`` is used for the statistics.
        scrape_fn: Callable invoked as ``scrape_fn(urls)``.

    Returns:
        Whatever ``scrape_fn(urls)`` returned, unchanged.
    """
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() can be coarse enough to report 0.0 for a near-instant call.
    start = time.perf_counter()
    result = scrape_fn(urls)
    duration_sec = time.perf_counter() - start

    num_urls = len(urls)

    # Guard both divisions so an empty input or a zero-length measurement
    # prints a summary instead of raising ZeroDivisionError.
    sec_per_url = duration_sec / num_urls if num_urls else 0.0
    if duration_sec > 0:
        urls_per_sec = num_urls / duration_sec
    else:
        urls_per_sec = float("inf") if num_urls else 0.0

    # Not every callable has __name__ (e.g. functools.partial objects).
    fn_name = getattr(scrape_fn, "__name__", repr(scrape_fn))
    print(f"{fn_name}: {sec_per_url:.1f} sec per URL, {urls_per_sec:,.0f} URLs/s ({duration_sec:.1f} sec total)")

    return result


# The name starts with "test_", so tell pytest this is a helper, not a test case.
test_scrape.__test__ = False

async def test_ascrape(urls, scrape_fn, **kwargs):
    """Time one awaited call of ``scrape_fn(urls, **kwargs)`` and print throughput.

    Args:
        urls: Sized collection passed to ``scrape_fn`` in a single call; only
            its ``len()`` is used for the statistics.
        scrape_fn: Callable returning an awaitable, invoked as
            ``scrape_fn(urls, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded verbatim to ``scrape_fn``.

    Returns:
        The awaited result of ``scrape_fn``, unchanged.
    """
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() can be coarse enough to report 0.0 for a near-instant call.
    start = time.perf_counter()
    result = await scrape_fn(urls, **kwargs)
    duration_sec = time.perf_counter() - start

    num_urls = len(urls)

    # Guard both divisions so an empty input or a zero-length measurement
    # prints a summary instead of raising ZeroDivisionError.
    sec_per_url = duration_sec / num_urls if num_urls else 0.0
    if duration_sec > 0:
        urls_per_sec = num_urls / duration_sec
    else:
        urls_per_sec = float("inf") if num_urls else 0.0

    # Not every callable has __name__ (e.g. functools.partial objects).
    fn_name = getattr(scrape_fn, "__name__", repr(scrape_fn))
    print(f"{fn_name}: {sec_per_url:.1f} sec per URL, {urls_per_sec:,.0f} URLs/s ({duration_sec:.1f} sec total)")

    return result


# The name starts with "test_", so tell pytest this is a helper, not a test case.
test_ascrape.__test__ = False

def dummy_fn(urls):
    """No-op synchronous stand-in for a scraper: hand the input straight back."""
    return urls

async def async_dummy_fn(urls):
    """No-op coroutine stand-in for an async scraper: resolve to the input as-is."""
    return urls


# Smoke-test both harnesses with the no-op scrapers. The harnesses only take
# len() of their first argument, so passing search_results rather than urls
# works here. The bare `await` relies on the notebook's top-level await support.
dummy_result = test_scrape(search_results, dummy_fn)
dummy_async_result = await test_ascrape(search_results, async_dummy_fn)


In [None]:
from typing import List
from utils.scrape import request_article

def serial_scrape(urls: List[str]):
    """Fetch each URL one at a time with ``request_article`` and report the results.

    Note: for testing, don't run ``core.init()`` beforehand, so that we don't
    get a full cache.

    Returns the responses as a list, in the same order as ``urls``.
    """
    # Finish every request before printing anything, so the report is not
    # interleaved with the fetches.
    fetched = list(map(request_article, urls))

    # One line per response: URL, status code, and concrete response type
    for resp in fetched:
        print(resp.url, resp.status_code, type(resp))

    return fetched

# baseline_responses = test_scrape(urls, serial_scrape)


In [None]:
from utils.async_scrape import scrape
from core import cache

# Time the async scraper over the URL list; test_ascrape forwards cache=cache
# to scrape() as a keyword argument.
# NOTE(review): presumably a warm cache makes this timing unrepresentative — confirm.
async_responses = await test_ascrape(urls, scrape, cache=cache)
async_responses

In [None]:
from collections import Counter

# Tally the async responses by their `status` attribute.
Counter([response.status for response in async_responses])

In [None]:
# Tally the serial baseline responses by their `status_code` attribute.
# NOTE: in the cells visible here, baseline_responses is only assigned by the
# serial_scrape timing call, which is commented out -- re-enable that call
# first, otherwise this cell raises NameError.
Counter([response.status_code for response in baseline_responses])
# len(baseline_responses), len(results)

In [None]:
# Display the URL list in sorted order for manual inspection.
sorted(urls)

In [None]:
from core import cache

# Print every key yielded by the cache's iterkeys(), one per line.
for key in cache.iterkeys():
    print(key)

In [None]:
# Show how many entries the cache reports via len().
len(cache)