This notebook checks which pages from SWDE are available on archive.org. We could then use those pages because their CSS is archived as well.

In [None]:
import importlib
import json
import dataclasses
import collections

from awe.data import swde, wayback, constants
from awe import awe_graph, utils
# Reload the project modules so this session runs their current source.
for reloaded in (wayback, utils):
    importlib.reload(reloaded)

In [None]:
# Try only subset for now: at most the first 50 pages of every website
# in every vertical.
pages = []
for vertical in swde.VERTICALS:
    for website in vertical.websites:
        pages.extend(website.pages[:50])
len(pages)

In [None]:
# SWDE was released in 2011; this timestamp is what gets passed to
# `wayback.WaybackPage.get` when querying for a snapshot.
SWDE_TIMESTAMP = '20110601000000'
# JSON file in which the obtained responses are stored between runs.
WAYBACK_DATA_PATH = '{}/wayback.json'.format(constants.DATA_DIR)

In [None]:
# Load saved responses.
try:
    with open(WAYBACK_DATA_PATH, mode='r', encoding='utf-8') as f:
        data_dict = json.load(f)
except FileNotFoundError:
    # Nothing has been saved yet (e.g., the very first run of this notebook);
    # start with an empty cache so the remaining cells can still run and the
    # file gets created when the responses are stored.
    data_dict = {}

# Counters describing how the cached responses were applied to `pages`.
already_loaded = 0
newly_loaded = 0
newly_skipped = 0
not_loaded = 0
for page in pages:
    if page.archived is not False:
        # This page already has a result assigned; leave it untouched.
        already_loaded += 1
        continue
    # `False` is used as the "missing" sentinel because `None` is a valid
    # stored value (see below).
    response = data_dict.get(page.url, False)
    if response is None:
        # `null` value saved in the JSON means the page was fetched but the
        # API returned no results.
        page.archived = None
        newly_skipped += 1
    elif response is False:
        # This means the URL is not contained in the JSON.
        not_loaded += 1
    else:
        # The URL itself is the JSON key, so it is passed separately from the
        # stored fields.
        page.archived = wayback.WaybackPage(page.url, **response)
        newly_loaded += 1
already_loaded, newly_loaded, newly_skipped, not_loaded, len(data_dict)

In [None]:
# Fetch data from API.
# When true, pages whose `archived` is `None` are selected for fetching again.
RETRY_FAILED = False
# Passed as the first argument to `utils.parallelize`; see that helper for
# the meaning of the value.
PARALLELIZE = -1
def filter_page(page: awe_graph.HtmlPage):
    """Decide whether `page` should be fetched.

    A page is selected when its `archived` attribute is `False`, or when it
    is `None` and `RETRY_FAILED` is enabled.
    """
    if page.archived is False:
        return True
    return page.archived is None and RETRY_FAILED
def fetch_page(page: awe_graph.HtmlPage):
    """Look up `page.url` via `wayback.WaybackPage.get` and store the result.

    The result is written to `page.archived`.
    """
    # Only pages without a stored result are expected here.
    assert not page.archived
    result = wayback.WaybackPage.get(page.url, SWDE_TIMESTAMP)
    page.archived = result
# Select the pages that still need a request and process them.
pages_to_fetch = [page for page in pages if filter_page(page)]
utils.parallelize(PARALLELIZE, fetch_page, pages_to_fetch, 'pages')

In [None]:
# Run some stats on data.
skipped = no_snapshot = bad_status = total = 0
for page in pages:
    total += 1
    snapshot = page.archived
    if snapshot is False:
        # No result has been assigned to this page.
        skipped += 1
    elif snapshot is None:
        # A lookup result of `None` was recorded for this page.
        no_snapshot += 1
    elif snapshot.status != 200:
        # A result exists but its recorded status is not 200.
        bad_status += 1
skipped, no_snapshot, bad_status, total

In [None]:
# Which websites failed most?
# Count, per website directory name, the pages whose `archived` is `None`.
failed_websites = collections.Counter(
    page.site.dir_name for page in pages if page.archived is None)
# `most_common()` returns the (website, count) pairs as a list ordered from
# the highest count to the lowest.
failed_websites.most_common()

In [None]:
# Store obtained WaybackMachine API responses in a file.
def serialize_page(page: awe_graph.HtmlPage):
    """Turn `page.archived` into a value that can be stored as JSON.

    Returns `None` when `page.archived` is `None`; otherwise returns the
    dataclass fields of `page.archived` as a dict without `original_url`.
    """
    snapshot = page.archived
    if snapshot is not None:
        fields = dataclasses.asdict(snapshot)
        # The URL is not stored with the fields; the saved mapping is keyed by
        # `page.url` instead.
        fields.pop('original_url')
        return fields
    return None
# Collect an entry for every page that has a result (anything but `False`).
fresh_entries = {}
for page in pages:
    if page.archived is not False:
        fresh_entries[page.url] = serialize_page(page)
# Merge into the previously loaded data; fresh entries win on equal URLs.
data_dict = {**data_dict, **fresh_entries}
with open(WAYBACK_DATA_PATH, mode='w', encoding='utf-8') as f:
    serialized = json.dumps(data_dict, indent=2)
    f.write(serialized)
len(data_dict), WAYBACK_DATA_PATH