This notebook checks which pages from SWDE are available on archive.org. We could then use those pages, because their CSS is archived along with them.

In [None]:
import importlib
import json
import dataclasses

from tqdm.auto import tqdm

from awe.data import swde, wayback, constants
from awe import awe_graph
# Reload the listed project modules so the cells below run against their
# current source (presumably because they are edited while the notebook
# kernel stays alive).
modules_to_reload = (wayback,)
for module in modules_to_reload:
    importlib.reload(module)

In [None]:
# Work on a small sample for now: the first pages of the first website in the
# first vertical.
SUBSET_SIZE = 10
vertical = swde.VERTICALS[0]
website = vertical.websites[0]
pages = website.pages[:SUBSET_SIZE]

In [None]:
# Timestamp handed to `wayback.WaybackPage.get` when looking up each page.
# NOTE(review): looks like the YYYYMMDDhhmmss form (1 June 2011, 00:00:00) —
# confirm against the `wayback` module.
SWDE_TIMESTAMP = '20110601000000' # SWDE was released in 2011
# JSON file that the cells below read saved responses from and write them to.
WAYBACK_DATA_PATH = f'{constants.DATA_DIR}/wayback.json'

In [None]:
# Load saved responses.
# The cache file does not exist before the first save, so a missing file just
# means an empty cache rather than an error.
try:
    with open(WAYBACK_DATA_PATH, mode='r', encoding='utf-8') as f:
        data_dict = json.load(f)
except FileNotFoundError:
    data_dict = {}

# Counters shown at the end of the cell:
#   already_loaded - pages that already carried an archived page object,
#   newly_loaded   - pages whose state was restored from the cache just now,
#   not_loaded     - pages whose `archived` was already `None`.
already_loaded = 0
newly_loaded = 0
not_loaded = 0
for page in pages:
    if page.archived is False:
        # Test for membership instead of `.get(...) is not None`: the save
        # cell stores `null` for pages without a snapshot, and that answer
        # must be restored as `None` too, otherwise such pages would be
        # queried again on every run.
        if page.url in data_dict:
            response = data_dict[page.url]
            if response is None:
                page.archived = None
            else:
                # `original_url` is not stored in the file; it is the key.
                page.archived = wayback.WaybackPage(page.url, **response)
            newly_loaded += 1
    elif page.archived is not None:
        already_loaded += 1
    else:
        not_loaded += 1
(already_loaded, newly_loaded, not_loaded)

In [None]:
# Look up every page that has no stored answer yet (`archived is False`);
# pages that already have a result, including `None`, are left alone.
for page in tqdm(pages, desc='pages'):
    if page.archived is not False:
        continue
    page.archived = wayback.WaybackPage.get(page.url, SWDE_TIMESTAMP)

In [None]:
# Summarise the lookups: how many pages have no snapshot, how many have a
# snapshot with a non-200 status, and how many pages were inspected in total.
# Counting stops at the first page that has not been looked up yet.
no_snapshot = bad_status = total = 0
for page in pages:
    archived = page.archived
    if archived is False:
        break
    total += 1
    if archived is None:
        no_snapshot += 1
    elif archived.status != 200:
        bad_status += 1
no_snapshot, bad_status, total

In [None]:
# Store obtained WaybackMachine API responses in a file.
def serialize_page(page: awe_graph.HtmlPage):
    """Return the JSON-ready form of `page.archived`.

    Gives `None` when `page.archived` is `None`. Otherwise returns the
    dataclass fields of `page.archived` as a dict without the
    `original_url` entry.
    """
    archived = page.archived
    if archived is None:
        return None
    fields = dataclasses.asdict(archived)
    # Raises `KeyError` if the field is missing, like the `del` it replaces.
    fields.pop('original_url')
    return fields
# Merge into what is already on disk instead of replacing it: `pages` is only
# a subset, and rebuilding the file from it alone would throw away the stored
# responses of every page outside the current subset.
try:
    with open(WAYBACK_DATA_PATH, mode='r', encoding='utf-8') as f:
        data_dict = json.load(f)
except FileNotFoundError:
    # Nothing has been stored yet.
    data_dict = {}
data_dict.update({
    page.url: serialize_page(page)
    for page in pages
    # `False` marks pages that have not been looked up, so there is nothing
    # to store for them.
    if page.archived is not False
})
with open(WAYBACK_DATA_PATH, mode='w', encoding='utf-8') as f:
    json.dump(data_dict, f, indent=2)
# Displayed by the notebook: number of stored entries and the file path.
len(data_dict), WAYBACK_DATA_PATH