# Data exploration

In [None]:
import awe.utils
awe.utils.init_notebook()

In [None]:
import collections
import math
import statistics

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
import awe.data.parsing
import awe.data.set.apify
import awe.data.set.pages
import awe.data.set.swde
import awe.data.validation
import awe.data.visual.exploration
awe.utils.reload('awe.data')

## Load dataset

Choose between the first two or the last two cells in this section depending on whether you want to load the Apify or the SWDE dataset.

In [None]:
# Load the Apify dataset with default options. The commented-out keyword
# arguments are optional constructor switches kept here for quick toggling.
ds = awe.data.set.apify.Dataset(
    # only_label_keys=('name', 'price', 'category', 'images'),
    # convert=False,
    # convert_slim=True,
    # skip_without_visuals=True,
    # only_websites=('notinoEn',),
)

In [None]:
# Preview the `row` data of the first ten pages as a table.
pd.DataFrame(p.row for p in ds.get_all_pages()[:10])

In [None]:
# Load the SWDE dataset, passing only the 'auto' vertical via `only_verticals`.
# NOTE(review): `suffix='-exact'` presumably selects a dataset variant — confirm.
ds = awe.data.set.swde.Dataset(
    suffix='-exact',
    only_verticals=('auto',),
    # convert=False,
)

In [None]:
# Preview the first ten pages as a table, one `to_row()` result per page.
pd.DataFrame(p.to_row() for p in ds.get_all_pages()[:10])

## Label statistics

This section shows a table with the number of labeled nodes in each website.

In [None]:
def get_label_stats(
    page: awe.data.set.pages.Page,
    *,
    include_nodes: bool = False,
):
    """Count the label values of `page`, grouped by label key.

    Args:
        page: Page whose labels are counted.
        include_nodes: When true, additionally store the number of labeled
            nodes of each key under `'<key>_nodes'`. Off by default, which
            keeps the result identical to the version where this count was
            commented out. The LaTeX cell further down reads the
            `'<key>_nodes'` entries, so it only prints non-zero node counts
            when the statistics were collected with this switch enabled.

    Returns:
        A `collections.Counter` mapping each label key (and optionally
        `'<key>_nodes'`) to its count. Only positive counts are stored.
    """
    page_labels = page.get_labels()
    counts = collections.Counter()
    for k in page_labels.label_keys:
        entries = {k: len(page_labels.get_label_values(k))}
        if include_nodes:
            entries[f'{k}_nodes'] = len(page_labels.get_labeled_nodes(k))
        # Counter addition (unlike `update`) drops zero counts, so keys
        # without any value do not appear in the result.
        counts += collections.Counter(entries)
    return counts

def _website_label_stats(website):
    # Add up the per-page label counters of one website.
    total = collections.Counter()
    # The per-website progress bar is created but disabled.
    for p in tqdm(website.pages, desc=website.name, disable=True):
        if website.page_count == 0:
            continue
        total = total + get_label_stats(p)
    return total

# One counter per website, in the order of `ds.verticals[0].websites`.
stats = list(map(
    _website_label_stats,
    tqdm(ds.verticals[0].websites, desc='websites')
))
# Every label key that occurs in at least one website.
keys = set().union(*stats)

In [None]:
def _label_row(website, counts):
    # Identifying columns first, then every known label key defaulting to 0,
    # finally overridden by the counts this website actually has.
    row = {
        'website': website.name,
        'domain': website.get_domain(),
        'pages': website.page_count,
    }
    row.update(dict.fromkeys(keys, 0))
    row.update(counts.items())
    return row

def _column_sort_key(n):
    # Prefix the identifying columns with '_' for sorting only; '_' sorts
    # before lowercase letters, which puts them ahead of lowercase label keys.
    if n in ('website', 'domain', 'pages'):
        return f'_{n}'
    return n

df = pd.DataFrame([
    _label_row(w, s)
    for w, s in zip(ds.verticals[0].websites, stats)
])
# Add totals.
df.loc['total'] = df.sum()
df.loc['total', ('website', 'domain')] = ''
# Sort columns by name.
df = df.reindex(sorted(df.columns, key=_column_sort_key), axis=1)
df

Here, the table is turned into LaTeX code to be used in the thesis (to avoid manual and error-prone filling).

In [None]:
# Generate LaTeX table.
def _print_tex_row(cells):
    # Cells are separated by single spaces and the row is closed with `\\`.
    print(' '.join([*cells, '\\\\']))

# Pick the label columns, their shortened header names, and whether each label
# gets two sub-columns, depending on the loaded dataset.
if ds.name != 'apify':
    columns = ('model', 'price', 'engine', 'fuel_economy')
    display_cols = {'fuel_economy': 'economy'}
    split_nv = True
else:
    columns = (
        'name', 'price', 'category', 'images',
        'shortDescription', 'longDescription', 'specification',
    )
    display_cols = {
        'category': 'cat',
        'shortDescription': 'short',
        'longDescription': 'long',
        'specification': 'spec',
    }
    split_nv = False

# Header: with split columns the first header row starts with an empty cell
# and a second row carries the 'website & pages' titles.
_print_tex_row(
    ['&' if split_nv else 'website & pages']
    + [f'& \\akcol{{{display_cols.get(col, col)}}}' for col in columns]
)
if split_nv:
    _print_tex_row(['website & pages'] + ['& \\akvn'] * len(columns))
print('\\midrule')

# One row per website.
# NOTE(review): the `<key>_nodes` entries are only present in `stats` if
# `get_label_stats` counted labeled nodes; a `Counter` yields 0 for missing
# keys, so these columns print 0 otherwise.
for w, s in zip(ds.verticals[0].websites, stats):
    name = w.get_domain().removeprefix('www.')
    for tld in ('.com', '.co.uk'):
        name = name.removesuffix(tld)
    cells = [f'\\verb|{name}|', f'& {w.page_count:,}']
    for col in columns:
        if split_nv:
            cells.append(f'& {s[col]:,}')
        cells.append(f'& {s[f"{col}_nodes"]:,}')
    _print_tex_row(cells)
print('\\bottomrule')

# Totals across all websites.
total_cells = [
    'total',
    f'& {sum(w.page_count for w in ds.verticals[0].websites):,}',
]
for col in columns:
    if split_nv:
        total_cells.append(f'& {sum(s[col] for s in stats):,}')
    total_cells.append(f'& {sum(s[f"{col}_nodes"] for s in stats):,}')
_print_tex_row(total_cells)

## Node statistics

This section shows a table with the median number of DOM nodes in each website.

In [None]:
# Number of nodes (median across pages) in each website.
def get_num_nodes(page: awe.data.set.pages.Page):
    """Return the number of nodes (text included) in the filtered tree of `page`."""
    tree = awe.data.parsing.parse_html(page.get_html_text())
    awe.data.parsing.filter_tree(tree)
    count = 0
    for _ in tree.root.traverse(include_text=True):
        count += 1
    return count
def get_median_nodes(website: awe.data.set.pages.Website):
    """Return the median per-page node count of `website`, rounded down."""
    node_counts = [get_num_nodes(p) for p in website.pages]
    median = statistics.median(node_counts)
    return math.floor(median)
# Median node count of every website, in the order of the website list.
median_stats = list(map(
    get_median_nodes,
    tqdm(ds.verticals[0].websites, desc='websites')
))

In [None]:
# One row per website, paired with its median node count.
df = pd.DataFrame([
    dict(website=w.name, domain=w.get_domain(), pages=w.page_count, nodes=m)
    for w, m in zip(ds.verticals[0].websites, median_stats)
])
# Mean and standard deviation of the per-website medians.
print(f'Average: {df.nodes.mean()}')
print(f'std: {df.nodes.std()}')
df

## Screenshots

This section shows screenshots of pages with target nodes highlighted (by drawing their bounding boxes).

In [None]:
# Plot the websites of the first vertical in two columns.
websites = ds.verticals[0].websites
# Assigning to `_` keeps the return value from being echoed as cell output.
_ = awe.data.visual.exploration.plot_websites(websites, n_cols=2)

## HTML tag statistics

This section shows the distribution of HTML tag names labeled as `images` (in the Apify dataset).

In [None]:
# Which HTML tags are labeled as images?
def get_page_dom(page: awe.data.set.pages.Page):
    """Return the DOM of `page`, initializing nodes and labels on first use.

    The DOM counts as uninitialized while its `root` is None; labels are
    initialized with `propagate_to_leaves=True`.
    """
    dom = page.dom
    if dom.root is not None:
        return dom
    dom.init_nodes()
    dom.init_labels(propagate_to_leaves=True)
    return dom
# Upper bound on the number of pages sampled from each website.
n_sampled_pages = 5
rng = np.random.default_rng(42)
# For each website, count in how many of the sampled pages each HTML tag is
# labeled as `images` (tags are de-duplicated per page before counting).
{
    w.name: collections.Counter(
        html_tag
        # Sampling without replacement fails when asking for more pages than
        # the website has, so cap the sample size at the number of pages.
        for p in rng.choice(
            w.pages,
            min(n_sampled_pages, len(w.pages)),
            replace=False
        )
        for html_tag in {
            node.html_tag
            for labeled_nodes in get_page_dom(p).labeled_nodes.get('images', ())
            for node in labeled_nodes
        }
    )
    for w in tqdm(ds.verticals[0].websites, desc='websites')
}

## DOM exploration

This section begins exploring the DOM of one page.

### Load a page

A page can either be loaded from the external list of invalid pages (produced by our validation code)...

In [None]:
# Read the path of the first invalid page from the list.
with open('data/invalid_pages.txt', mode='r', encoding='utf-8') as f:
    file_path = f.readline().rstrip()
# Find the dataset page with that original HTML path. Fail with a descriptive
# error instead of a bare StopIteration when no page matches (e.g., the list
# is empty or was produced for a different dataset).
matching_page = next(
    (p for p in ds.get_all_pages() if p.original_html_path == file_path),
    None
)
if matching_page is None:
    raise LookupError(
        f'No page in the dataset has original_html_path {file_path!r}'
    )
page = matching_page
page

...or simply one sample selected from the dataset.

In [None]:
# Take the first page of the first website in the first vertical.
page = ds.verticals[0].websites[0].pages[0]
page

### Prepare page

Here, the page DOM and visuals are loaded.

In [None]:
# Show the page's URL and HTML path.
page.url, page.html_path

In [None]:
# NOTE(review): presumably a default `ClearCacheRequest` drops cached page
# data so that the cells below recompute it — confirm which caches it clears.
page.clear_cache(awe.data.set.pages.ClearCacheRequest())

In [None]:
# Keep handles to the page's labels and DOM; both are used by the cells below.
page_labels = page.get_labels()
page_dom = page.cache_dom()

In [None]:
# Initialize the DOM nodes and show how many there are.
page_dom.init_nodes()
len(page_dom.nodes)

In [None]:
# Load the page's visuals and fill them into the DOM tree.
page_visuals = page.load_visuals()
page_visuals.fill_tree(page_dom)

In [None]:
# Filter the DOM nodes and show how many remain (compare with the count
# printed after `init_nodes()`).
page_dom.filter_nodes()
len(page_dom.nodes)

In [None]:
# Mark all text fragments with visuals as "sampled".
# A node counts as having visuals when its `box` is set (not None).
for node in page_dom.nodes:
    node.sample = node.is_text and node.box is not None

### Explore labels

This section shows labeled nodes in the page.

In [None]:
# Show only the entries of the page row whose key starts with `selector_`.
{ k: v for k, v in page.row.items() if k.startswith('selector_') }

In [None]:
# Initialize labels (propagated to leaves), then list the XPaths of the labeled
# nodes: one list of node groups per label key, one list of XPaths per group.
page_dom.init_labels(propagate_to_leaves=True)
{
    k: [[n.get_xpath() for n in g] for g in v]
    for k, v in page_dom.labeled_nodes.items()
}

In [None]:
# Table with the first node of every labeled group: its label key, XPath,
# text, semantic HTML tag, and bounding box.
pd.DataFrame({
        'label_key': k,
        'xpath': n.get_xpath(),
        'text': n.parsed.text(),
        'tag': n.find_semantic_html_tag(),
        # Nodes without visuals have `box` set to None; show None for them
        # instead of failing on `as_tuple()`.
        'box': n.box.as_tuple() if n.box is not None else None
    }
    for k, v in page.dom.labeled_nodes.items()
    for g in v
    for n in g[:1]
)

In [None]:
# Label values of the page for every label key.
{
    k: page_labels.get_label_values(k)
    for k in page_labels.label_keys
}

In [None]:
# Text of every labeled node, grouped by label key.
{
    k: [
        n.text()
        for n in page_labels.get_labeled_nodes(k)
    ]
    for k in page_labels.label_keys
}

### Visual neighbors

This section shows visual neighbors of target nodes (what the model will see).

In [None]:
# NOTE(review): this cell and the following one call two different neighbor
# computations; presumably only one of them needs to be run — confirm.
page_dom.compute_visual_neighbors()

In [None]:
# NOTE(review): presumably an alternative to `compute_visual_neighbors()`
# from the previous cell — confirm which of the two is meant to be used.
page_dom.compute_visual_neighbors_rect()

In [None]:
# Table with one row per labeled node and one `neighbor_<i>` column per visual
# neighbor, holding its x/y distance and its text or tag.
pd.DataFrame({
        'label_key': k,
        'text': n.parsed.text()
    } | {
        f'neighbor_{i}': (m.distance_x, m.distance_y, m.neighbor.get_text_or_tag())
        for i, m in enumerate(n.visual_neighbors)
    }
    for k, v in page.dom.labeled_nodes.items()
    for g in v
    for n in g
)

### Friend cycles

This section shows friends of target nodes (what the model will see).

In [None]:
# Compute friend cycles with a maximum ancestor distance of 5.
page_dom.compute_friend_cycles(max_ancestor_distance=5)

In [None]:
# Take the first node of the first `price` group and collect the text nodes
# found when traversing it.
price_node = page_dom.labeled_nodes['price'][0][0]
text_nodes = [n for n in price_node.traverse() if n.is_text]
# Pair each text node's text with its partner's text (None without a partner).
[(n.text, n.partner.text if n.partner else None) for n in text_nodes]

In [None]:
# Table of the friends of the first text node; `distance` is the difference
# between the friend's and the target's `deep_index`.
target_node = text_nodes[0]
# `friends or ()` produces an empty table when the node has no friends.
pd.DataFrame([{
    'tag': n.html_tag,
    'index': n.deep_index,
    'distance': n.deep_index - target_node.deep_index,
    'text': n.parsed.text()
 } for n in target_node.friends or ()])