In [None]:
import importlib

import numpy as np

from awe import filtering, features, html_utils, utils
from awe.data import swde, dataset

# Re-execute the already-imported project modules so that source edits made
# since the kernel started take effect without a restart.
for module in (filtering, dataset, swde, features, html_utils, utils):
    importlib.reload(module)

## Choose dataset

In [None]:
# Slice applied to every page list before it is handed to the collection;
# `slice(1)` keeps only the first page of each list.
SUBSET = slice(1)

sds = swde.Dataset(suffix='-exact')
websites = sds.verticals[0].websites

# Fixed seed so the website/page selection is reproducible across runs.
rng = np.random.default_rng(42)
website_indices = rng.choice(len(websites), 5, replace=False)


def _sample_pages(indices, per_website):
    """Draw `per_website` pages from each website in `indices`, in order.

    `rng.choice` samples with replacement by default, so the same page can
    be drawn more than once.
    """
    return [
        sampled
        for idx in indices
        for sampled in rng.choice(websites[idx].pages, per_website)
    ]


# Training pages come from the 5 selected websites; validation pages come
# from every website that was NOT selected.
train_pages = _sample_pages(website_indices, 1_000)
val_pages = _sample_pages(
    [idx for idx in range(len(websites)) if idx not in website_indices],
    100,
)

ds = dataset.DatasetCollection()
ds.create('train', train_pages[SUBSET], shuffle=True)
ds.create('val_unseen', val_pages[SUBSET])
# "Seen" validation set is re-sampled from the (subsetted) training pages;
# falls back to 200 samples when SUBSET has no upper bound.
val_seen_count = SUBSET.stop or 200
ds.create('val_seen', rng.choice(train_pages[SUBSET], val_seen_count))
ds.get_lengths()

In [None]:
# Presumably reports which pages in the collection lack visual features
# (inferred from the method name -- confirm in awe.data.dataset).
ds.summarize_pages_without_visual_features()

## Validate

In [None]:
# Use the first entry of `train_pages` as the running example for the
# cells below; the annotation is only there to help editor tooling.
page: swde.Page = train_pages[0]
page.identifier

In [None]:
# Run the dataset's validation on the single example page only.
sds.validate(pages=[page])

## Inspect node attributes

In [None]:
# Create a context for the example page with the default node predicate,
# then prepare the page using that context.
ctx = features.PageContextBase(page, filtering.DefaultNodePredicate())
page.prepare(ctx)

In [None]:
# Keep a handle on the page's labels (reused by the next cell) and display
# their `nodes` attribute.
labels = page.labels
labels.nodes

In [None]:
# Look up the node(s) labelled 'price' on the example page.
nodes = labels.get_nodes('price', ctx)
# The following cells inspect a single node, so fail loudly (and say how
# many were found) if the label does not map to exactly one node.
assert len(nodes) == 1, (
    f"expected exactly one 'price' node, found {len(nodes)}"
)
node = nodes[0]
node

In [None]:
# Presumably attaches the visual features to the node (inferred from the
# method name -- TODO confirm); the node is displayed again afterwards so
# it can be compared with the output of the previous cell.
node.copy_visual_features()
node

## Convert to features

In [None]:
from awe.features import extraction

# Reload the feature and visual packages, then refresh the collection's
# root context.
# NOTE(review): `extraction` is imported *before* the reload; depending on
# how `utils.reload` works this name may keep pointing at the pre-reload
# module object -- confirm the intended order.
utils.reload('awe.features', 'awe.visual')
ds.reload_root_context()

In [None]:
# Features enabled on the collection; each class is instantiated with its
# defaults, in the order listed.
ds.features = [
    feature_cls()
    for feature_cls in (
        features.Depth,
        features.IsLeaf,
        features.CharCategories,
        features.Visuals,
        features.CharIdentifiers,
        features.WordIdentifiers,
    )
]

In [None]:
# Display the example node again, now that `ds.features` has been set.
node

In [None]:
# Rebind `ctx` to the context the collection prepares for the example page
# (this replaces the context created earlier in the notebook).
ctx = ds.prepare_page_context(page)

# Describe the features extracted for the page against the 'train' split
# and show the entry for the example node, keyed by its XPath.
extractor = extraction.PageFeatureExtractor(ds['train'], ctx)
fs = extractor.describe()
fs[node.xpath]

In [None]:
# Presumably summarizes the categorical visual features known to the root
# context (inferred from the method name -- TODO confirm).
ds.live.root.describe_visual_categorical()