In [None]:
import awe.utils
# Run the project's notebook initialisation helper (its behaviour is defined
# in `awe.utils`, not in this notebook).
awe.utils.init_notebook()

In [None]:
import collections
import os

import matplotlib.pyplot as plt
import pandas as pd
from tqdm.auto import tqdm

In [None]:
import awe.data.set.apify
import awe.data.set.pages
import awe.data.set.swde
import awe.data.validation
import awe.data.visual.exploration
# Reload the `awe.data` package through the project helper — presumably so
# that source edits are picked up without restarting the kernel (TODO confirm
# against `awe.utils.reload`).
awe.utils.reload('awe.data')

In [None]:
# Create the Apify dataset with default arguments. The commented-out keyword
# arguments below look like optional restrictions that can be re-enabled when
# exploring a subset (TODO confirm their semantics in `awe.data.set.apify`).
# NOTE: `ds` is rebound to the SWDE dataset in a later cell, so every cell
# that reads `ds` operates on whichever dataset cell was executed last.
ds = awe.data.set.apify.Dataset(
    # only_label_keys=('name', 'price', 'category', 'images'),
    # convert=False,
    # only_websites=('etsyEn',),
)

In [None]:
# Preview the `row` attribute of the first 10 pages as a table.
pd.DataFrame(p.row for p in ds.get_all_pages()[:10])

In [None]:
# Rebind `ds` to the SWDE dataset (suffix '-exact'), limited to the 'auto'
# vertical. This replaces the Apify dataset assigned to `ds` earlier.
ds = awe.data.set.swde.Dataset(suffix='-exact', only_verticals=('auto',))

In [None]:
# Preview the first 10 pages as a table; here each row comes from `to_row()`.
pd.DataFrame(p.to_row() for p in ds.get_all_pages()[:10])

In [None]:
def get_label_stats(page: awe.data.set.pages.Page):
    """Count how many label values a page has for each of its label keys.

    Returns a `collections.Counter` mapping label key to the number of values
    returned by `get_label_values` for that key. Keys with no values are
    omitted from the result.
    """
    page_labels = page.get_labels()
    counts = collections.Counter()
    for label_key in page_labels.label_keys:
        value_count = len(page_labels.get_label_values(label_key))
        # Zero counts are skipped so that they never appear in the result.
        if value_count:
            counts[label_key] += value_count
    return counts

def _sum_website_label_stats(website):
    """Add up `get_label_stats` over all pages of one website."""
    total = collections.Counter()
    for website_page in tqdm(website.pages, desc=website.name, disable=True):
        # The page-count check is evaluated per page, as in the original
        # generator filter.
        if website.page_count != 0:
            total = total + get_label_stats(website_page)
    return total

# One Counter per website of the first vertical, in website order.
stats = [
    _sum_website_label_stats(w)
    for w in tqdm(ds.verticals[0].websites, desc='websites')
]
# Every label key that occurs in at least one website's stats.
keys = set().union(*stats)

In [None]:
# One row per website: name, page count, and the number of label values per
# label key (0 for keys the website has no values for).
pd.DataFrame([
    {
        'website': website.name,
        'pages': website.page_count,
        # Default every known key to 0, then overlay the actual counts.
        **dict.fromkeys(keys, 0),
        **website_stats,
    }
    for website, website_stats in zip(ds.verticals[0].websites, stats)
])

In [None]:
# Take the first page whose screenshot file exists on disk and plot it.
page = next(filter(
    lambda candidate: os.path.exists(candidate.screenshot_path),
    ds.get_all_pages(),
))
awe.data.visual.exploration.plot_screenshot_with_boxes(page)

In [None]:
# Look up the page whose HTML path is listed first in `data/invalid_pages.txt`.
with open('data/invalid_pages.txt', mode='r', encoding='utf-8') as f:
    # `readline()` keeps the line terminator; strip it, otherwise the path can
    # never equal `html_path` and `next()` below raises `StopIteration`.
    file_path = f.readline().rstrip('\r\n')
page = next(p for p in ds.get_all_pages() if p.html_path == file_path)
page

In [None]:
# Select the first page of the first website in the first vertical. This
# overrides any `page` chosen by the earlier cells.
page = ds.verticals[0].websites[0].pages[0]
page

In [None]:
# Show the selected page's URL and HTML path.
page.url, page.html_path

In [None]:
# Clear the page's cache using a default-constructed `ClearCacheRequest`
# (what exactly gets cleared is defined in `awe.data.set.pages`).
page.clear_cache(awe.data.set.pages.ClearCacheRequest())

In [None]:
# Fetch the page's labels and its DOM object; both are reused by later cells.
page_labels = page.get_labels()
page_dom = page.cache_dom()

In [None]:
# Initialise the DOM's nodes, then display how many there are.
page_dom.init_nodes()
len(page_dom.nodes)

In [None]:
# Load the page's visuals and pass the DOM to `fill_tree` — presumably this
# attaches visual attributes to the DOM nodes (TODO confirm).
page_visuals = page.load_visuals()
page_visuals.fill_tree(page_dom)

In [None]:
# Filter the DOM's nodes, then display the node count again so it can be
# compared with the count shown right after `init_nodes()`.
page_dom.filter_nodes()
len(page_dom.nodes)

In [None]:
# Show only the entries of the page row whose key starts with 'selector_'.
{ k: v for k, v in page.row.items() if k.startswith('selector_') }

In [None]:
# Initialise labels on the DOM (with `propagate_to_leaves=True`), then list
# the XPath of every labeled node, grouped by label key.
page_dom.init_labels(propagate_to_leaves=True)
{
    k: [n.get_xpath() for n in v]
    for k, v in page_dom.labeled_nodes.items()
}

In [None]:
# One row per labeled node: its label key, XPath, text, semantic tag and box.
pd.DataFrame([
    dict(
        label_key=label_key,
        xpath=node.get_xpath(),
        text=node.parsed.text(),
        tag=node.semantic_html_tag,
        box=node.box,
    )
    for label_key, nodes in page.dom.labeled_nodes.items()
    for node in nodes
])

In [None]:
# Compute visual neighbors on the DOM; the `visual_neighbors` attribute of
# nodes is read by a later cell (presumably populated here — TODO confirm).
page_dom.compute_visual_neighbors()

In [None]:
# Alternative neighbor computation (the `_rect` variant); how it differs from
# `compute_visual_neighbors()` is defined in the DOM class, not shown here.
page_dom.compute_visual_neighbors_rect()

In [None]:
# One row per labeled node: label key, node text, and one `neighbor_<i>`
# column per visual neighbor holding (distance_x, distance_y, neighbor text).
pd.DataFrame([
    {
        'label_key': label_key,
        'text': node.parsed.text(),
        **{
            f'neighbor_{i}': (
                neighbor_info.distance_x,
                neighbor_info.distance_y,
                neighbor_info.neighbor.text,
            )
            for i, neighbor_info in enumerate(node.visual_neighbors)
        },
    }
    for label_key, nodes in page.dom.labeled_nodes.items()
    for node in nodes
])

In [None]:
# Compute friend cycles with `max_ancestor_distance=5`; the `partner` and
# `friends` node attributes read by the following cells are presumably set
# here (TODO confirm).
page_dom.compute_friend_cycles(max_ancestor_distance=5)

In [None]:
# Take the first node labeled 'price', collect the text nodes found by
# traversing it, and show each one's text next to its partner's text
# (None when the node has no partner).
price_node = page_dom.labeled_nodes['price'][0]
text_nodes = [n for n in price_node.traverse() if n.is_text]
[(n.text, n.partner.text if n.partner else None) for n in text_nodes]

In [None]:
# Tabulate the friends of the first text node; `distance` is each friend's
# `deep_index` relative to the target node's. Yields an empty table when the
# node has no friends.
target_node = text_nodes[0]
pd.DataFrame([
    dict(
        tag=friend.html_tag,
        index=friend.deep_index,
        distance=friend.deep_index - target_node.deep_index,
        text=friend.parsed.text(),
    )
    for friend in (target_node.friends or ())
])

In [None]:
# Show the label values of the page for every label key.
{
    k: page_labels.get_label_values(k)
    for k in page_labels.label_keys
}

In [None]:
# Show the text of every labeled node, grouped by label key.
{
    k: [
        n.text()
        for n in page_labels.get_labeled_nodes(k)
    ]
    for k in page_labels.label_keys
}

In [None]:
# Validate all pages of the dataset with a validator created with
# `visuals=False`. The `zip_verticals`/`zip_websites` flags presumably
# interleave pages across verticals and websites (TODO confirm in
# `get_all_pages`).
pages = ds.get_all_pages(zip_verticals=True, zip_websites=True)
validator = awe.data.validation.Validator(visuals=False)
validator.validate_pages(pages)