In [None]:
from awe.data import swde, qa_dataset

In [None]:
sds = swde.Dataset(suffix='-exact')
websites = sds.verticals[0].websites

In [None]:
page = websites[0].pages[0]
page.file_path

In [None]:
import selectolax

In [None]:
tree = selectolax.parser.HTMLParser(page.html)

In [None]:
# Remove non-content elements from the tree before extracting text.
# NOTE: the former '[document]' entry was dropped. It is a BeautifulSoup
# pseudo-name; selectolax evaluates each entry as a CSS selector, so it was
# being interpreted as the attribute selector `[document]`.
tree.strip_tags([
    'script',
    'style',
    'head',
    'noscript',
    'iframe',
])

In [None]:
# Compare two ways of getting the page text:
#   1. collect each node's own (non-deep) text, normalise it, join with spaces;
#   2. ask the body for all of its text at once, separated by spaces.
# The cell output is the pair of resulting lengths.
node_fragments = [
    qa_dataset.collapse_whitespace(n.text(deep=False))
    for n in tree.body.traverse()
]
nodes_text = qa_dataset.collapse_whitespace(' '.join(node_fragments))

whole_body_text = tree.body.text(separator=' ')
body_text = qa_dataset.collapse_whitespace(whole_body_text)

len(nodes_text), len(body_text)

In [None]:
import importlib

import pandas as pd

from awe import selectolax_utils
from awe import qa_model

# Reload selectolax_utils so that edits made to the module since the kernel
# started are picked up. Binding the result to `_` keeps the module repr out
# of the cell output.
_ = importlib.reload(selectolax_utils)

In [None]:
# Instantiate the QA pipeline with its default arguments.
pipeline = qa_model.QaPipeline()

In [None]:
# Shorthand for the pipeline's tokenizer; the cells below call it directly.
tokenizer = pipeline.tokenizer

In [None]:
def _iter_text_node_rows():
    """Yield one record (xpath, text, token count) per non-blank text node.

    Walks every node under the page body, including text nodes, and keeps
    only text nodes whose whitespace-collapsed text is neither empty nor
    whitespace-only.
    """
    for node in tree.body.traverse(include_text=True):
        if not selectolax_utils.is_text(node):
            continue
        text = qa_dataset.collapse_whitespace(node.text(deep=False))
        # Skip nodes with nothing but whitespace (or nothing at all).
        if text == '' or text.isspace():
            continue
        xpath = selectolax_utils.get_xpath(node)
        # Tokenize this node's text on its own, without special tokens.
        encoded = tokenizer(text, add_special_tokens=False, return_attention_mask=False)
        yield {
            'xpath': xpath,
            'text': text,
            'tokens': len(encoded['input_ids']),
        }


df = pd.DataFrame(list(_iter_text_node_rows()))
df

In [None]:
df['tokens'].sum()

In [None]:
separate_tokens = [t for ts in tokenizer(list(df['text']), add_special_tokens=False, return_attention_mask=False)['input_ids'] for t in ts]
len(separate_tokens)

In [None]:
split_words_tokens = tokenizer(list(df['text']), add_special_tokens=False, is_split_into_words=True)['input_ids']
len(split_words_tokens)

In [None]:
separate_tokens == split_words_tokens

In [None]:
joint_tokens = tokenizer(' '.join(list(df['text'])), add_special_tokens=False)['input_ids']
len(joint_tokens)

In [None]:
joint_tokens_2 = tokenizer(df['text'].str.cat(sep=' '), add_special_tokens=False, return_attention_mask=False)['input_ids']
len(joint_tokens_2)

In [None]:
joint_tokens == joint_tokens_2

In [None]:
import awe.qa.parser
import awe.qa.collater
import awe.qa.sampler
import awe.qa.collater_validator

# Reload the QA modules, in the order they were imported, so that source
# edits made since the kernel started take effect.
for module in (
    awe.qa.parser,
    awe.qa.collater,
    awe.qa.sampler,
    awe.qa.collater_validator,
):
    importlib.reload(module)

In [None]:
samples = awe.qa.sampler.get_samples([page])
collater = awe.qa.collater.Collater(tokenizer)

In [None]:
texts = [awe.qa.parser.get_page_words(sample.page) for sample in samples]

In [None]:
[(i, word) for i, word in enumerate(texts[0]) if word == '$9,970']

In [None]:
encodings = collater(samples)
encodings

In [None]:
[(s.label, s.values, tokenizer.decode(encodings['input_ids'][i, encodings['start_positions'][i]:encodings['end_positions'][i] + 1])) for i, s in enumerate(samples)]

In [None]:
awe.qa.collater_validator.validate(websites[0].pages, collater)