In [None]:
import importlib

import torch

In [None]:
# Smoke test: drive a one-step progress bar to confirm tqdm renders in this environment.
from tqdm.auto import tqdm
_ = [step for step in tqdm(range(1))]

## Load dataset

In [None]:
from awe.data import qa_dataset, swde

# Re-import the module so edits made to it since the first import take effect
# in the running kernel. Assigning to `_` keeps the returned module object
# from being echoed as the cell output.
_ = importlib.reload(qa_dataset)

In [None]:
# Instantiate the SWDE dataset wrapper used by the cells below.
# NOTE(review): `suffix='-exact'` presumably selects a variant of the data
# files on disk — confirm against awe.data.swde.
sds = swde.Dataset(suffix='-exact')

In [None]:
# Feed the first ten pages of the first website in the first vertical
# to the QA dataset preparation routine.
first_website = sds.verticals[0].websites[0]
qa_dataset.prepare_dataset(first_website.pages[:10])

## Load data

In [None]:
from awe.data import swde

In [None]:
# NOTE(review): this repeats the dataset construction from the "Load dataset"
# section verbatim; it rebinds `sds` to a fresh object. Presumably kept so this
# section can be run without the previous one — drop one copy if not needed.
sds = swde.Dataset(suffix='-exact')

In [None]:
# Use the first page of the first website in the first vertical as the
# working example for the rest of the notebook, and show its identifier.
first_site = sds.verticals[0].websites[0]
page = first_site.pages[0]
page.identifier

In [None]:
# For every groundtruth field declared on the page's site, show what
# `page.get_groundtruth_texts` returns for that field on this page.
{
    field.name: page.get_groundtruth_texts(field.name)
    for field in page.site.groundtruth
}

### Extract text using `selectolax`

In [None]:
from selectolax.parser import HTMLParser

In [None]:
# Parse the raw HTML and remove subtrees whose text is not page content.
tree = HTMLParser(page.contents)
# Tags are removed one selector at a time, in this order, so that <script> and
# <style> nodes nested inside <head> are destroyed before <head> itself.
# NOTE: '[document]' is a BeautifulSoup pseudo-name, not a tag. As a CSS
# selector it is an attribute selector (elements having a `document`
# attribute), so it is intentionally not used here.
for tag in ('script', 'style', 'head'):
    for element in tree.css(tag):
        element.decompose()
# Join the remaining text nodes of <body>, one per line.
page_text = tree.body.text(separator='\n')
len(page_text)

### Extract text using `BeautifulSoup`

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Name the parser explicitly: without it BeautifulSoup emits a
# GuessedAtParserWarning and picks whichever parser happens to be installed,
# so the extracted text could differ between environments. 'html.parser' ships
# with Python and needs no extra dependency.
soup = BeautifulSoup(page.contents, 'html.parser')
# Detach subtrees whose text is not page content. ('[document]' is the name of
# the BeautifulSoup root object itself and can never match a descendant, so it
# is not listed.)
for element in soup(['style', 'script', 'head']):
    element.extract()
# Join the remaining text nodes, one per line.
page_text = soup.get_text(separator='\n')
len(page_text)

## Invoke Transformer

In [None]:
# Identifier of the pretrained checkpoint; passed to `pipeline(...)` and to the
# `from_pretrained(...)` calls in the cells below.
model_id = 'vasudevgupta/bigbird-roberta-natural-questions'

### Use `pipeline`

In [None]:
from transformers import pipeline

In [None]:
# Build a ready-to-use question-answering pipeline around the chosen checkpoint.
qa = pipeline(task='question-answering', model=model_id)

In [None]:
# Ask the pipeline for the car's name, using the extracted page text as context.
qa(question="car's name", context=page_text)

### Manually create model

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
# Load the tokenizer and the question-answering model from the same checkpoint
# so that token ids and model vocabulary agree.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForQuestionAnswering.from_pretrained(model_id)

In [None]:
# Number of token ids the tokenizer produces for the page text alone.
page_token_ids = tokenizer(page_text)['input_ids']
len(page_token_ids)

In [None]:
# Disabled debugging aid: encodes the pair ('price', page_text) with truncation
# enabled and decodes the ids back to text, to inspect what survives truncation.
# Uncomment to run.
#tokenizer.decode(tokenizer('price', page_text, truncation=True)['input_ids'])

In [None]:
# Encode the (question, page text) pair as PyTorch tensors for the model.
# NOTE(review): no truncation is requested here, so a long page may exceed the
# model's maximum sequence length — confirm this is intended.
question = 'How much does it cost?'
encoded_input = tokenizer(question, page_text, return_tensors='pt')
encoded_input.keys()

In [None]:
# Forward pass for inference only. Disabling autograd avoids recording a
# computation graph, which saves memory on a long input sequence; the logits
# themselves are unchanged.
with torch.no_grad():
    outputs = model(**encoded_input)
outputs

In [None]:
# Take the highest-scoring start and end positions as the predicted answer span.
# argmax without a `dim` works on the flattened logits.
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax()

In [None]:
# `input_ids` was created with return_tensors='pt', so it is 2-D with a leading
# batch axis; index row 0 first, otherwise the slice is applied to the batch
# axis instead of the token axis. The predicted end position is inclusive,
# hence the `+ 1`.
answer_ids = encoded_input['input_ids'][0, answer_start:answer_end + 1].tolist()
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
answer