In [1]:
import evaluate

from datasets import load_dataset, concatenate_datasets
from stanza import Pipeline
from statistics import mode
from tqdm import tqdm
from underthesea import word_tokenize, sent_tokenize

In [2]:
# Load the "phatjk/viquad" dataset, merge its "train" and "test" splits into a
# single dataset, and rename the columns to the names used by the rest of the
# notebook ("id", "start", "answer").
viquad = load_dataset("phatjk/viquad")
viquad = concatenate_datasets([viquad[split] for split in ("train", "test")])
viquad = viquad.rename_columns({"Id": "id", "ans_start": "start", "text": "answer"})
# "__index_level_0__" is not used anywhere below, so it is dropped here.
viquad = viquad.remove_columns(["__index_level_0__"])
viquad

Dataset({
    features: ['id', 'title', 'context', 'question', 'start', 'answer'],
    num_rows: 23305
})

In [3]:
# Preview sample: the first 100 rows of the merged dataset.
# This selection is taken before the ids are rewritten; the same selection is
# repeated further down the notebook, after the ids have been reassigned.
viquad_temp = viquad.select(indices=range(100))
viquad_temp

Dataset({
    features: ['id', 'title', 'context', 'question', 'start', 'answer'],
    num_rows: 100
})

In [4]:
# Pre-compute one identifier per row: "viquad_" followed by the row index,
# zero-padded to the number of digits in the dataset length.
padding = len(str(len(viquad)))
new_idx = [f"viquad_{str(idx).zfill(padding)}" for idx in range(len(viquad))]


def get_new_id(entry, idx=None):
    """Overwrite ``entry["id"]`` with this notebook's own identifier.

    Args:
        entry: One dataset row (a mapping with an "id" key).
        idx: Position of the row in the dataset, as passed by
            ``Dataset.map(..., with_indices=True)``. When omitted, the next
            unused identifier is consumed from ``new_idx`` instead.

    Returns:
        The same ``entry`` object, with "id" replaced.
    """
    if idx is None:
        # Legacy single-argument call style. It is destructive (empties
        # `new_idx`) and only correct when rows are visited exactly once, in
        # order; kept solely so existing one-argument callers keep working.
        entry["id"] = new_idx.pop(0)
    else:
        # Index-based lookup: no shared state is mutated, so the result does
        # not depend on how many times or in which order rows are processed,
        # and re-running the cell is safe.
        entry["id"] = new_idx[idx]
    return entry


# with_indices=True makes `map` pass each row's position as the second argument.
viquad = viquad.map(get_new_id, with_indices=True)

In [5]:
# Options shared by every Stanza pipeline in this notebook: Vietnamese models,
# running on GPU device 0, with logging silenced.
_STANZA_COMMON = dict(
    lang="vi",
    use_gpu=True,
    device=0,
    verbose=False,
    allow_unknown_language=True,
)

# Tokenizer-only pipeline, used to recover sentence boundaries.
# tokenize_no_ssplit=True: per Stanza's tokenizer options, sentence splitting is
# disabled and blank lines in the input delimit sentences (confirm against the
# installed Stanza version).
SENT_TOKENIZE = Pipeline(
    processors="tokenize",
    tokenize_no_ssplit=True,
    **_STANZA_COMMON,
)

# Named-entity pipeline fed with text that has already been tokenized.
NER = Pipeline(
    processors="tokenize, ner",
    tokenize_pretokenized=True,
    tokenize_no_ssplit=True,
    **_STANZA_COMMON,
)

# Constituency parser (needs POS tags), used to pick a phrase-level label.
PARSER = Pipeline(
    processors="tokenize, pos, constituency",
    **_STANZA_COMMON,
)

# ROUGE metric, used to measure how much of a text is covered by its entities.
rouge = evaluate.load("rouge")

In [14]:
# Columns read from each batch by get_cloze_question_batch to rebuild
# per-example dicts. Built by concatenation because `list.extend` mutates in
# place and returns None. "cloze_question" is deliberately not listed: it is the
# column being produced, so it does not exist yet when the batches are read.
KEYS = viquad.column_names + ["type"]

# Default minimum ROUGE-L score for get_ner to accept the entity-based type.
THRESHOLD = 0.8

# Constituency labels that get_labels accepts as an answer type.
POS_TAGS = ["NP", "AP", "VP", "S", "Num"]

# Placeholder written into the "type" column / cloze question for each label.
# NOTE(review): "VERBPHARSE" looks like a misspelling of "VERBPHRASE"; it is left
# unchanged here because the value ends up in the generated data.
# NOTE(review): "AP" is mapped to "ADVPHRASE" - confirm against the parser's tag
# set that AP is not the adjective-phrase label.
POS_REPLACE = {
    "Num": "NUMBER",
    "NP": "NOUNPHRASE",
    "AP": "ADVPHRASE",
    "VP": "VERBPHARSE",
    "S": "CLAUSE",
}


def text_tokenize(text):
    """Split `text` into sentences and words and re-join them as one string.

    Each sentence from ``sent_tokenize`` is word-tokenized; the words of a
    sentence are joined with a single newline and the sentences are joined
    with a blank line (two newlines).
    """
    sentence_blocks = []
    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence)
        sentence_blocks.append("\n".join(words))
    return "\n\n".join(sentence_blocks)


def get_ner(text, threshold=THRESHOLD):
    """Return the dominant named-entity type of `text`, or None.

    The text is tokenized with ``text_tokenize`` and passed to the ``NER``
    pipeline. The recognised entity texts are joined with spaces and compared
    with the original text using ROUGE-L; the most common entity type is
    returned only when that score reaches `threshold`.

    Args:
        text: The string to classify.
        threshold: Minimum ROUGE-L score required to accept the entity type.

    Returns:
        The most frequent entity type, or None when no entity is recognised or
        the entities do not cover enough of the text.
    """
    entities = NER(text_tokenize(text)).entities
    if not entities:
        # Nothing to vote on. Returning early also avoids statistics.mode([])
        # raising StatisticsError when the threshold is 0 or negative, and
        # skips a ROUGE computation whose outcome is already known.
        return None

    # NOTE(review): the metric's default tokenizer is used; confirm it handles
    # Vietnamese diacritics the way this coverage check expects.
    score = rouge.compute(
        predictions=[" ".join([ent.text for ent in entities])],
        references=[text],
    )
    if score["rougeL"] >= threshold:
        return mode([ent.type for ent in entities])
    return None


def get_labels(node, tags=None):
    """Return the first accepted label found in a constituency tree.

    The tree is searched depth-first, parent before children and children in
    order. Leaf nodes are never matched. Every child is examined: the search
    only moves past a child when its whole subtree contains no accepted label.

    Args:
        node: A tree node exposing ``is_leaf()``, ``label`` and ``children``.
        tags: Collection of accepted labels. Defaults to ``POS_TAGS``.

    Returns:
        The first matching label, or None when the subtree has none.
    """
    if tags is None:
        tags = POS_TAGS
    if node.is_leaf():
        return None
    if node.label in tags:
        return node.label
    for child in node.children:
        found = get_labels(child, tags)
        # Keep looking through the remaining siblings instead of giving up
        # after the first child.
        if found is not None:
            return found
    return None


def get_pos(text):
    """Return the placeholder for the dominant constituency label of `text`.

    The text is word-tokenized, re-joined with spaces and parsed with
    ``PARSER``. For every parsed sentence ``get_labels`` proposes one label;
    the most common proposal is translated through ``POS_REPLACE``.

    Returns:
        The ``POS_REPLACE`` value for the most common label, or
        "MISCELLANEOUS" when no sentence yields a label.
    """
    parsed = PARSER(" ".join(word_tokenize(text)))
    candidates = [get_labels(sentence.constituency) for sentence in parsed.sentences]
    found = [label for label in candidates if label is not None]
    if not found:
        return "MISCELLANEOUS"
    return POS_REPLACE[mode(found)]


def get_type(answer):
    """Classify `answer`: named-entity type first, phrase label as fallback.

    Returns the result of ``get_ner`` when it is not None, otherwise the
    result of ``get_pos``.
    """
    entity_type = get_ner(answer)
    if entity_type is None:
        return get_pos(answer)
    return entity_type


def get_type_batch(batch):
    """Apply ``get_type`` to every answer in `batch`, preserving order."""
    types = []
    for answer in batch:
        types.append(get_type(answer))
    return types


def get_cloze_question(entry):
    """Build a cloze question by masking the answer inside its sentence.

    The context is split into sentences (underthesea, then the
    ``SENT_TOKENIZE`` pipeline). The first sentence that ends after
    ``entry["start"]`` and contains the answer text is returned with every
    occurrence of the answer replaced by ``entry["type"]``.

    Args:
        entry: Mapping with "context", "answer", "start" (character offset of
            the answer in the context) and "type" (the placeholder).

    Returns:
        The masked sentence, or the whole context with the answer masked when
        no sentence qualifies.
    """
    context = entry["context"]
    answer = entry["answer"]
    start = entry["start"]
    placeholder = entry["type"]

    doc = SENT_TOKENIZE("\n\n".join(sent_tokenize(context)))

    # Sentence boundaries are measured in the ORIGINAL context, because that is
    # the coordinate system of `start`. Offsets reported by the pipeline refer
    # to the re-joined text handed to it, and they are absolute positions, so
    # neither using them directly nor summing them lines up with `start`.
    cursor = 0
    for sent in doc.sentences:
        begin = context.find(sent.text, cursor)
        if begin == -1:
            # The sentence text cannot be located in the context, so its
            # position is unknown; skip it rather than guess.
            continue
        end = begin + len(sent.text)
        cursor = end
        if start < end and answer in sent.text:
            return sent.text.replace(answer, placeholder)

    # No sentence matched: fall back to masking the answer in the full context.
    return context.replace(answer, placeholder)


def get_cloze_question_batch(batch):
    """Build one space-joined, word-tokenized cloze question per batch row.

    Args:
        batch: Column-oriented mapping (column name -> list of values), as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A list with one cloze-question string per row, in row order.
    """
    # Use KEYS when it is a usable list; otherwise fall back to the batch's own
    # columns. Either way only columns actually present are read, so an unset
    # KEYS or a not-yet-created column cannot break the mapping.
    wanted = KEYS if KEYS else list(batch.keys())
    keys = [key for key in wanted if key in batch]
    if not keys:
        return []

    # Fetch each column once, then pivot the columns into per-row dicts.
    columns = {key: batch[key] for key in keys}
    n_rows = len(columns[keys[0]])
    entries = [{key: columns[key][i] for key in keys} for i in range(n_rows)]

    return [" ".join(word_tokenize(get_cloze_question(entry))) for entry in entries]

In [7]:
# Working sample for the steps below: the first 100 rows of `viquad`.
viquad_temp = viquad.select(indices=range(100))
viquad_temp

Dataset({
    features: ['id', 'title', 'context', 'question', 'start', 'answer'],
    num_rows: 100
})

In [8]:
# Add a "type" column computed from each answer. Rows are processed in batches
# whose size is one tenth of the sample length.
viquad_temp = viquad_temp.map(
    lambda batch: dict(type=get_type_batch(batch["answer"])),
    batch_size=len(viquad_temp) // 10,
    batched=True,
)
viquad_temp

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'start', 'answer', 'type'],
    num_rows: 100
})

In [15]:
# Add a "cloze_question" column built from each row's context, answer and type.
# Rows are processed in batches whose size is one tenth of the sample length.
viquad_temp = viquad_temp.map(
    lambda batch: dict(cloze_question=get_cloze_question_batch(batch)),
    batch_size=len(viquad_temp) // 10,
    batched=True,
)
viquad_temp

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'start', 'answer', 'type', 'cloze_question'],
    num_rows: 100
})