Credits: Kamil Tagowski @ktagowski

In [None]:
!pip3 install datasets transformers sentencepiece evaluate

In [2]:
from huggingface_hub import login

# Interactive Hugging Face Hub login; renders a credentials widget in the notebook.
# The model download in the next cell passes token=True — presumably it relies on
# the token stored by this call (confirm).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import datasets
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Hub identifier of the seq2seq checkpoint used for both the model and the tokenizer.
model_name = "clarin-knext/plt5-large-poquad"
# NOTE(review): only the "train" split is loaded, so every later cell (including the
# metric) runs on training data — confirm this is intended rather than "validation".
dataset = datasets.load_dataset("clarin-pl/poquad", split="train")
# token=True is passed for the model download only; the tokenizer call below does not
# pass it. Presumably the token comes from the earlier login() — confirm.
model = T5ForConditionalGeneration.from_pretrained(model_name, token=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading builder script:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/317 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/47.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.29M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.28G [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/417 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [5]:
from typing import Dict, Any, List, Tuple, Optional
from transformers import AutoTokenizer
from dataclasses import dataclass

def convert_squad_to_seq2seq_format(question: str, context: str) -> str:
    """Build the single-string seq2seq prompt for one question/context pair.

    Leading whitespace is removed from both pieces (trailing whitespace is kept)
    and they are laid out as ``question: <question> context: <context>``.

    Args:
        question: Question text.
        context: Passage the question refers to.

    Returns:
        The combined prompt string.
    """
    clean_question = question.lstrip()
    clean_context = context.lstrip()
    return f"question: {clean_question} context: {clean_context}"

def preprocess_squad_batch(
    examples: Dict[str, Any],
    question_field: str = "question",
    context_field: str = "context",
    target_field: str = "answers"
) -> Tuple[List[str], List[str]]:
    """Turn a column-oriented batch of SQuAD-style examples into seq2seq inputs and targets.

    Args:
        examples: Batch mapping column names to equally long lists.
        question_field: Key of the question column.
        context_field: Key of the context column.
        target_field: Key of the answers column; each entry is expected to be a
            mapping with a ``"text"`` list (or a falsy value when there is no answer).

    Returns:
        ``(inputs, targets)``: the prompt built by ``convert_squad_to_seq2seq_format``
        for every question/context pair, and the first answer text of every example.
        Examples without any answer text get the empty string as target.
    """
    questions = examples[question_field]
    contexts = examples[context_field]
    answers = examples[target_field]

    inputs = [
        convert_squad_to_seq2seq_format(question, context)
        for question, context in zip(questions, contexts)
    ]
    # An unanswerable example may arrive as a non-empty dict holding an empty
    # "text" list; the dict itself is truthy, so the list has to be checked as
    # well or indexing [0] raises IndexError.
    targets = [
        answer["text"][0] if answer and answer["text"] else ""
        for answer in answers
    ]
    return inputs, targets

@dataclass
class InputsEncoder:
    """Callable batch encoder producing tokenized seq2seq features from SQuAD-style examples.

    Attributes:
        tokenizer: Tokenizer invoked on the prompt strings and their targets.
        max_seq_length: Value forwarded to the tokenizer as ``max_length``
            (truncation is enabled).
    """

    tokenizer: AutoTokenizer
    max_seq_length: int

    def convert_to_features_train(
        self,
        example_batch: Dict[str, Any],
        indices: Optional[List[int]] = None
    ) -> Any:
        """Tokenize one batch of raw examples.

        Args:
            example_batch: Column-oriented batch handed to ``preprocess_squad_batch``.
            indices: Accepted for interface compatibility; not used.

        Returns:
            Whatever the tokenizer returns for the prompts and their ``text_target``.
        """
        prompts, answers = preprocess_squad_batch(example_batch)
        return self.tokenizer(
            prompts,
            text_target=answers,
            truncation=True,
            max_length=self.max_seq_length,
        )

    def __call__(
        self,
        example_batch: Dict[str, Any],
        indices: Optional[List[int]] = None
    ) -> Any:
        """Make the instance usable as a plain function by delegating to ``convert_to_features_train``."""
        features = self.convert_to_features_train(
            example_batch=example_batch,
            indices=indices,
        )
        return features

In [6]:
# Feature names that are allowed to stay in the dataset handed to the DataLoader.
# NOTE(review): the encoder defined above only calls the tokenizer with text_target,
# so several of these (e.g. start/end positions) are presumably never produced — confirm.
loader_columns = [
    'datasets_idx',
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'start_positions',
    'end_positions',
    'labels'
]
# Raw dataset columns that are not in the list above; used as remove_columns
# when the dataset is mapped in the next cell.
columns_to_ignore = [c for c in dataset.column_names if c not in loader_columns]
# Bare expression so the notebook displays the resulting list.
columns_to_ignore

['id', 'title', 'context', 'question', 'answers']

In [7]:
# max_seq_length is forwarded by the encoder as the tokenizer's max_length (with truncation).
encoder = InputsEncoder(tokenizer=tokenizer, max_seq_length=384)
# Apply the encoder batch-wise over the whole split and drop the raw columns
# collected in columns_to_ignore, keeping only what the encoder returns.
dataset_transformed = dataset.map(
    encoder,
    batched=True,
    remove_columns=columns_to_ignore,
)

Map:   0%|          | 0/46187 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

# Collator for the tokenized examples. Label padding uses -100; the evaluation
# loop later swaps -100 back to the pad token id before decoding the labels.
# The model is passed in as well — presumably so the collator can derive the
# decoder_input_ids that the evaluation loop reads from each batch (confirm).
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,  # if training_args.fp16 else None,
)
# shuffle=False keeps dataset order. NOTE(review): the evaluation loop builds example
# ids from a hard-coded 32, so that value must stay in sync with batch_size here.
dataloader = DataLoader(
    dataset_transformed, batch_size=32, shuffle=False, collate_fn=data_collator
)

In [13]:
import torch
from tqdm.autonotebook import tqdm
import numpy as np

# --- Evaluation settings ----------------------------------------------------
# Prefer the first GPU, but fall back to the CPU so the cell still runs without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Number of batches to evaluate (positive int). 1 keeps the quick smoke test on
# the first batch; set to None to run over the whole dataloader.
max_eval_batches = 1
# Value passed to generate() as max_length.
max_answer_length = 100


def _decode_until_eos(token_ids):
    """Decode a batch of token ids, keeping the text before the first "</s>" and dropping "<pad> " markers."""
    return [
        text.split("</s>")[0].replace("<pad> ", "")
        for text in tokenizer.batch_decode(token_ids)
    ]


model = model.to(device)
all_predictions = []
references = []
with torch.no_grad():
    for batch_id, batch in enumerate(tqdm(dataloader)):
        # Only the encoder inputs are needed on the device; labels stay on the
        # CPU because they are used for decoding only. (The former extra forward
        # pass computed a loss that was never read, so it has been dropped.)
        predictions = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=max_answer_length,
        ).cpu()

        # -100 is the collator's label padding value and is not a real token id;
        # replace it with the pad token id so the labels can be decoded.
        labels = batch["labels"].cpu()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_input = _decode_until_eos(batch["input_ids"].cpu())
        decoded_labels = _decode_until_eos(labels)
        decoded_predictions = _decode_until_eos(predictions)

        # Ids continue from the number of examples collected so far, so they
        # stay unique and aligned whatever batch size the dataloader uses.
        first_id = len(all_predictions)
        all_predictions.extend(
            [
                {
                    "id": str(first_id + k),
                    # "Nieznane" is mapped to the empty (no-answer) prediction.
                    "prediction_text": "" if text == "Nieznane" else text,
                    "no_answer_probability": 0.0,
                }
                for k, text in enumerate(decoded_predictions)
            ]
        )
        references.extend(
            [
                {
                    "id": str(first_id + k),
                    # The squad_v2 metric treats a question as unanswerable only
                    # when its reference text list is empty, so an empty gold
                    # answer must not be sent as [""].
                    "answers": {
                        "text": [text] if text else [],
                        "answer_start": [0] if text else [],
                    },
                }
                for k, text in enumerate(decoded_labels)
            ]
        )

        if max_eval_batches is not None and batch_id + 1 >= max_eval_batches:
            break

  0%|          | 0/1444 [00:00<?, ?it/s]

In [14]:
# Show the decoded inputs, gold answers and model predictions left over from the
# last batch processed by the evaluation loop above.
decoded_input, decoded_labels, decoded_predictions

(['question: Co było powodem powrócenia konceptu porozumieniu monachijskiego? context: Projekty konfederacji zaczęły się załamywać 5 sierpnia 1942. Ponownie wróciła kwestia monachijska, co uaktywniło się wymianą listów Ripka – Stroński. Natomiast 17 sierpnia 1942 doszło do spotkania E. Beneša i J. Masaryka z jednej a Wł. Sikorskiego i E. Raczyńskiego z drugiej strony. Polscy dyplomaci zaproponowali podpisanie układu konfederacyjnego. W następnym miesiącu, tj. 24 września, strona polska przesłała na ręce J. Masaryka projekt deklaracji o przyszłej konfederacji obu państw. Strona czechosłowacka projekt przyjęła, lecz już w listopadzie 1942 E. Beneš podważył ideę konfederacji. W zamian zaproponowano zawarcie układu sojuszniczego z Polską na 20 lat (formalnie nastąpiło to 20 listopada 1942).',
  'question: Pomiędzy jakimi stronami odbyło się zgromadzenie w sierpniu 1942 roku? context: Projekty konfederacji zaczęły się załamywać 5 sierpnia 1942. Ponownie wróciła kwestia monachijska, co uakty

In [15]:
import evaluate

# SQuAD v2 metric from the `evaluate` library.
metric = evaluate.load("squad_v2")
# Scores only the examples collected by the evaluation loop (a single batch when
# that loop stops after its first iteration, as the reported total of 32 shows).
# Every prediction carries no_answer_probability 0.0 and the threshold is 0.
metric.compute(predictions=all_predictions, references=references, no_answer_threshold=0)

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

{'exact': 53.125,
 'f1': 70.63235780423281,
 'total': 32,
 'HasAns_exact': 53.125,
 'HasAns_f1': 70.63235780423281,
 'HasAns_total': 32,
 'best_exact': 53.125,
 'best_exact_thresh': 0.0,
 'best_f1': 70.63235780423281,
 'best_f1_thresh': 0.0}