### QA. SQUAD dataset

You can work with Squad datasets (any other available in hugging face) in different formats. 
* via library datasets
* via already prepared pipelines
* directly with data loaders to classify and finetune. 



In [None]:
!pip install datasets
!pip install transformers

In [2]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
from pprint import pprint

datasets = list_datasets()
metrics = list_metrics()

# Print all the available datasets
pprint(datasets[:10], compact=True)
# List all the available metrics
pprint(metrics, compact=True)

['assin', 'ar_res_reviews', 'ambig_qa', 'bianet', 'ag_news', 'ajgt_twitter_ar',
 'aeslc', 'bc2gm_corpus', 'air_dialogue', 'acronym_identification']
['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'chrf', 'code_eval', 'comet',
 'competition_math', 'coval', 'cuad', 'exact_match', 'f1', 'frugalscore',
 'gleu', 'glue', 'google_bleu', 'indic_glue', 'mae', 'mahalanobis',
 'matthews_correlation', 'mauve', 'mean_iou', 'meteor', 'mse', 'pearsonr',
 'perplexity', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari', 'seqeval',
 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer', 'wiki_split',
 'xnli', 'xtreme_s']


In [4]:
# Load a dataset and print the first example in the training set
squad_dataset = load_dataset('squad')

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
print(squad_dataset["train"])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})


In [7]:
# Load a metric
squad_metric = load_metric('squad')

In [8]:
# Process the dataset - add a column with the length of the context texts
dataset_with_length = squad_dataset.map(lambda x: {"length": len(x["context"])})

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-43e42ded5a4064cc.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-359b86046b00fe4b.arrow


In [9]:
# Process the dataset - tokenize the context texts
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenized_dataset = squad_dataset.map(lambda x: tokenizer(x['context']), batched=True)
tokenized_dataset["train"]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

  0%|          | 0/88 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/11 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 87599
})

In [10]:
squad_dataset = list_datasets(with_details=True)[datasets.index('squad')]

pprint(squad_dataset.__dict__)

{'author': None,
 'cardData': {'annotations_creators': ['crowdsourced'],
              'language_creators': ['crowdsourced', 'found'],
              'languages': ['en'],
              'licenses': ['cc-by-4-0'],
              'multilinguality': ['monolingual'],
              'paperswithcode_id': 'squad',
              'pretty_name': 'SQuAD',
              'size_categories': ['10K<n<100K'],
              'source_datasets': ['extended|wikipedia'],
              'task_categories': ['question-answering'],
              'task_ids': ['extractive-qa']},
 'citation': '@article{2016arXiv160605250R,\n'
             '       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and '
             '{Lopyrev},\n'
             '                 Konstantin and {Liang}, Percy},\n'
             '        title = "{SQuAD: 100,000+ Questions for Machine '
             'Comprehension of Text}",\n'
             '      journal = {arXiv e-prints},\n'
             '         year = 2016,\n'
             '          eid 

In [11]:
import torch 
from datasets import load_dataset

dataset = load_dataset('squad')

def get_correct_alignement(context, answer):
    """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
    gold_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(gold_text)
    if context[start_idx:end_idx] == gold_text:
        return start_idx, end_idx       # When the gold label position is good
    elif context[start_idx-1:end_idx-1] == gold_text:
        return start_idx-1, end_idx-1   # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
        return start_idx-2, end_idx-2   # When the gold label is off by two character
    else:
        raise ValueError()

# Tokenize our training dataset
def convert_to_features(example_batch):
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer(example_batch['context'], example_batch['question'], truncation=True)

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methods.
    start_positions, end_positions = [], []
    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
        start_idx, end_idx = get_correct_alignement(context, answer)
        start_positions.append(encodings.char_to_token(i, start_idx))
        end_positions.append(encodings.char_to_token(i, end_idx-1))
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

encoded_dataset = dataset.map(convert_to_features, batched=True)

# Format our dataset to outputs torch.Tensor to train a pytorch model
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
encoded_dataset.set_format(type='torch', columns=columns)

# Instantiate a PyTorch Dataloader around our dataset
# Let's do dynamic batching (pad on the fly with our own collate_fn)
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors='pt')
dataloader = torch.utils.data.DataLoader(encoded_dataset['train'], collate_fn=collate_fn, batch_size=8)

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [12]:
from transformers import BertForQuestionAnswering
import torch

# Bert Model with a span classification head on top for extractive question-answering tasks 
# a linear layers on top of the hidden-states output to compute span start logits and span end logits
model = BertForQuestionAnswering.from_pretrained('bert-base-cased', return_dict=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [13]:
# Now let's train our model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.train().to(device)
for i, batch in enumerate(dataloader):
    batch.to(device)
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    model.zero_grad()
    print(f'Step {i} - loss: {loss:.3}')
    if i > 3:
        break

Step 0 - loss: 5.75
Step 1 - loss: 5.74
Step 2 - loss: 5.13
Step 3 - loss: 5.68
Step 4 - loss: 5.46


#### pipelines

In [14]:
from transformers import pipeline

nlp = pipeline("question-answering")

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
"""

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [15]:
result = nlp(question="What is extractive question answering?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

result = nlp(question="What is a good example of a question answering dataset?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'the task of extracting an answer from a text given a question', score: 0.6226, start: 34, end: 95
Answer: 'SQuAD dataset', score: 0.5053, start: 147, end: 160


Here is an example of question answering using a model and a tokenizer.

In [16]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it with the weights stored in the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")


# Define a text and a few questions.
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
     "How many pretrained models are available in 🤗 Transformers?",
     "What does 🤗 Transformers provide?",
     "🤗 Transformers provides interoperability between which frameworks?",
]

# Iterate over the questions and build a sequence from the text and the current question, 
# with the correct model-specific separators token type ids and attention masks.
for question in questions:
     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
     input_ids = inputs["input_ids"].tolist()[0]

     # Pass this sequence through the model. This outputs a range of scores across the entire
     # sequence tokens (question and text), for both the start and end positions.
     outputs = model(**inputs)
     answer_start_scores = outputs.start_logits
     answer_end_scores = outputs.end_logits

     # Compute the softmax of the result to get probabilities over the tokens.
     answer_start = torch.argmax(
         answer_start_scores
     )  # Get the most likely beginning of answer with the argmax of the score
     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

     print(f"Question: {question}")
     print(f"Answer: {answer}")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Question: How many pretrained models are available in 🤗 Transformers?
Answer: over 32 +
Question: What does 🤗 Transformers provide?
Answer: general - purpose architectures
Question: 🤗 Transformers provides interoperability between which frameworks?
Answer: tensorflow 2. 0 and pytorch


### Russian QA models

You can you DeepPavlov models for Russian QA.
* For SQAUD dataset http://docs.deeppavlov.ai/en/master/features/models/squad.html
* For KBQA http://docs.deeppavlov.ai/en/master/features/models/kbqa.html
* Fir ODQA http://docs.deeppavlov.ai/en/master/features/skills/odqa.html


In [None]:
!pip install deeppavlov

##### Restrt runtime here

In [None]:
# Demo is here https://demo.deeppavlov.ai/#/ru/textqa

!python -m deeppavlov install squad_ru_rubert_infer
# !python -m deeppavlov download squad_ru_rubert_infer

In [None]:
!pip install --upgrade tensorflow-gpu==2.2.0 --user
!pip uninstall tensorflow-datasets
!pip install tensorflow-datasets==4.0.0
# reconnect colab

##### Restrt runtime here

In [None]:
from deeppavlov import build_model, configs

model = build_model(configs.squad.squad_ru_rubert_infer, download=True)

In [3]:
res = model(['DeepPavlov это библиотека для NLP задач и постороения диалоговых систем.'], ['Что такое DeepPavlov?'])

In [4]:
res

[['библиотека для NLP задач и постороения диалоговых систем'],
 [15],
 [1729423.0]]

In [5]:
!git clone https://github.com/deepmipt/DeepPavlov.git

Cloning into 'DeepPavlov'...
remote: Enumerating objects: 62067, done.[K
remote: Counting objects: 100% (861/861), done.[K
remote: Compressing objects: 100% (534/534), done.[K
remote: Total 62067 (delta 592), reused 463 (delta 320), pack-reused 61206[K
Receiving objects: 100% (62067/62067), 38.82 MiB | 21.21 MiB/s, done.
Resolving deltas: 100% (47741/47741), done.


In [6]:
!cat ./DeepPavlov/deeppavlov/configs/squad/squad_ru_rubert_infer.json

{
  "dataset_reader": {
    "class_name": "squad_dataset_reader",
    "dataset": "SberSQuADClean",
    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
  },
  "dataset_iterator": {
    "class_name": "squad_iterator",
    "seed": 1337,
    "shuffle": true
  },
  "chainer": {
    "in": ["context_raw", "question_raw"],
    "in_y": ["ans_raw", "ans_raw_start"],
    "pipe": [
      {
        "class_name": "squad_bert_infer",
        "lang": "ru",
        "batch_size": 128,
        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_rubert.json",
        "vocab_file": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt",
        "do_lower_case": false,
        "max_seq_length": 256,
        "in": ["context_raw", "question_raw"],
        "out": ["ans_predicted", "ans_start_predicted", "logits"]
      }
    ],
    "out": ["ans_predicted", "ans_start_predicted", "logits"]
  },
  "train": {
    "sh

In [7]:
# Training. Check config!
from deeppavlov import train_model 

model = train_model("./DeepPavlov/deeppavlov/configs/squad/squad_ru_rubert_infer.json", download=True)