In [None]:
!pip install -q transformers datasets evaluate

# Question Answering

Question answering tasks return an answer given a question. There are two common types of question answering tasks:
* Extractive: extract the answer from the given context.
* Abstractive: generate an answer from the context that correctly answers the question.

## Load SQuAD dataset

In [None]:
from datasets import load_dataset

squad = load_dataset('squad', split='train[:5000]')

# Split the 5,000-example subset into a train and a test set. The fixed seed keeps
# the split (and therefore every example and metric shown later) reproducible
# across kernel restarts.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Peek at the first training example to see which fields each record carries.
squad['train'][0]

{'id': '56ceddd1aab44d1400b88b59',
 'title': 'Spectre_(2015_film)',
 'context': 'In September 2015 it was announced that Sam Smith and regular collaborator Jimmy Napes had written the film\'s title theme, "Writing\'s on the Wall", with Smith performing it for the film. Smith said the song came together in one session and that he and Napes wrote it in under half an hour before recording a demo. Satisfied with the quality, the demo was used in the final release.',
 'question': 'How many recording sessions did it take to complete the song?',
 'answers': {'text': ['one'], 'answer_start': [224]}}

* `answers`: the starting location of the answer token and the answer text.
* `context`: background information from which the model needs to extract the answer.
* `question`: the question a model should answer.

## Preprocess

We need to load a DistilBERT tokenizer to process the `question` and `context` fields.

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

Note that
* Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`.
* Map the start and end positions of the answer to the original `context` by setting `return_offsets_mapping=True`.
* With the mapping in hand, we can find the start and end tokens of the answer. Use the `sequence_ids` method to find which part of the offset corresponds to the `question` and which corresponds to the `context`.

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer token span.

    Uses the notebook-level `tokenizer`. Each pair is padded to 384 tokens and
    only the context is truncated when the pair is too long.

    Args:
        examples: Batch dict with parallel lists under 'question', 'context' and
            'answers'; every answers entry has 'text' and 'answer_start' lists,
            of which only the first element is used.

    Returns:
        The tokenizer output with 'offset_mapping' removed and 'start_positions' /
        'end_positions' added (one token index per example). Examples whose answer
        is missing or not fully inside the (possibly truncated) context get (0, 0).
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation='only_second',
        return_offsets_mapping=True,
        padding='max_length',
    )

    # Offsets are only needed to build the labels; they are not a model input.
    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: fall back to the (0, 0) "no answer" label.
        if not answer['answer_start'] or not answer['text']:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends are checked: an answer that is only partly cut off by
        # truncation would otherwise get an end position past the context.
        answer_not_in_context = (
            context_start > context_end
            or offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        )
        if answer_not_in_context:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions.
            # Walk forward to the last token that starts at or before the answer.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward to the first token that ends at or after the answer.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [None]:
# Drop the raw text columns after preprocessing so only model-ready features remain.
raw_columns = squad['train'].column_names
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=raw_columns)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Create a batch of examples using `DefaultDataCollator`. Unlike other data collators in the Transformers library, the `DefaultDataCollator` does not apply any additional preprocessing such as padding.

In [None]:
from transformers import DefaultDataCollator

# No dynamic padding is needed here: preprocess_function already pads every example to max_length.
data_collator = DefaultDataCollator()

## Train

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# The QA head (`qa_outputs`) is not part of this checkpoint, so it is newly
# initialized (see the warning in the output) and has to be trained before use.
model = AutoModelForQuestionAnswering.from_pretrained('distilbert/distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning hyperparameters; run artifacts are written under `output_dir`.
training_args = TrainingArguments(
    output_dir='my_qa_model',
    eval_strategy='epoch',  # evaluate on the test split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # keep the fine-tuned model local
)

# The Trainer must be bound to `trainer`: the following cell calls
# `trainer.train()`, and the earlier misspelling (`trianer`) left that name
# undefined on a fresh kernel.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad['train'],
    eval_dataset=tokenized_squad['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Kick off fine-tuning; expects the Trainer instance to be bound to the name `trainer`.
trainer.train()

## Evaluate

Evaluation for question answering requires a significant amount of postprocessing.

## Inference

In [None]:
# Sample question and the passage the model must extract the answer from.
question = "How many programming languages does BLOOM support?"
# NOTE(review): "46 languages natural languages" looks like a duplicated word, but the
# string is the model input and the cached outputs in this notebook reproduce it verbatim.
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the `stevhliu/my_awesome_qa_model` checkpoint.
# NOTE(review): `question_anwerer` is misspelled ("answerer"); the cell that runs the
# pipeline uses the same spelling, so rename both together.
question_anwerer = pipeline('question-answering', model='stevhliu/my_awesome_qa_model')

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



In [None]:
# Run the pipeline; the result holds a confidence score, the character offsets of the
# answer within `context`, and the extracted answer text.
question_anwerer(question=question, context=context)

{'score': 0.20582668483257294,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}

Manually replicate the results of the `pipeline`:

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Tokenizer and QA model come from the same checkpoint. This rebinds the
# `tokenizer` and `model` names that were used for training earlier in the notebook.
checkpoint = 'stevhliu/my_awesome_qa_model'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)



In [None]:
# Encode the question/context pair as PyTorch tensors and run a gradient-free forward pass.
inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# The highest-scoring start and end logits mark the predicted answer span.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

# Take the span (end token inclusive) from the only sequence in the batch.
answer_span = slice(answer_start_index, answer_end_index + 1)
predict_answer_tokens = inputs['input_ids'][0, answer_span]

tokenizer.decode(predict_answer_tokens)

'176 billion parameters and can generate text in 46 languages natural languages and 13'