# Installing libraries


In [1]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 14.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 73.3 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 

# Importing required libraries

In [2]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
import torch 
import torch.nn as nn
from transformers import DefaultDataCollator
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from evaluate import evaluator

# Loading dataset

In [3]:
training_dataset = load_dataset("squad", split="train[:5000]")
validation_dataset = load_dataset("squad", split="validation[:1000]")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

  

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.




In [4]:
training_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [44]:
training_dataset.info

DatasetInfo(description='Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n', citation='@article{2016arXiv160605250R,\n       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},\n                 Konstantin and {Liang}, Percy},\n        title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}",\n      journal = {arXiv e-prints},\n         year = 2016,\n          eid = {arXiv:1606.05250},\n        pages = {arXiv:1606.05250},\narchivePrefix = {arXiv},\n       eprint = {1606.05250},\n}\n', homepage='https://rajpurkar.github.io/SQuAD-explorer/', license='', features={'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), leng

In [45]:
training_dataset.column_names

['input_ids', 'attention_mask', 'start_positions', 'end_positions']

In [5]:
validation_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})

In [46]:
validation_dataset.info

DatasetInfo(description='Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n', citation='@article{2016arXiv160605250R,\n       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},\n                 Konstantin and {Liang}, Percy},\n        title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}",\n      journal = {arXiv e-prints},\n         year = 2016,\n          eid = {arXiv:1606.05250},\n        pages = {arXiv:1606.05250},\narchivePrefix = {arXiv},\n       eprint = {1606.05250},\n}\n', homepage='https://rajpurkar.github.io/SQuAD-explorer/', license='', features={'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), leng

In [47]:
validation_dataset.column_names

['input_ids', 'attention_mask', 'start_positions', 'end_positions']

In [6]:
sample_idx = 29
training_dataset[sample_idx]

{'id': '5733ac31d058e614000b5ff3',
 'title': 'University_of_Notre_Dame',
 'context': "The Joan B. Kroc Institute for International Peace Studies at the University of Notre Dame is dedicated to research, education and outreach on the causes of violent conflict and the conditions for sustainable peace. It offers PhD, Master's, and undergraduate degrees in peace studies. It was founded in 1986 through the donations of Joan B. Kroc, the widow of McDonald's owner Ray Kroc. The institute was inspired by the vision of the Rev. Theodore M. Hesburgh CSC, President Emeritus of the University of Notre Dame. The institute has contributed to international policy discussions about peace building practices.",
 'question': 'What institute at Notre Dame studies  the reasons for violent conflict?',
 'answers': {'text': ['Joan B. Kroc Institute for International Peace Studies'],
  'answer_start': [4]}}

# Defining model

In [7]:
#Here we are using 5 different models, uncomment one specific model and run the code and comment other models.
model_name = "deepset/roberta-base-squad2"
#model_name = "deepset/electra-base-squad2"
#model_name = "deepset/deberta-v3-base-squad2"
#model_name = "deepset/bert-base-uncased-squad2"
#model_name = "deepset/bert-base-cased-squad2"

# Defining Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

# Preprocessing dataset

1.   Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting truncation="only_second".
2.   Next, map the start and end positions of the answer to the original context by setting return_offset_mapping=True.
3.   With the mapping in hand, now you can find the start and end tokens of the answer. Use the sequence_ids method to find which part of the offset corresponds to the question and which corresponds to the context.

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(questions, examples["context"], max_length=384, truncation="only_second", return_offsets_mapping=True, padding="max_length")
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
training_dataset = training_dataset.map(preprocess_function, batched=True, remove_columns=training_dataset.column_names)
validation_dataset = validation_dataset.map(preprocess_function, batched=True, remove_columns=validation_dataset.column_names)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
data_collator = DefaultDataCollator()

# Train model

In [12]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [13]:
training_args = TrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 939
  Number of trainable parameters = 124056578


Epoch,Training Loss,Validation Loss
1,No log,0.98593
2,0.507200,1.085036
3,0.507200,1.181766


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to deepset/roberta-base-squad2/checkpoint-500
Configuration saved in deepset/roberta-base-squad2/checkpoint-500/config.json
Model weights saved in deepset/roberta-base-squad2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in deepset/roberta-base-squad2/checkpoint-500/tokenizer_config.json
Special tokens file saved in deepset/roberta-base-squad2/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=939, training_loss=0.401460446488743, metrics={'train_runtime': 1160.2226, 'train_samples_per_second': 12.929, 'train_steps_per_second': 0.809, 'total_flos': 2939588513280000.0, 'train_loss': 0.401460446488743, 'epoch': 3.0})

# Testing model predictions

In [16]:
val_dataset = load_dataset("squad", split="validation[:1000]")



In [17]:
val_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})

In [18]:
sample_idx = 729

In [19]:
val_dataset[sample_idx]

{'id': '56bec9133aeaaa14008c9447',
 'title': 'Super_Bowl_50',
 'context': "On Carolina's next possession fullback Mike Tolbert lost a fumble while being tackled by safety Darian Stewart, which linebacker Danny Trevathan recovered on the Broncos 40-yard line. However, the Panthers soon took the ball back when defensive end Kony Ealy tipped a Manning pass to himself and then intercepted it, returning the ball 19 yards to the Panthers 39-yard line with 1:55 left on the clock. The Panthers could not gain any yards with their possession and had to punt. After a Denver punt, Carolina drove to the Broncos 45-yard line. But with 11 seconds left, Newton was sacked by DeMarcus Ware as time expired in the half.",
 'question': 'Which Panther tipped a Manning pass to himself and picked it off?',
 'answers': {'text': ['Kony Ealy', 'Kony Ealy', 'Ealy'],
  'answer_start': [249, 249, 254]}}

In [20]:
question = val_dataset[sample_idx]["question"]
context = val_dataset[sample_idx]["context"]

In [21]:
question

'Which Panther tipped a Manning pass to himself and picked it off?'

In [22]:
context

"On Carolina's next possession fullback Mike Tolbert lost a fumble while being tackled by safety Darian Stewart, which linebacker Danny Trevathan recovered on the Broncos 40-yard line. However, the Panthers soon took the ball back when defensive end Kony Ealy tipped a Manning pass to himself and then intercepted it, returning the ball 19 yards to the Panthers 39-yard line with 1:55 left on the clock. The Panthers could not gain any yards with their possession and had to punt. After a Denver punt, Carolina drove to the Broncos 45-yard line. But with 11 seconds left, Newton was sacked by DeMarcus Ware as time expired in the half."

In [23]:
inputs = tokenizer(question, context, return_tensors="pt")
inputs_cuda = {key: tensor.cuda() for key, tensor in inputs.items()}

In [24]:
with torch.no_grad():
    outputs = model(**inputs_cuda)

In [25]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-6.2589, -8.4990, -6.6036, -9.1918, -9.2627, -9.2082, -9.3619, -9.2294,
         -9.3128, -9.3217, -9.1336, -9.3302, -9.2592, -9.1943, -9.2890, -9.4034,
         -8.5893, -7.1069, -9.3624, -8.9657, -8.9769, -8.7694, -9.0079, -9.1923,
         -9.5605, -9.2370, -9.2441, -9.1910, -9.3819, -9.4325, -9.2845, -9.2672,
         -8.9997, -8.7468, -9.1490, -8.9883, -9.5440, -9.2657, -9.0762, -8.8289,
         -9.0313, -9.3921, -9.4235, -9.1885, -9.0333, -8.9610, -8.6891, -9.1670,
         -9.4173, -9.4671, -9.5093, -9.3743, -9.4167, -9.4855, -8.4654, -7.1154,
         -8.8062, -8.9742, -8.8045, -8.9997, -9.2824, -9.0048, -1.7134, -4.7049,
          4.8360, -4.2458, -1.4371, -1.9178, -8.6660, -9.2727, -9.3032, -9.4319,
         -9.1841, -9.2383, -9.3934, -9.2800, -8.5158, -9.3317, -9.3775, -8.7096,
         -8.9587, -9.2558, -9.0214, -9.6173, -9.1504, -8.9151, -8.8309, -8.9030,
         -9.3606, -9.4310, -9.5460, -8.8691, -8.3377, -9

In [35]:
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax()

In [36]:
context

"On Carolina's next possession fullback Mike Tolbert lost a fumble while being tackled by safety Darian Stewart, which linebacker Danny Trevathan recovered on the Broncos 40-yard line. However, the Panthers soon took the ball back when defensive end Kony Ealy tipped a Manning pass to himself and then intercepted it, returning the ball 19 yards to the Panthers 39-yard line with 1:55 left on the clock. The Panthers could not gain any yards with their possession and had to punt. After a Denver punt, Carolina drove to the Broncos 45-yard line. But with 11 seconds left, Newton was sacked by DeMarcus Ware as time expired in the half."

In [37]:
question

'Which Panther tipped a Manning pass to himself and picked it off?'

In [39]:
predicted_answer = inputs.input_ids[0, answer_start : answer_end + 1]
tokenizer.decode(predicted_answer)

' Kony Ealy'

# Evaluating model on validation dataset

In [30]:
trainer.evaluate(validation_dataset)

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'eval_loss': 1.1817659139633179,
 'eval_runtime': 23.017,
 'eval_samples_per_second': 43.446,
 'eval_steps_per_second': 2.737,
 'epoch': 3.0}

In [31]:
task_evaluator = evaluator("question-answering")

In [32]:
model

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [41]:
result = task_evaluator.compute(model_or_pipeline=model, tokenizer=tokenizer, data=val_dataset, metric="squad_v2", squad_v2_format=True)

In [42]:
result

{'exact': 89.3,
 'f1': 92.7106662274923,
 'total': 1000,
 'HasAns_exact': 89.3,
 'HasAns_f1': 92.7106662274923,
 'HasAns_total': 1000,
 'best_exact': 89.3,
 'best_exact_thresh': 0.9999721050262451,
 'best_f1': 92.7106662274923,
 'best_f1_thresh': 0.9999721050262451,
 'total_time_in_seconds': 16.367767864000143,
 'samples_per_second': 61.09568563710119,
 'latency_in_seconds': 0.016367767864000144}