In [1]:
import numpy as np
from tqdm.auto import tqdm
import collections
import torch
from datasets import load_dataset
from config import QAConfig
import evaluate
from data_utils import SquadDataProcessor 

IndentationError: expected an indented block after 'if' statement on line 66 (evaluate.py, line 67)

In [None]:
# Instantiate the project's QA configuration; later cells read DATASET_NAME and
# NUM_PROC from it. The bare `cfg` on the last line displays it as the cell output.
cfg = QAConfig()
cfg

In [None]:
# Build the data processor from the config. Later cells call its
# `process_data(...)` method and use its `tokenizer` attribute to decode ids.
data_processor = SquadDataProcessor(cfg)

## Load and tokenize training data

In [16]:
# Fetch the raw training split, then hand it to the processor to be tokenized
# into training features.
raw_train_dataset = load_dataset(
    cfg.DATASET_NAME,
    split="train",
    num_proc=cfg.NUM_PROC,
)
train_data = data_processor.process_data(
    raw_train_dataset,
    data_type="train",
)

Processing train data


Tokenizing train data (num_proc=8):   0%|          | 0/130319 [00:00<?, ? examples/s]

In [17]:
# Show the raw split next to the processed features. The processed set has more
# rows than the raw one — presumably long contexts are split into several
# features; confirm in SquadDataProcessor.
raw_train_dataset, train_data

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 130319
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
     num_rows: 131754
 }))

In [18]:
# Inspect one raw training example (id, title, context, question, answers).
raw_train_dataset[0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [19]:
# List the fields of one processed training feature.
train_data[0].keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [20]:
# Decode the first training feature's token ids back to text to see how the
# question, context, and special tokens were laid out by the processor.
data_processor.tokenizer.decode(train_data[0]["input_ids"])

'[CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [21]:
# Token length of the first training feature; this count includes the [PAD]
# tokens visible in the decoded text above.
len(train_data[0]["input_ids"])

384

## Load and tokenize validation data

Preprocessing the validation data is slightly easier because we don’t need to generate labels (unless we want to compute a validation loss, but that number won’t really tell us how good the model is). The interesting part is converting the model’s predictions back into spans of the original context. For this, we need to store both the offset mappings and some way to match each created feature to the original example it comes from. Since the original dataset has an ID column, we’ll use that ID; the processed validation features therefore carry `offset_mapping` and `example_id` columns.

In [9]:
# Fetch the raw validation split, then hand it to the processor to be tokenized
# into validation features.
raw_val_dataset = load_dataset(
    cfg.DATASET_NAME,
    split="validation",
    num_proc=cfg.NUM_PROC,
)
val_data = data_processor.process_data(
    raw_val_dataset,
    data_type="validation",
)

Processing validation data


Tokenizing validation data (num_proc=8):   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
# Show the raw validation split next to the processed features. As with the
# training data, the processed set has more rows than the raw one — presumably
# because long contexts yield several features; confirm in SquadDataProcessor.
raw_val_dataset, val_data

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 11873
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
     num_rows: 12134
 }))

In [11]:
# Inspect one raw validation example; note that `answers` can hold several
# reference answers for the same question.
raw_val_dataset[0]

{'id': '56ddde6b9a695914005b9628',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'In what country is Normandy located?',
 'answers': {'text': ['France', 'France', 'France', 'France'],
  'answer_start': [159, 159, 159, 159]}}

In [14]:
# List the fields of one processed validation feature: it carries
# offset_mapping and example_id rather than start/end position labels.
val_data[0].keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'example_id'])

In [15]:
# Decode the first validation feature's token ids back to text to check that it
# is laid out the same way as the training features.
data_processor.tokenizer.decode(val_data[0]["input_ids"])

'[CLS] in what country is normandy located? [SEP] the normans ( norman : nourmands ; french : normands ; latin : normanni ) were the people who in the 10th and 11th centuries gave their name to normandy, a region in france. they were descended from norse ( " norman " comes from " norseman " ) raiders and pirates from denmark, iceland and norway who, under their leader rollo, agreed to swear fealty to king charles iii of west francia. through generations of assimilation and mixing with the native frankish and roman - gaulish populations, their descendants would gradually merge with the carolingian - based cultures of west francia. the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [12]:
# Another raw validation example: same context as example 0, different question.
raw_val_dataset[3]

{'id': '56ddde6b9a695914005b962b',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'Who was the Norse leader?',
 'answers': {'text': ['Rollo', 'Rollo', 'Rollo', 'Rollo'],
  'answer_start': [308, 308, 308, 308]}}

In [13]:
# One more raw validation example on the same context, here with a multi-word answer.
raw_val_dataset[2]

{'id': '56ddde6b9a695914005b962a',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'From which countries did the Norse originate?',
 'answers': {'text': ['Denmark, Iceland and Norway',
   'Denmark, Iceland and Norway',
   'Denmark, Iceland and Norway',
   