In [2]:
import numpy as np
from tqdm.auto import tqdm
import collections
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from data_utils import SquadDataProcessor
from config import QAConfig
import evaluate

In [3]:
cfg = QAConfig()
cfg.MODEL_NAME = "gpt2-medium"
cfg

QAConfig(MODEL_NAME='gpt2-medium', MAX_LENGTH=384, STRIDE=128, DATASET_NAME='squad_v2', NUM_PROC=8, device='cuda', learning_rate=5e-05, weight_decay=0.01, train_batch_size=16, fp16=True, eval_batch_size=16, num_train_epochs=3, save_total_limit=1, eval_steps=1000, ckpt_dir='distilbert-finetuned-squadv2', N_BEST=20, MAX_ANS_LENGTH=30)

In [4]:
# init model
model = AutoModelForQuestionAnswering.from_pretrained(cfg.MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_NAME)

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
raw_train_dataset = load_dataset(cfg.DATASET_NAME, split ="train[:5%]", num_proc=cfg.NUM_PROC)

In [7]:
raw_train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 6516
})

In [9]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2-medium', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [10]:
examples = raw_train_dataset[:5]
examples

{'id': ['56be85543aeaaa14008c9063',
  '56be85543aeaaa14008c9065',
  '56be85543aeaaa14008c9066',
  '56bf6b0f3aeaaa14008c9601',
  '56bf6b0f3aeaaa14008c9602'],
 'title': ['Beyoncé', 'Beyoncé', 'Beyoncé', 'Beyoncé', 'Beyoncé'],
 'context': ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (

In [14]:
inputs = tokenizer(
    examples["question"],
    examples["context"],
    max_length=cfg.MAX_LENGTH,
    truncation="only_second",
    stride=cfg.STRIDE,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

In [15]:
inputs[0]

Encoding(num_tokens=182, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [16]:
inputs[1]

Encoding(num_tokens=187, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])