In [41]:
from huggingface_hub import notebook_login
notebook_login ()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import numpy as np
from tqdm.auto import tqdm
import collections
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
import evaluate

device = torch.device("cuda:1")

# setup config
MODEL_NAME = "distilbert-base-uncased"
# document max tokens
MAX_LENGTH = 384 
STRIDE = 128 

# setup Dataset
DATASET_NAME = "squad_v2"
raw_dataset = load_dataset(DATASET_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [12]:
example = raw_dataset['train'][0]
example

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [15]:
# Tokenize inputs
sample_input = tokenizer(
    text=example["question"],
    text_pair=example["context"],
    max_length=MAX_LENGTH,
    truncation="only_second",
    stride=STRIDE,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)
sample_input

{'input_ids': [[101, 2043, 2106, 20773, 2707, 3352, 2759, 1029, 102, 20773, 21025, 19358, 22815, 1011, 5708, 1006, 1013, 12170, 23432, 29715, 3501, 29678, 12325, 29685, 1013, 10506, 1011, 10930, 2078, 1011, 2360, 1007, 1006, 2141, 2244, 1018, 1010, 3261, 1007, 2003, 2019, 2137, 3220, 1010, 6009, 1010, 2501, 3135, 1998, 3883, 1012, 2141, 1998, 2992, 1999, 5395, 1010, 3146, 1010, 2016, 2864, 1999, 2536, 4823, 1998, 5613, 6479, 2004, 1037, 2775, 1010, 1998, 3123, 2000, 4476, 1999, 1996, 2397, 4134, 2004, 2599, 3220, 1997, 1054, 1004, 1038, 2611, 1011, 2177, 10461, 1005, 1055, 2775, 1012, 3266, 2011, 2014, 2269, 1010, 25436, 22815, 1010, 1996, 2177, 2150, 2028, 1997, 1996, 2088, 1005, 1055, 2190, 1011, 4855, 2611, 2967, 1997, 2035, 2051, 1012, 2037, 14221, 2387, 1996, 2713, 1997, 20773, 1005, 1055, 2834, 2201, 1010, 20754, 1999, 2293, 1006, 2494, 1007, 1010, 2029, 2511, 2014, 2004, 1037, 3948, 3063, 4969, 1010, 3687, 2274, 8922, 2982, 1998, 2956, 1996, 4908, 2980, 2531, 2193, 1011, 2028, 3

In [18]:
len(sample_input.input_ids[0])

384

In [4]:
def preprocess_training_examples(examples):
    '''
        preprocess training data per batch
    '''
    # Preprocess batch questions
    questions = [q.strip() for q in examples["question"]]

    # Tokenize inputs
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Extract offset mappings and remove from inputs
    offset_mapping = inputs.pop("offset_mapping")

    # Extract sample mapping and remove from inputs
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # Extract answers
    answers = examples["answers"]

    # Initialize start and end positions
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        sequence_ids = inputs.sequence_ids(i)

        # Find context start and end
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Get answer
        answer = answers[sample_idx]

        if len(answer['text']) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Get start and end character positions
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])

            # Check if answer spans are within context
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Find start position
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)
    
                # Find end position
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    # Update inputs with start and end positions
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [21]:
train_dataset = raw_dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns = raw_dataset["train"].column_names,
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [22]:
sample = next(iter(train_dataset))
sample.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [23]:
len(sample['input_ids']), sample['start_positions'], sample['end_positions']

(384, 75, 78)

In [24]:
sample['start_positions']

75

In [31]:
def preprocess_validation_examples(examples):
    '''
        preprocess validation data per batch
    '''
    # Preprocess batch questions
    questions = [q.strip() for q in examples["question"]]

    # Tokenize inputs
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Extract offset mappings and remove from inputs
    # offset_mapping = inputs.pop("offset_mapping")
    example_ids = []

    # Extract sample mapping and remove from inputs
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # Modify answer offset
    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # remove unuse offset
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None \
                for k, o in enumerate(offset)
        ]

    # Update inputs with start and end positions
    inputs["example_id"] = example_ids
    return inputs

In [32]:
validation_dataset = raw_dataset["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns = raw_dataset["validation"].column_names
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [33]:
sample = next(iter(validation_dataset))
sample.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'example_id'])

In [37]:
len(sample['input_ids']), len(sample['offset_mapping']), len(sample['example_id'])

(384, 384, 24)

In [38]:
len(raw_dataset['validation']), len(validation_dataset)

(11873, 12134)

In [39]:
len(raw_dataset['train']), len(train_dataset)

(130319, 131754)

In [40]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
args = TrainingArguments(
    output_dir="distilbert-finetuned-squadv2",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False
)

In [46]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [47]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)

trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.




Step,Training Loss
500,1.9959
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0
4000,0.0
4500,0.0
5000,0.0


KeyboardInterrupt: 