In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np

from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

from sklearn.metrics import f1_score

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
path = "./Lab_support/Lab3_data/tydiqa_data/"
tydiqa_data = load_from_disk(path)
tydiqa_data

DatasetDict({
    train: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 9211
    })
    validation: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 1031
    })
})

In [3]:
type(tydiqa_data['train'])

datasets.arrow_dataset.Dataset

In [4]:
tydiqa_data['train']

Dataset({
    features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 9211
})

In [5]:
idx = 600

# start index
start_index = tydiqa_data['train'][idx]['annotations']['minimal_answers_start_byte'][0]

# end index
end_index = tydiqa_data['train'][idx]['annotations']['minimal_answers_end_byte'][0]

print(f"Question: {tydiqa_data['train'][idx]['question_text']}")
print(f"\nContext: {tydiqa_data['train'][idx]['document_plaintext'][0:512]} ...")
print(f"\nAnswer: {tydiqa_data['train'][idx]['document_plaintext'][start_index:end_index]}")

Question: What mental effects can a mother experience after childbirth?

Context: 

Postpartum depression (PPD), also called postnatal depression, is a type of mood disorder associated with childbirth, which can affect both sexes.[1][3] Symptoms may include extreme sadness, low energy, anxiety, crying episodes, irritability, and changes in sleeping or eating patterns.[1] Onset is typically between one week and one month following childbirth.[1] PPD can also negatively affect the newborn child.[2]

While the exact cause of PPD is unclear, the cause is believed to be a combination of physi ...

Answer: Postpartum depression (PPD)


The question answering model predicts a start and endpoint in the context to extract as the answer. That's why this NLP task is known as extractive question answering.

To train the model, need to pass start and endpoints as labels &rarr; need to implement a function that extracts the start and end positions from the dataset

The dataset contains unanswerable questions. For these, the start and end indices for the answer are equal to -1.

In [6]:
tydiqa_data['train'][0]['annotations']

{'passage_answer_candidate_index': [-1],
 'minimal_answers_start_byte': [-1],
 'minimal_answers_end_byte': [-1],
 'yes_no_answer': ['NONE']}

In [7]:
# Flattening the datasets
flattened_train_data = tydiqa_data['train'].flatten()
flattened_test_data =  tydiqa_data['validation'].flatten()

In [8]:
# Selecting a subset of the train dataset
flattened_train_data = flattened_train_data.select(range(3000))

# Selecting a subset of the test dataset
flattened_test_data = flattened_test_data.select(range(1000))

### Tokenizers (from HuggingFace)

In [9]:
# Import the AutoTokenizer from the transformers library
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

# Define max length of sequences in the tokenizer
tokenizer.model_max_length = 512

In [10]:
def process_samples(sample):
    tokenized_data = tokenizer(sample['document_plaintext'], sample['question_text'], truncation="only_first", padding="max_length")

    input_ids = tokenized_data["input_ids"]

    # We will label impossible answers with the index of the CLS token.
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # If no answers are given, set the cls_index as answer.
    if sample["annotations.minimal_answers_start_byte"][0] == -1:
        start_position = cls_index
        end_position = cls_index
    else:
        # Start/end character index of the answer in the text.
        gold_text = sample["document_plaintext"][sample['annotations.minimal_answers_start_byte'][0]:sample['annotations.minimal_answers_end_byte'][0]]
        start_char = sample["annotations.minimal_answers_start_byte"][0]
        end_char = sample['annotations.minimal_answers_end_byte'][0] #start_char + len(gold_text)

        # sometimes answers are off by a character or two – fix this
        if sample['document_plaintext'][start_char-1:end_char-1] == gold_text:
            start_char = start_char - 1
            end_char = end_char - 1     # When the gold label is off by one character
        elif sample['document_plaintext'][start_char-2:end_char-2] == gold_text:
            start_char = start_char - 2
            end_char = end_char - 2     # When the gold label is off by two characters

        start_token = tokenized_data.char_to_token(start_char)
        end_token = tokenized_data.char_to_token(end_char - 1)

        # if start position is None, the answer passage has been truncated
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_position = start_token
        end_position = end_token

    return {'input_ids': tokenized_data['input_ids'],
          'attention_mask': tokenized_data['attention_mask'],
          'start_positions': start_position,
          'end_positions': end_position}

In [11]:
# Tokenizing and processing the flattened dataset
processed_train_data = flattened_train_data.map(process_samples)
processed_test_data = flattened_test_data.map(process_samples)

### Transformers

In [12]:
# Import the AutoModelForQuestionAnswering for the pre-trained model. You will only fine tune the head of the model
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

In [13]:
columns_to_return = ['input_ids','attention_mask', 'start_positions', 'end_positions']

processed_train_data.set_format(type='pt', columns=columns_to_return)
processed_test_data.set_format(type='pt', columns=columns_to_return)

In [14]:
def compute_f1_metrics(pred):
    start_labels = pred.label_ids[0]
    start_preds = pred.predictions[0].argmax(-1)
    end_labels = pred.label_ids[1]
    end_preds = pred.predictions[1].argmax(-1)

    f1_start = f1_score(start_labels, start_preds, average='macro')
    f1_end = f1_score(end_labels, end_preds, average='macro')

    return {
        'f1_start': f1_start,
        'f1_end': f1_end,
    }

Using the HuggingFace Trainer:

In [15]:
# Training hyperparameters
training_args = TrainingArguments(
    output_dir='Lab3_model_results', # output directory
    overwrite_output_dir=True,       # latest results each time code is run
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=20,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_steps=50
)

# Trainer object
trainer = Trainer(
    model=model,                        # the instantiated 🤗 Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    train_dataset=processed_train_data, # training dataset
    eval_dataset=processed_test_data,   # evaluation dataset
    compute_metrics=compute_f1_metrics
)

# Training loop
trainer.train()

  0%|          | 0/1125 [00:00<?, ?it/s]

{'loss': 2.2187, 'learning_rate': 4.8642533936651585e-05, 'epoch': 0.13}
{'loss': 1.8195, 'learning_rate': 4.638009049773756e-05, 'epoch': 0.27}
{'loss': 1.9633, 'learning_rate': 4.411764705882353e-05, 'epoch': 0.4}
{'loss': 1.8637, 'learning_rate': 4.1855203619909506e-05, 'epoch': 0.53}
{'loss': 1.7545, 'learning_rate': 3.959276018099547e-05, 'epoch': 0.67}
{'loss': 1.596, 'learning_rate': 3.733031674208145e-05, 'epoch': 0.8}
{'loss': 1.5088, 'learning_rate': 3.506787330316742e-05, 'epoch': 0.93}
{'loss': 1.4219, 'learning_rate': 3.2805429864253393e-05, 'epoch': 1.07}
{'loss': 1.1453, 'learning_rate': 3.0542986425339374e-05, 'epoch': 1.2}


Checkpoint destination directory Lab3_model_results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 1.2377, 'learning_rate': 2.828054298642534e-05, 'epoch': 1.33}
{'loss': 1.1648, 'learning_rate': 2.6018099547511314e-05, 'epoch': 1.47}
{'loss': 1.2767, 'learning_rate': 2.3755656108597284e-05, 'epoch': 1.6}
{'loss': 1.1996, 'learning_rate': 2.149321266968326e-05, 'epoch': 1.73}
{'loss': 0.9741, 'learning_rate': 1.923076923076923e-05, 'epoch': 1.87}
{'loss': 1.2366, 'learning_rate': 1.6968325791855205e-05, 'epoch': 2.0}
{'loss': 0.7234, 'learning_rate': 1.4705882352941177e-05, 'epoch': 2.13}
{'loss': 0.7281, 'learning_rate': 1.244343891402715e-05, 'epoch': 2.27}
{'loss': 0.7777, 'learning_rate': 1.0180995475113122e-05, 'epoch': 2.4}
{'loss': 0.7048, 'learning_rate': 7.918552036199094e-06, 'epoch': 2.53}


Checkpoint destination directory Lab3_model_results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.7309, 'learning_rate': 5.656108597285068e-06, 'epoch': 2.67}
{'loss': 0.5173, 'learning_rate': 3.3936651583710405e-06, 'epoch': 2.8}
{'loss': 0.6883, 'learning_rate': 1.1312217194570136e-06, 'epoch': 2.93}
{'train_runtime': 844.1746, 'train_samples_per_second': 10.661, 'train_steps_per_second': 1.333, 'train_loss': 1.2267307739257813, 'epoch': 3.0}


TrainOutput(global_step=1125, training_loss=1.2267307739257813, metrics={'train_runtime': 844.1746, 'train_samples_per_second': 10.661, 'train_steps_per_second': 1.333, 'train_loss': 1.2267307739257813, 'epoch': 3.0})

In [16]:
trainer.evaluate(processed_test_data)

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.2122082710266113,
 'eval_f1_start': 0.11509530309530308,
 'eval_f1_end': 0.12050932345681514,
 'eval_runtime': 28.8599,
 'eval_samples_per_second': 34.65,
 'eval_steps_per_second': 4.331,
 'epoch': 3.0}

### Using the fine-tuned model

In [17]:
text = r"""
The Golden Age of Comic Books describes an era of American comic books from the
late 1930s to circa 1950. During this time, modern comic books were first published
and rapidly increased in popularity. The superhero archetype was created and many
well-known characters were introduced, including Superman, Batman, Captain Marvel
(later known as SHAZAM!), Captain America, and Wonder Woman.
Between 1939 and 1941 Detective Comics and its sister company, All-American Publications,
introduced popular superheroes such as Batman and Robin, Wonder Woman, the Flash,
Green Lantern, Doctor Fate, the Atom, Hawkman, Green Arrow and Aquaman.[7] Timely Comics,
the 1940s predecessor of Marvel Comics, had million-selling titles featuring the Human Torch,
the Sub-Mariner, and Captain America.[8]
As comic books grew in popularity, publishers began launching titles that expanded
into a variety of genres. Dell Comics' non-superhero characters (particularly the
licensed Walt Disney animated-character comics) outsold the superhero comics of the day.[12]
The publisher featured licensed movie and literary characters such as Mickey Mouse, Donald Duck,
Roy Rogers and Tarzan.[13] It was during this era that noted Donald Duck writer-artist
Carl Barks rose to prominence.[14] Additionally, MLJ's introduction of Archie Andrews
in Pep Comics #22 (December 1941) gave rise to teen humor comics,[15] with the Archie
Andrews character remaining in print well into the 21st century.[16]
At the same time in Canada, American comic books were prohibited importation under
the War Exchange Conservation Act[17] which restricted the importation of non-essential
goods. As a result, a domestic publishing industry flourished during the duration
of the war which were collectively informally called the Canadian Whites.
The educational comic book Dagwood Splits the Atom used characters from the comic
strip Blondie.[18] According to historian Michael A. Amundson, appealing comic-book
characters helped ease young readers' fear of nuclear war and neutralize anxiety
about the questions posed by atomic power.[19] It was during this period that long-running
humor comics debuted, including EC's Mad and Carl Barks' Uncle Scrooge in Dell's Four
Color Comics (both in 1952).[20][21]
"""

questions = ["What superheroes were introduced between 1939 and 1941 by Detective Comics and its sister company?",
             "What comic book characters were created between 1939 and 1941?",
             "What well-known characters were created between 1939 and 1941?",
             "What well-known superheroes were introduced between 1939 and 1941 by Detective Comics?"]

In [18]:
import torch

for question in questions:
    inputs = tokenizer.encode_plus(question, text, return_tensors="pt")

    input_ids = inputs["input_ids"].tolist()[0]
    inputs.to(torch.device("mps"))

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_model = model(**inputs)
    
    start_logits = answer_model['start_logits'].cpu().detach().numpy()

    answer_start = np.argmax(start_logits)  
    
    end_logits = answer_model['end_logits'].cpu().detach().numpy()
    
    # Get the most likely beginning of answer with the argmax of the score
    answer_end = np.argmax(end_logits) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: What superheroes were introduced between 1939 and 1941 by Detective Comics and its sister company?
Answer: Superman, Batman, Captain Marvel ( later known as SHAZAM! ), Captain America, and Wonder Woman. Between 1939 and 1941 Detective Comics and its sister company, All - American Publications, introduced popular superheroes such as Batman and Robin, Wonder Woman, the Flash, Green Lantern, Doctor Fate, the Atom, Hawkman, Green Arrow and Aquaman

Question: What comic book characters were created between 1939 and 1941?
Answer: Superman, Batman, Captain Marvel ( later known as SHAZAM! ), Captain America, and Wonder Woman

Question: What well-known characters were created between 1939 and 1941?
Answer: Superman, Batman, Captain Marvel ( later known as SHAZAM! ), Captain America, and Wonder Woman

Question: What well-known superheroes were introduced between 1939 and 1941 by Detective Comics?
Answer: Superman, Batman, Captain Marvel ( later known as SHAZAM! ), Captain America, and 