In [38]:
import numpy as np

from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

from sklearn.metrics import f1_score


In [39]:
# Inspect a dataset without downloading it
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("google-research-datasets/tydiqa", "primary_task")
ds_builder.info

DatasetInfo(description='', citation='', homepage='', license='', features={'passage_answer_candidates': {'plaintext_start_byte': List(Value('int32')), 'plaintext_end_byte': List(Value('int32'))}, 'question_text': Value('string'), 'document_title': Value('string'), 'language': Value('string'), 'annotations': {'passage_answer_candidate_index': List(Value('int32')), 'minimal_answers_start_byte': List(Value('int32')), 'minimal_answers_end_byte': List(Value('int32')), 'yes_no_answer': List(Value('string'))}, 'document_plaintext': Value('string'), 'document_url': Value('string')}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='tydiqa', config_name='primary_task', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=5552349645, num_examples=166916, shard_lengths=[15910, 15910, 15910, 15910, 15910, 15910, 14910, 15819, 14909, 14909, 10909], dataset_name='tydiqa'), 'validation': SplitInfo(name='validation', num_bytes=484565021, num_examples=18670,

In [40]:
# load dataset
from datasets import load_dataset
tydiqa_data = load_dataset("google-research-datasets/tydiqa", 'primary_task')
tydiqa_data

DatasetDict({
    train: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 166916
    })
    validation: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 18670
    })
})

In [41]:
idx = 30

# start index
start_index = tydiqa_data['train'][idx]['annotations']['minimal_answers_start_byte'][0]

# end index
end_index = tydiqa_data['train'][idx]['annotations']['minimal_answers_end_byte'][0]

print(f"Question: {tydiqa_data['train'][idx]['question_text']}")
print(f"\nContext (truncated): {tydiqa_data['train'][idx]['document_plaintext'][0:512]} ...")
print(f"\nAnswer: {tydiqa_data['train'][idx]['document_plaintext'][start_index:end_index]}")

Question: Mit√§ on altruismi?

Context (truncated): 


Altruismi ([1],  ‚Äùtoinen‚Äù[2]) tarkoittaa ep√§itsek√§st√§ ja pyyteet√∂nt√§[3] [4] toimintaa, jossa toisen hyv√§ asetetaan oman edun edelle.[5] Altruismin vastakohta on egoismi.[6] Termin esitti ranskalainen filosofi Auguste Comte vuonna 1851, jolloin h√§n m√§√§ritteli altruismin uhrautumiseksi muiden eduksi.[1]
Etiikka
Etiikassa altruismi on oppi, jonka mukaan teon moraalisuus m√§√§ritell√§√§n sen mukaan, tuottaako se hyv√§√§ muille. Altruismi on egoismin vastakohta. Altruismi ei sin√§ns√§ m√§√§rittele sit√§, millainen teko  ...

Answer: tsek√§st√§ ja pyyteet√∂nt√§[3] [4] toimintaa, jossa toisen hyv√§ asetetaan oman edun edelle.[5] Altru


In [42]:
#¬†Flattening the datasets
flattened_train_data = tydiqa_data['train'].flatten()
flattened_test_data =  tydiqa_data['validation'].flatten()
flattened_train_data

Dataset({
    features: ['passage_answer_candidates.plaintext_start_byte', 'passage_answer_candidates.plaintext_end_byte', 'question_text', 'document_title', 'language', 'annotations.passage_answer_candidate_index', 'annotations.minimal_answers_start_byte', 'annotations.minimal_answers_end_byte', 'annotations.yes_no_answer', 'document_plaintext', 'document_url'],
    num_rows: 166916
})

In [43]:
# Selecting a subset of the train dataset and test dataset
flattened_train_data = flattened_train_data.select(range(3000))
flattened_test_data = flattened_test_data.select(range(3000))

In [44]:
# Import the AutoTokenizer from the transformers library
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

# Define max length of sequences in the tokenizer
tokenizer.model_max_length = 512

In [45]:
# Given the characteristics of the dataset and the question-answering task, you will need to add some steps to pre-process the data after the tokenization:

# When there is no answer to a question given a context, you will use the CLS token, a unique token used to represent the start of the sequence.
# Tokenizers can split a given string into substrings, resulting in a subtoken for each substring, creating misalignment between the list of dataset tags and the labels generated by the tokenizer. Therefore, you will need to align the start and end indices with the tokens associated with the target answer word.
# Finally, a tokenizer can truncate a very long sequence. So, if the start/end position of an answer is None, you will assume that it was truncated and assign the maximum length of the tokenizer to those positions.
def process_samples(sample):
    # two-sentence (or two-segment) encoding used by models like BERT
    try:
        tokenized_data = tokenizer(sample['question_text'], sample['document_plaintext'], truncation=True, padding="max_length")
    except Exception as e:
        # Optionally, log the error and sample for debugging
        print(f"Skipping sample due to error: {e}")
        return None

    # [CLS] document tokens ... [SEP] question tokens ... [SEP]
    input_ids = tokenized_data["input_ids"]

    # We will label impossible answers with the index of the CLS token.
    # Should be 0
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # If no answers are given, set the cls_index as answer.
    if sample["annotations.minimal_answers_start_byte"][0] == -1:
        start_position = cls_index
        end_position = cls_index
    else:
        # Start/end character index of the answer in the text.
        gold_text = sample["document_plaintext"][sample['annotations.minimal_answers_start_byte'][0]:sample['annotations.minimal_answers_end_byte'][0]]
        start_char = sample["annotations.minimal_answers_start_byte"][0]
        end_char = sample['annotations.minimal_answers_end_byte'][0] #start_char + len(gold_text)

        # sometimes answers are off by a character or two ‚Äì fix this
        if sample['document_plaintext'][start_char-1:end_char-1] == gold_text:
            start_char = start_char - 1
            end_char = end_char - 1     # When the gold label is off by one character
        elif sample['document_plaintext'][start_char-2:end_char-2] == gold_text:
            start_char = start_char - 2
            end_char = end_char - 2     # When the gold label is off by two characters

        # char_to_token(char_index) map a character position in the original text to the corresponding token index in the encoded sequence
        start_token = tokenized_data.char_to_token(start_char)
        end_token = tokenized_data.char_to_token(end_char - 1)

        # if start position is None, the answer passage has been truncated
        # Get None if the character is inside a part that gets removed or not tokenized (e.g., whitespace or special characters depending on tokenizer)
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_position = start_token
        end_position = end_token

    return {'input_ids': tokenized_data['input_ids'],
          'attention_mask': tokenized_data['attention_mask'],
          'start_positions': start_position,
          'end_positions': end_position}

In [46]:
# Tokenizing and processing the flattened dataset
# Apply a function to all the examples in the table (individually or in batches) and update the table by adding new columns.
# If your function returns a column that already exists, then it overwrites it.
processed_train_data = flattened_train_data.map(process_samples)
processed_test_data = flattened_test_data.map(process_samples)
processed_train_data

Dataset({
    features: ['passage_answer_candidates.plaintext_start_byte', 'passage_answer_candidates.plaintext_end_byte', 'question_text', 'document_title', 'language', 'annotations.passage_answer_candidate_index', 'annotations.minimal_answers_start_byte', 'annotations.minimal_answers_end_byte', 'annotations.yes_no_answer', 'document_plaintext', 'document_url', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 3000
})

In [47]:
# Import the AutoModelForQuestionAnswering for the pre-trained model. You will only fine tune the head of the model
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
print(model)

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
     

In [48]:
columns_to_return = ['input_ids','attention_mask', 'start_positions', 'end_positions']

# Set the format of the datasets to PyTorch tensors
processed_train_data.set_format(type='torch', columns=columns_to_return)
processed_test_data.set_format(type='torch', columns=columns_to_return)

In [49]:
def compute_f1_metrics(pred):
    # for extractive QA, the pred.predictions is a tuple of (start_logits, end_logits)
    # start_logits shape: (sample size, sequence_length)
    # end_logits shape: (sample size, sequence_length) 
    # the pred.label_ids is also a tuple of (start_positions, end_positions)
    # start_positions shape: (sample size, )
    # end_positions shape: (sample size, )
    start_logits, end_logits = pred.predictions
    start_labels, end_labels = pred.label_ids

    # ÊâìÂç∞Á±ªÂûãÂíåÂΩ¢Áä∂ÔºåÂ∏ÆÂä©Ë∞ÉËØï
    print(f"È¢ÑÊµãÂÄºÁ±ªÂûã: {type(start_logits)}, ÂΩ¢Áä∂: {start_logits.shape}") # (3000, 512)
    print(f"ÁúüÂÆûÊ†áÁ≠æÁ±ªÂûã: {type(start_labels)}ÔºåÂΩ¢Áä∂: {start_labels.shape}") # (3000, 512)

    start_preds = start_logits.argmax(-1)
    end_preds = end_logits.argmax(-1)

    f1_start = f1_score(start_labels, start_preds, average='macro')
    f1_end = f1_score(end_labels, end_preds, average='macro')

    return {
        'f1_start': f1_start,
        'f1_end': f1_end,
    }

In [50]:
# Training hyperparameters
training_args = TrainingArguments(
    output_dir='model_results',     # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_ratio=0.1,
    weight_decay=0.01,               # strength of weight decay
    logging_steps=50,
    learning_rate=2e-5
)

# Trainer object
trainer = Trainer(
    model=model,                        # the instantiated ü§ó Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    train_dataset=processed_train_data, # training dataset
    eval_dataset=processed_test_data,   # evaluation dataset
    compute_metrics=compute_f1_metrics
)

# Training loop
trainer.train(resume_from_checkpoint=False)



Step,Training Loss
50,0.4932
100,0.1591
150,0.2511
200,0.143
250,0.1399
300,0.1291
350,0.1339
400,0.1365
450,0.0867
500,0.1215


TrainOutput(global_step=564, training_loss=0.1720068002423496, metrics={'train_runtime': 1411.5479, 'train_samples_per_second': 6.376, 'train_steps_per_second': 0.4, 'total_flos': 1175877900288000.0, 'train_loss': 0.1720068002423496, 'epoch': 3.0})

In [51]:
trainer.evaluate(processed_test_data)



È¢ÑÊµãÂÄºÁ±ªÂûã: <class 'numpy.ndarray'>, ÂΩ¢Áä∂: (3000, 512)
ÁúüÂÆûÊ†áÁ≠æÁ±ªÂûã: <class 'numpy.ndarray'>ÔºåÂΩ¢Áä∂: (3000,)


{'eval_loss': nan,
 'eval_f1_start': 0.04628814728479341,
 'eval_f1_end': 0.0393449251920744,
 'eval_runtime': 131.7309,
 'eval_samples_per_second': 22.774,
 'eval_steps_per_second': 2.847,
 'epoch': 3.0}

In [52]:
text = r"""
The Golden Age of Comic Books describes an era of American comic books from the
late 1930s to circa 1950. During this time, modern comic books were first published
and rapidly increased in popularity. The superhero archetype was created and many
well-known characters were introduced, including Superman, Batman, Captain Marvel
(later known as SHAZAM!), Captain America, and Wonder Woman.
Between 1939 and 1941 Detective Comics and its sister company, All-American Publications,
introduced popular superheroes such as Batman and Robin, Wonder Woman, the Flash,
Green Lantern, Doctor Fate, the Atom, Hawkman, Green Arrow and Aquaman.[7] Timely Comics,
the 1940s predecessor of Marvel Comics, had million-selling titles featuring the Human Torch,
the Sub-Mariner, and Captain America.[8]
As comic books grew in popularity, publishers began launching titles that expanded
into a variety of genres. Dell Comics' non-superhero characters (particularly the
licensed Walt Disney animated-character comics) outsold the superhero comics of the day.[12]
The publisher featured licensed movie and literary characters such as Mickey Mouse, Donald Duck,
Roy Rogers and Tarzan.[13] It was during this era that noted Donald Duck writer-artist
Carl Barks rose to prominence.[14] Additionally, MLJ's introduction of Archie Andrews
in Pep Comics #22 (December 1941) gave rise to teen humor comics,[15] with the Archie
Andrews character remaining in print well into the 21st century.[16]
At the same time in Canada, American comic books were prohibited importation under
the War Exchange Conservation Act[17] which restricted the importation of non-essential
goods. As a result, a domestic publishing industry flourished during the duration
of the war which were collectively informally called the Canadian Whites.
The educational comic book Dagwood Splits the Atom used characters from the comic
strip Blondie.[18] According to historian Michael A. Amundson, appealing comic-book
characters helped ease young readers' fear of nuclear war and neutralize anxiety
about the questions posed by atomic power.[19] It was during this period that long-running
humor comics debuted, including EC's Mad and Carl Barks' Uncle Scrooge in Dell's Four
Color Comics (both in 1952).[20][21]
"""

questions = ["What superheroes were introduced between 1939 and 1941 by Detective Comics and its sister company?",
             "What comic book characters were created between 1939 and 1941?",
             "What well-known characters were created between 1939 and 1941?",
             "What well-known superheroes were introduced between 1939 and 1941 by Detective Comics?"]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, return_tensors="pt") # same as: inputs = tokenizer(question, text, return_tensor="pt")
    # inputs: {"input_ids": [[101, 1327, ...]], "attention_mask": [[1, 1, ...]]}
    input_ids = inputs["input_ids"].tolist()[0]
    inputs.to("mps") # Use "cuda" for GPU or "mps" for Apple Silicon

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids) # text_tokens: ['[CLS]', 'What', 'superhero', '##es', 'were', ..., '[SEP]', 'The', 'Golden', 'Age', ..., '[SEP]']
    answer_model = model(**inputs)
    
    start_logits = answer_model['start_logits'].cpu().detach().numpy()

    answer_start = np.argmax(start_logits)  
    
    end_logits = answer_model['end_logits'].cpu().detach().numpy()
    
    # Get the most likely beginning of answer with the argmax of the score
    answer_end = np.argmax(end_logits) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


Question: What superheroes were introduced between 1939 and 1941 by Detective Comics and its sister company?
Answer: [CLS]

Question: What comic book characters were created between 1939 and 1941?
Answer: [CLS]

Question: What well-known characters were created between 1939 and 1941?
Answer: [CLS]

Question: What well-known superheroes were introduced between 1939 and 1941 by Detective Comics?
Answer: [CLS]

