In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard


# Project 2 - Part 2
For Part 2, attempt to fine-tune the question-answering with your questions and answers intended to offer greater resolution than the answers in Part 1.
    To do this, you will need to generate your own custom SQuAD-compatible QA dataset using the sections of your choice, the questions, and the answers which you will add to the squad dataset downloadable from Hugging Face, [which is described in this article](https://huggingface.co/transformers/v3.2.0/custom_datasets.html).  Note: custom training can take a very long time on Google Colab, so be prepared for that.

https://huggingface.co/docs/transformers/tasks/question_answering

In [3]:
# import os
# import spacy
# import json
# import datetime

from transformers import DistilBertTokenizerFast, TFDistilBertForQuestionAnswering
from datasets import load_dataset
from custom_squad import *
import tensorflow as tf
from transformers import DefaultDataCollator
from transformers import create_optimizer
from keras.callbacks import TensorBoard


In [4]:
# check tensorflow device
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


---
# Import dataset

In [18]:
dataset = load_dataset("squad_v2")
dataset_train = dataset['train']
dataset_val = dataset['validation']

Found cached dataset squad_v2 (/home/hp/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [20]:
dataset['validation'][11872]

{'id': '5ad28ad0d7d075001a4299cf',
 'title': 'Force',
 'context': 'The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.',
 'question': 'What force is part of the modern SI system?',
 'answers': {'text': [], 'answer_start': []}}

---
# Add Custom Questions

In [21]:
dataset['train'] = create_custom_squad(dataset_train)


---
# Setup Dataset

In [22]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [23]:
max_length = 500
doc_stride = (128)

def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a
    # stride. This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous
    # feature.
    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = [c.lstrip() for c in examples["context"]]
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a
    # map from a feature to its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original
    # context. This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what
        # is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this
        # span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the
            # CLS index).
            if not (
                    offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the
                # answer.
                # Note: we could go after the last offset if the answer is the last word (edge
                # case).
                while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [24]:
tokenized_dataset = dataset.map(prepare_train_features, batched=True, remove_columns=dataset['train'].column_names)


  0%|          | 0/131 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/hp/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-813c110a33cfee33.arrow


In [25]:
data_collator = DefaultDataCollator(return_tensors="tf")

---
# Setup Model For Finetuning

In [26]:
model_checkpoint = "distilbert-base-uncased"
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")


2022-10-28 17:15:33.780002: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-28 17:15:33.780919: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-28 17:15:33.781016: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-28 17:15:33.781061: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA

In [29]:
model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-squad-holmes"
learning_rate = 2e-5
num_train_epochs = 3
weight_decay = 0.01

In [28]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [30]:
total_train_steps = len(tf_train_set) * num_train_epochs

optimizer, schedule = create_optimizer(
    init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps
)

In [31]:
model.compile(optimizer=optimizer, jit_compile=True, metrics=["accuracy"])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [32]:
model

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForQuestionAnswering at 0x7fd8e7351d30>

In [33]:
# from transformers.keras_callbacks import PushToHubCallback
# push_to_hub_callback = PushToHubCallback(
#     output_dir="./qa_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )

tensorboard_callback = TensorBoard(log_dir="./logs")

callbacks = [tensorboard_callback]

# callbacks = [tensorboard_callback, push_to_hub_callback]

model.fit(
    tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

Epoch 1/3


2022-10-28 17:15:57.190197: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x55f344a92120 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-10-28 17:15:57.190226: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2022-10-28 17:15:57.278059: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-10-28 17:15:57.284026: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator tf_distil_bert_for_question_answering/distilbert/embeddings/assert_less/Assert/AssertGuard/Assert
2022-10-28 17:16:03.079385: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory

2022-10-28 17:16:05.702525: I tensorflow/compiler/jit/xla_compilation_cache.cc:476] Compiled cluster using XLA!  This l



2022-10-28 17:43:27.064221: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator tf_distil_bert_for_question_answering/distilbert/embeddings/assert_less/Assert/AssertGuard/Assert
2022-10-28 17:44:06.748713: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator tf_distil_bert_for_question_answering/distilbert/embeddings/assert_less/Assert/AssertGuard/Assert


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd8b553a340>

In [34]:
# save the model weights
os.makedirs('model-weights', exist_ok=True)
model.save_pretrained('model-weights')


---
#  Testing

In [71]:
# load the finetuned weights
model = TFDistilBertForQuestionAnswering.from_pretrained('model-weights')

Some layers from the model checkpoint at model-weights were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_59']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at model-weights and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
text = r"""
Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

In [73]:
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

In [74]:
model.weights

[<tf.Variable 'tf_distil_bert_for_question_answering_3/distilbert/embeddings/word_embeddings/weight:0' shape=(30522, 768) dtype=float32, numpy=
 array([[-0.0109161 , -0.0654835 , -0.02312338, ..., -0.02070241,
         -0.05658108, -0.03640074],
        [-0.01319846, -0.06733431, -0.01605646, ..., -0.0226614 ,
         -0.05537301, -0.02600443],
        [-0.01759106, -0.07094341, -0.01443494, ..., -0.02457913,
         -0.05956192, -0.0231829 ],
        ...,
        [-0.0231029 , -0.05878259, -0.01048967, ..., -0.01945743,
         -0.02615411, -0.02118432],
        [-0.0490171 , -0.05614787, -0.00465348, ..., -0.01065376,
         -0.01797333, -0.02187675],
        [-0.00646111, -0.0914881 , -0.00254872, ..., -0.01505679,
         -0.05040044,  0.04597744]], dtype=float32)>,
 <tf.Variable 'tf_distil_bert_for_question_answering_3/distilbert/embeddings/position_embeddings/embeddings:0' shape=(512, 768) dtype=float32, numpy=
 array([[ 0.01705299, -0.02776254, -0.0329924 , ...,  0.0010424

In [75]:
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    print(input_ids)
    outputs = model(inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    print(answer_start_scores)
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    # Get the most likely end of answer with the argmax of the score
    answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    print(f"Question: {question}")
    print(f"Answer: {answer}")


[  101  2129  2116  3653 23654  2098  4275  2024  2800  1999 19081  1029
   102 19081  1006  3839  2124  2004  1052 22123  2953  2818  1011 19081
  1998  1052 22123  2953  2818  1011  3653 23654  2098  1011 14324  1007
  3640  2236  1011  3800  4294  2015  1006 14324  1010 14246  2102  1011
  1016  1010 23455  1010 28712  2213  1010  4487 16643 23373  1010 28712
  7159  1529  1007  2005  3019  2653  4824  1006 17953  2226  1007  1998
  3019  2653  4245  1006 17953  2290  1007  2007  2058  3590  1009  3653
 23654  2098  4275  1999  2531  1009  4155  1998  2784  6970 25918  8010
  2090 23435 12314  1016  1012  1014  1998  1052 22123  2953  2818  1012
   102]
tf.Tensor(
[[ 2.419729  -5.7813487 -6.176385  -6.2736783 -7.9876127 -8.414704
  -7.672824  -8.134203  -8.013931  -7.812468  -6.877867  -8.435065
  -8.2939625 -3.0207381 -6.255234  -6.6554046 -7.4686213 -7.506925
  -5.080403  -7.7499695 -8.065927  -7.6190834 -7.1391015 -6.3464413
  -7.8499775 -5.5065365 -7.8778925 -7.9771404 -7.660795

---
# Free up some memory

In [None]:
# del tokenizer, nlp, model