In [3]:
import os

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
main_dir = "/content/gdrive/MyDrive/QADistilBert"
os.chdir(main_dir)
os.listdir()

['old',
 'dataset',
 'codes',
 'results',
 'train_squad_dataset.json',
 'valid_squad_dataset.json',
 'model',
 'wandb']

In [5]:
"""https://huggingface.co/deepset/roberta-base-squad2"""
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "distilbert-base-uncased-distilled-squad"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
res # score == F1 Score: it's computed on the individual words in the prediction vs the true words provided in context

{'score': 0.4187259376049042,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}

In [7]:
model

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

\\# How to fine-tune a QA Model?

In [9]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [10]:
from datasets import load_dataset
# More documentation about the dataset can be found here: https://huggingface.co/datasets/viewer/?dataset=squad
# This is essentially a wrapper for the segmented data.
# The SQuAD dataset is a popular dataset based on wikipedia articles where there is an answer in the context provided.
# (different from SQuAD2.0)
squad = load_dataset("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [12]:
# More specific information about the dataset can be found here: https://huggingface.co/datasets/squad#data-instances
squad["train"][0]
# id -> hash of the context
# title -> Document where the context resides
# Context -> Information where the answer resides
# Question -> What question are you trying to find the answer to?
# Answers -> What is the answer to the question? And the location on where in the text the answer begins (span)

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [13]:
# Preprocess the data to a BERT format
def preprocess_function(examples):
    """Courtesy of https://huggingface.co/docs/transformers/tasks/question_answering"""
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [14]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [17]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [21]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_name) # remember that model_name is "distilbert-base-uncased-distilled-squad"

In [22]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [26]:
# Let's start training!
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,5.9532,5.950643


TrainOutput(global_step=5475, training_loss=5.962477391909246, metrics={'train_runtime': 3138.6364, 'train_samples_per_second': 27.91, 'train_steps_per_second': 1.744, 'total_flos': 8583810682277376.0, 'train_loss': 5.962477391909246, 'epoch': 1.0})

cc4856cc44f1dae7844c269adf899daa328e7a14

*   List item
*   List item



In [27]:
trainer.save_model("model/save/customTrained_Distilbert_Squad")
trainer.evaluate()

{'eval_loss': 5.950643062591553,
 'eval_runtime': 118.9393,
 'eval_samples_per_second': 88.869,
 'eval_steps_per_second': 5.557,
 'epoch': 1.0}

In [None]:
# model = BertModel.from_pretrained("model/save/")

In [28]:
model.save_pretrained("model/distilbert-finetuned-squad")
tokenizer.save_pretrained("model/distilbert-finetuned-squad")


('model/distilbert-finetuned-squad/tokenizer_config.json',
 'model/distilbert-finetuned-squad/special_tokens_map.json',
 'model/distilbert-finetuned-squad/vocab.txt',
 'model/distilbert-finetuned-squad/added_tokens.json',
 'model/distilbert-finetuned-squad/tokenizer.json')

In [1]:
import os
import torch
from transformers import pipeline , AutoTokenizer ,AutoModelForQuestionAnswering
from transformers import DistilBertConfig
from transformers import DistilBertForMaskedLM

In [6]:
# Path to your custom-trained model
model_path = "model/save/customTrained_Distilbert_Squad"

# Load the tokenizer and model
bert_model = AutoModelForQuestionAnswering.from_pretrained(model_path)
bert_tokenizer = AutoTokenizer.from_pretrained(model_path)

'''
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
'''

'\ntokenizer = DistilBertTokenizer.from_pretrained(model_path)\nmodel = DistilBertForSequenceClassification.from_pretrained(model_path)\n'

In [7]:
qa_pipeline = pipeline('question-answering', model=model_path,tokenizer=model_path)

In [22]:
context = """
Hugging Face is an open-source provider of natural language processing tools.
It has developed libraries like 'transformers' which allow easy use of state-of-the-art NLP models.
DistilBERT is a smaller, faster version of BERT, created by Hugging Face by applying knowledge distillation.
"""
question = "Who created DistilBERT?"

In [9]:
result = qa_pipeline({
    'question': question,
    'context': context
})

print(result)

{'score': 0.0002601456653792411, 'start': 1, 'end': 8, 'answer': 'Hugging'}


In [13]:
import string
import re
from collections import Counter

# Function to normalize answers (remove articles, punctuation, etc.)
def normalize_answer(s):
    """Lowercase, remove punctuation, articles, and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punctuation(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punctuation(lower(s))))

# Exact Match calculation
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

# F1 Score calculation
def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(common_tokens.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(truth_tokens)

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [14]:
from datasets import load_dataset

# Load the SQuAD dataset
dataset = load_dataset("squad", split="validation[100:400]")  # Load a small subset for testing

# Initialize scores
total_em, total_f1 = 0, 0

# Iterate over the dataset and compute metrics for each example
for example in dataset:
    question = example['question']
    context = example['context']
    ground_truth = example['answers']['text'][0]  # Ground truth answer (first one if multiple)

    # Get model's prediction
    predicted_result = qa_pipeline({
        'question': question,
        'context': context
    })
    predicted_answer = predicted_result['answer']

    # Compute EM and F1 for this example
    em = exact_match_score(predicted_answer, ground_truth)
    f1 = f1_score(predicted_answer, ground_truth)

    total_em += em
    total_f1 += f1

# Average over the dataset
num_examples = len(dataset)
average_em = total_em / num_examples
average_f1 = total_f1 / num_examples

print(f"Average Exact Match: {average_em}")
print(f"Average F1 Score: {average_f1}")

Average Exact Match: 0.0
Average F1 Score: 0.006666666666666667


In [15]:
def EM_ScoreF1(context,question,goldAnswer=""):
  # Perform question-answering
  predicted_result = qa_pipeline({
      'question': question,
      'context': context
  })

  # Ground truth (the correct answer)
  ground_truth = goldAnswer

  # Get the predicted answer
  predicted_answer = predicted_result['answer']

  # Compute Exact Match and F1 Score
  if goldAnswer=="":
    em_score = exact_match_score(predicted_answer, predicted_answer)
  else:
    em_score = exact_match_score(predicted_answer, ground_truth)
  f1 = f1_score(predicted_answer, predicted_answer)
  return(f"Answer: {predicted_result['answer']}"),(f"Exact Match: {em_score}"),(f"F1 Score: {f1}")

In [23]:

# Example context and question
context = """
Hugging Face is an open-source provider of natural language processing tools.
It has developed libraries like 'transformers' which allow easy use of state-of-the-art NLP models.
DistilBERT is a smaller, faster version of BERT, created by Hugging Face by applying knowledge distillation.
"""
question = "Who created DistilBERT?"

# Perform question-answering
predicted_result = qa_pipeline({
    'question': question,
    'context': context
})

# Ground truth (the correct answer)
ground_truth = "Hugging"

# Get the predicted answer
predicted_answer = predicted_result['answer']

# Compute Exact Match and F1 Score
em_score = exact_match_score(predicted_answer, ground_truth)
f1 = f1_score(predicted_answer, ground_truth)

print(f"Exact Match: {em_score}")
print(f"F1 Score: {f1}")

Exact Match: True
F1 Score: 1.0
