<a href="https://colab.research.google.com/github/harish-kumar-kp/LLM-Doubt/blob/main/disBert_CustomSquad_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

In [6]:
"""https://huggingface.co/distilbert-base-uncased-distilled-squad"""
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "distilbert-base-uncased-distilled-squad"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [23]:
res # score == F1 Score: it's computed on the individual words in the prediction vs the true words provided in context

{'score': 0.4187259376049042,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}

In [24]:
model

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

\\# How to fine-tune a QA Model?

In [7]:
pip install datasets



In [3]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [4]:
main_dir = "/content/gdrive/MyDrive/QADistilBert"
os.chdir(main_dir)
os.listdir()

['old',
 'dataset',
 'codes',
 'train_squad_dataset.json',
 'valid_squad_dataset.json',
 'model',
 'wandb',
 'test-trainer',
 '.gradio']

In [10]:
from datasets import load_dataset
data_files  = {"train": "dataset/squad_01/train_customSquad-v1.1.json", "validation": "dataset/squad_01/validation_customSquad-v1.1.json"}

In [26]:
csquad = load_dataset("json", data_files=data_files, field="data")
csquad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 75
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 50
    })
})

In [27]:
csquad['validation'][0]

{'id': 'i76',
 'title': 'Futuristic',
 'context': 'AGI refers to AI systems that can perform any intellectual task that humans can do, surpassing current AI limited to specific tasks. Unlike traditional AI, AGI could comprehend, learn, and apply knowledge across various domains without needing task-specific programming. The implications are vast, from revolutionizing industries and economies to transforming fields like medicine, education, and research. AGI would enable machines to innovate, solve complex problems, and adapt autonomously. Ethical considerations, such as control and safety, are central, as AGI could lead to unpredictable impacts. Safeguards and regulations are necessary to prevent unintended consequences, such as job displacement or AI-driven decision-making biases. If managed responsibly, AGI could improve global welfare by solving critical issues, such as climate change, resource scarcity, and disease eradication, faster than humans can today.',
 'question': 'What is 

In [None]:
# Preprocess the data to a BERT format
def preprocess_function(examples):
    """Courtesy of https://huggingface.co/docs/transformers/tasks/question_answering"""
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
tokenized_squad = csquad.map(preprocess_function, batched=True, remove_columns=csquad["train"].column_names)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
#used for Large Squad Datasets
#small_train_dataset = tokenized_squad["train"].shuffle(seed=42).select(range(100))
#small_eval_dataset = tokenized_squad["validation"].shuffle(seed=42).select(range(100))

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_name) # remember that model_name is "distilbert-base-uncased-distilled-squad"

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import numpy as np
import evaluate

In [None]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# customised training parameters
'''
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01

)
'''

In [None]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,1.359052
2,No log,1.340134
3,No log,1.343698


TrainOutput(global_step=30, training_loss=1.4948752085367838, metrics={'train_runtime': 38.8708, 'train_samples_per_second': 5.788, 'train_steps_per_second': 0.772, 'total_flos': 22047710630400.0, 'train_loss': 1.4948752085367838, 'epoch': 3.0})

cc4856cc44f1dae7844c269adf899daa328e7a14

*   List item
*   List item



---



*   List item
*   List item



In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
main_dir = "/content/gdrive/MyDrive/QADistilBert"
os.chdir(main_dir)
os.listdir()

['old',
 'dataset',
 'codes',
 'train_squad_dataset.json',
 'valid_squad_dataset.json',
 'model',
 'wandb',
 'test-trainer']

In [5]:
trainer.save_model("model/save/customTrained_Distilbert_Squad")
trainer.evaluate()

NameError: name 'trainer' is not defined

In [None]:
#torch.save(model.state_dict(), "simple_distilbert_qa.model")

In [None]:
model.save_pretrained("model/distilbert-finetuned-squad")
tokenizer.save_pretrained("model/distilbert-finetuned-squad")


('model/distilbert-finetuned-squad/tokenizer_config.json',
 'model/distilbert-finetuned-squad/special_tokens_map.json',
 'model/distilbert-finetuned-squad/vocab.txt',
 'model/distilbert-finetuned-squad/added_tokens.json',
 'model/distilbert-finetuned-squad/tokenizer.json')

In [28]:
import os
import torch
from transformers import pipeline , AutoTokenizer ,AutoModelForQuestionAnswering
from transformers import DistilBertConfig
from transformers import DistilBertForMaskedLM

In [None]:
#model.load_state_dict(torch.load("simple_distilbert_qa.model"))

In [29]:
# Path to your custom-trained model
model_path = "model/save/customTrained_Distilbert_Squad"

# Load the tokenizer and model
bert_model = AutoModelForQuestionAnswering.from_pretrained(model_path)
bert_tokenizer = AutoTokenizer.from_pretrained(model_path)

'''
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
'''

'\ntokenizer = DistilBertTokenizer.from_pretrained(model_path)\nmodel = DistilBertForSequenceClassification.from_pretrained(model_path)\n'

In [30]:
qa_pipeline = pipeline('question-answering', model=model_path,tokenizer=model_path)

In [31]:
context = """
In this sequel set eleven years after "The Terminator," young John Connor (Edward Furlong), the key to civilization's victory over a future robot uprising,
is the target of the shape-shifting T-1000 (Robert Patrick), a Terminator sent from the future to kill him. Another Terminator, the revamped T-800
(Arnold Schwarzenegger), has been sent back to protect the boy. As John and his mother (Linda Hamilton) go on the run with the T800,
the boy forms an unexpected bond with the robot.
"""
question = "Who is young John Connor?"

In [32]:
result = qa_pipeline({
    'question': question,
    'context': context
})

print(result)

{'score': 0.39979875087738037, 'start': 76, 'end': 91, 'answer': 'Edward Furlong)'}


In [65]:
import string
import re
from collections import Counter

# Function to normalize answers (remove articles, punctuation, etc.)
def normalize_answer(s):
    """Lowercase, remove punctuation, articles, and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punctuation(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punctuation(lower(s))))

# Exact Match calculation
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

# F1 Score calculation
def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(common_tokens.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(truth_tokens)

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [77]:
#from datasets import load_dataset

# Load the SQuAD dataset
dataset = load_dataset("squad", split="validation[:100]")  # Load a small subset for testing
#dataset = load_dataset("csquad")  # Load a small subset for testing
# Initialize scores
total_em, total_f1 = 0, 0

# Iterate over the dataset and compute metrics for each example
for example in dataset:
    question = example['question']
    context = example['context']
    ground_truth = example['answers']['text'][0]  # Ground truth answer (first one if multiple)

    # Get model's prediction
    predicted_result = qa_pipeline({
        'question': question,
        'context': context
    })
    predicted_answer = predicted_result['answer']

    # Compute EM and F1 for this example
    em = exact_match_score(predicted_answer, ground_truth)
    f1 = f1_score(predicted_answer, ground_truth)

    total_em += em
    total_f1 += f1

# Average over the dataset
num_examples = len(dataset)
average_em = total_em / num_examples
average_f1 = total_f1 / num_examples

print(f"Average Exact Match: {average_em}")
print(f"Average F1 Score: {average_f1}")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Average Exact Match: 0.57
Average F1 Score: 0.6818571428571427


In [69]:
def EM_ScoreF1(context,question,goldAnswer=""):
  # Perform question-answering
  predicted_result = qa_pipeline({
      'question': question,
      'context': context
  })
  # Ground truth (the correct answer)
  if goldAnswer=="":
    ground_truth = "Answer Unavailable"
  else:
    ground_truth = goldAnswer

  # Get the predicted answer
  predicted_answer = predicted_result['answer']
  # Compute Exact Match and F1 Score
  em_score = exact_match_score(predicted_answer, ground_truth)
  f1 = f1_score(predicted_answer, ground_truth)
  return(f"Machine Answer: {predicted_result['answer']}"+" Vs 'Human Answer':"+ground_truth), (f"Exact Match: {em_score}"), (f"F1 Score: {f1}")

In [47]:
index = 4

In [75]:
csquad['validation'][index]

{'id': 'i80',
 'title': 'Futuristic',
 'context': 'AGI refers to AI systems that can perform any intellectual task that humans can do, surpassing current AI limited to specific tasks. Unlike traditional AI, AGI could comprehend, learn, and apply knowledge across various domains without needing task-specific programming. The implications are vast, from revolutionizing industries and economies to transforming fields like medicine, education, and research. AGI would enable machines to innovate, solve complex problems, and adapt autonomously. Ethical considerations, such as control and safety, are central, as AGI could lead to unpredictable impacts. Safeguards and regulations are necessary to prevent unintended consequences, such as job displacement or AI-driven decision-making biases. If managed responsibly, AGI could improve global welfare by solving critical issues, such as climate change, resource scarcity, and disease eradication, faster than humans can today.',
 'question': 'What are

In [70]:
# Example context and question

context_var =csquad['validation'][index]['context']
question_var =csquad['validation'][index]['question']
goldAnswer_var = csquad['validation'][index]['answers']['text'][0]


In [71]:
print(question_var)

What are the unintended consequences?


In [72]:
print(EM_ScoreF1(context_var,question_var,goldAnswer_var))

("Machine Answer: job displacement or AI-driven decision-making biases. Vs 'Human Answer':job displacement or AI-driven decision-making biases", 'Exact Match: True', 'F1 Score: 1.0')


#Gradio App To Check EM and F1 score with compared Machine and Human Answers.

In [78]:
import gradio as gr
def EM_ScoreF1(context,question,goldAnswer=""):
  # Perform question-answering
  predicted_result = qa_pipeline({
      'question': question,
      'context': context
  })
  # Ground truth (the correct answer)
  if goldAnswer=="":
    ground_truth = "Answer Unavailable"
  else:
    ground_truth = goldAnswer
  # Get the predicted answer
  predicted_answer = predicted_result['answer']
  # Compute Exact Match and F1 Score
  em_score = exact_match_score(predicted_answer, ground_truth)
  f1 = f1_score(predicted_answer, predicted_answer)
  return(f"'Machine Answer': {predicted_result['answer']}"+"   Vs  'Human Answer':"+ground_truth), (f"Exact Match: {em_score}"), (f"F1 Score: {f1}")

demo = gr.Interface(
    fn=EM_ScoreF1,
    inputs=["text", "text","text"],
    outputs=["text","text","text"],
)

demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://abf779b26ed3157be4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [45]:
pip install gradio


Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Col

In [None]:



# Perform question-answering
predicted_result = qa_pipeline({
    'question': question,
    'context': context
})

# Ground truth (the correct answer)
ground_truth = csquad['validation'][index]['answers']['text'][0]

# Get the predicted answer
predicted_answer = predicted_result['answer']

# Compute Exact Match and F1 Score
em_score = exact_match_score(predicted_answer, ground_truth)
f1 = f1_score(predicted_answer, ground_truth)

print(f"Exact Match: {em_score}")
print(f"F1 Score: {f1}")

Exact Match: True
F1 Score: 1.0
