In [1]:
!pip install transformers datasets evaluate


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from datasets import load_dataset
import pandas as pd

# Load SQuAD dataset
dataset = load_dataset("squad")

# Convert to DataFrames
train_df = pd.DataFrame(dataset["train"])
val_df = pd.DataFrame(dataset["validation"])

# Save to Google Drive as CSV
train_df.to_csv("/content/drive/MyDrive/squad_train.csv", index=False)
val_df.to_csv("/content/drive/MyDrive/squad_validation.csv", index=False)

print("Datasets saved to Drive successfully!")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Datasets saved to Drive successfully!


In [3]:
#Load Tokenizer & BERT Model
from transformers import BertTokenizerFast, BertForQuestionAnswering

# Load pretrained tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Tokenize Dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully contained in the context, label it with 0 (token ID for [CLS])
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize train and validation sets
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
# Save Tokenized Dataset to Disk or Google Drive
tokenized_train.save_to_disk("/content/drive/MyDrive/tokenized_squad_train")
tokenized_val.save_to_disk("/content/drive/MyDrive/tokenized_squad_val")


Saving the dataset (0/1 shards):   0%|          | 0/88524 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10784 [00:00<?, ? examples/s]

In [4]:
#Load It Next Time Without Re-Tokenizing
from datasets import load_from_disk

tokenized_train = load_from_disk("/content/drive/MyDrive/tokenized_squad_train")
tokenized_val = load_from_disk("/content/drive/MyDrive/tokenized_squad_val")


In [25]:
all_train_questions = dataset["train"]["question"]

print(f"Total number of questions: {len(all_train_questions)}\n")

# Print first 100 questions with numbering
for i, question in enumerate(all_train_questions[:100], start=1):
    print(f"{i}. {question}")


Total number of questions: 87599

1. To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
2. What is in front of the Notre Dame Main Building?
3. The Basilica of the Sacred heart at Notre Dame is beside to which structure?
4. What is the Grotto at Notre Dame?
5. What sits on top of the Main Building at Notre Dame?
6. When did the Scholastic Magazine of Notre dame begin publishing?
7. How often is Notre Dame's the Juggler published?
8. What is the daily student paper at Notre Dame called?
9. How many student news papers are found at Notre Dame?
10. In what year did the student paper Common Sense begin publication at Notre Dame?
11. Where is the headquarters of the Congregation of the Holy Cross?
12. What is the primary seminary of the Congregation of the Holy Cross?
13. What is the oldest structure at Notre Dame?
14. What individuals live at Fatima House at Notre Dame?
15. Which prize did Frederick Buechner create?
16. How many BS level degrees are offered in the Coll

In [26]:
val_questions = dataset["validation"]["question"]

print(f"Total validation questions: {len(val_questions)}\n")

for i, question in enumerate(val_questions[:10], start=1):
    print(f"{i}. {question}")


Total validation questions: 10570

1. Which NFL team represented the AFC at Super Bowl 50?
2. Which NFL team represented the NFC at Super Bowl 50?
3. Where did Super Bowl 50 take place?
4. Which NFL team won Super Bowl 50?
5. What color was used to emphasize the 50th anniversary of the Super Bowl?
6. What was the theme of Super Bowl 50?
7. What day was the game played on?
8. What is the AFC short for?
9. What was the theme of Super Bowl 50?
10. What does AFC stand for?


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-qa",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100
)

In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkoushikimadishetti[0m ([33mkoushikimadishetti-jntuhucej[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Step,Training Loss
100,4.3954
200,2.8174
300,2.4087
400,1.9281
500,1.9305
600,1.7245
700,1.6031
800,1.5476
900,1.6153
1000,1.5749


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Step,Training Loss
100,4.3954
200,2.8174
300,2.4087
400,1.9281
500,1.9305
600,1.7245
700,1.6031
800,1.5476
900,1.6153
1000,1.5749


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=11066, training_loss=1.2635049073622453, metrics={'train_runtime': 7090.545, 'train_samples_per_second': 12.485, 'train_steps_per_second': 1.561, 'total_flos': 1.7348275569973248e+16, 'train_loss': 1.2635049073622453, 'epoch': 1.0})

In [8]:
#Save Your Model and Tokenizer
# Define the directory on your Google Drive to save the model
model_dir = "/content/drive/MyDrive/bert_qa_model"

# Save the trained model
trainer.save_model(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

print("✅ Model and tokenizer saved successfully to Google Drive.")


✅ Model and tokenizer saved successfully to Google Drive.


In [None]:
#To Load Later for Inference
from transformers import BertForQuestionAnswering, BertTokenizerFast

model_dir = "/content/drive/MyDrive/bert_qa_model"

model = BertForQuestionAnswering.from_pretrained(model_dir)
tokenizer = BertTokenizerFast.from_pretrained(model_dir)


In [10]:
#Save TrainingArguments for Later Use
# training_args.save("/content/drive/MyDrive/bert_qa_model/training_args.bin")

import json
# Convert the TrainingArguments object to a dictionary
training_args_dict = training_args.to_dict()

# Define the path to save the JSON file
training_args_path = "/content/drive/MyDrive/bert_qa_model/training_args.json"

# Save the dictionary as a JSON file
with open(training_args_path, "w") as f:
    json.dump(training_args_dict, f, indent=4)

print(f"✅ Training arguments saved successfully to {training_args_path}")

✅ Training arguments saved successfully to /content/drive/MyDrive/bert_qa_model/training_args.json


In [13]:
#Define QA Prediction Function
import torch

def predict_answer(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True, max_length=512)

    # Move inputs to the same device as the model
    device = model.device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits)

    # Move the input_ids back to CPU for token decoding
    input_ids = inputs["input_ids"][0].cpu()
    answer_tokens = input_ids[start_idx:end_idx + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

In [14]:
# Test on Multiple Questions
predictions = []

# Testing on first 100 validation samples
for i in range(100):
    question = dataset["validation"][i]["question"]
    context = dataset["validation"][i]["context"]
    true_answer = dataset["validation"][i]["answers"]["text"][0]

    predicted = predict_answer(question, context)

    predictions.append({
        "question": question,
        "predicted_answer": predicted,
        "true_answer": true_answer
    })

# Print first 5 predictions
for i in range(5):
    print(f"\nQ: {predictions[i]['question']}")
    print(f"Predicted: {predictions[i]['predicted_answer']}")
    print(f"True: {predictions[i]['true_answer']}")


  return forward_call(*args, **kwargs)



Q: Which NFL team represented the AFC at Super Bowl 50?
Predicted: denver broncos
True: Denver Broncos

Q: Which NFL team represented the NFC at Super Bowl 50?
Predicted: carolina panthers
True: Carolina Panthers

Q: Where did Super Bowl 50 take place?
Predicted: 
True: Santa Clara, California

Q: Which NFL team won Super Bowl 50?
Predicted: denver broncos
True: Denver Broncos

Q: What color was used to emphasize the 50th anniversary of the Super Bowl?
Predicted: gold
True: gold


In [15]:
#Evaluate Model (EM & F1)
import evaluate

metric = evaluate.load("squad")

references = [{"id": str(i), "answers": dataset["validation"][i]["answers"]} for i in range(100)]
preds = [{"id": str(i), "prediction_text": predictions[i]["predicted_answer"]} for i in range(100)]

results = metric.compute(predictions=preds, references=references)

print("\nEvaluation Metrics:")
print(f"Exact Match (EM): {results['exact_match']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


Evaluation Metrics:
Exact Match (EM): 75.00
F1 Score: 82.16


In [22]:
#Inference Code to Test the Model
def test_custom_qa_loop():
    while True:
        context = input("\nEnter the context passage (or type 'quit' to exit): ").strip()
        if context.lower() == 'quit':
            print("Exiting the QA system.")
            break
        if not context:
            print("Context cannot be empty. Please enter a valid passage.")
            continue

        while True:
            question = input("\nAsk a question about the above context (or type 'done' to input new context): ").strip()
            if question.lower() == 'done':
                break
            if not question:
                print("Question cannot be empty. Please enter a valid question.")
                continue

            predicted_answer = predict_answer(question, context)
            print(f"\nQ: {question}")
            print(f"A: {predicted_answer}")

        cont = input("\nDo you want to continue with another context? (yes/no): ").strip().lower()
        if cont not in ['yes', 'y']:
            print("Goodbye!")
            break

# Run the QA loop
test_custom_qa_loop()




Enter the context passage (or type 'quit' to exit): The Amazon Rainforest, also known as Amazonia, is a vast tropical rainforest in South America, spanning across nine countries including Brazil, Peru, and Colombia. It is home to over 400 billion individual trees and is one of the most biodiverse places on Earth. The Amazon plays a vital role in regulating the planet’s oxygen and carbon cycles. However, it is under severe threat due to deforestation, illegal logging, and climate change.

Ask a question about the above context (or type 'done' to input new context): What is another name for the Amazon Rainforest?

Q: What is another name for the Amazon Rainforest?
A: amazonia

Ask a question about the above context (or type 'done' to input new context): How many trees are estimated to be in the Amazon?

Q: How many trees are estimated to be in the Amazon?
A: over 400 billion

Ask a question about the above context (or type 'done' to input new context): What are the main threats to the A