In [3]:
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import Dataset

In [4]:
from google.colab import files

uploaded = files.upload()

Saving dev-v1.1.json to dev-v1.1 (1).json
Saving train-v1.1.json to train-v1.1 (1).json


In [5]:
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

train_data = load_data('train-v1.1.json')  # Use the filename you uploaded
dev_data = load_data('dev-v1.1.json')

In [6]:
def extract_data(input_data):
    contexts, questions, answers = [], [], []

    for doc in input_data['data']:
        for para in doc['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                question = qa['question']
                answer = qa['answers'][0]

                contexts.append(context)
                questions.append(question)
                answers.append({
                    'text': answer['text'],
                    'answer_start': answer['answer_start']
                })

    return Dataset.from_dict({
        'context': contexts,
        'question': questions,
        'answers': answers
    })

In [7]:
train_dataset = extract_data(train_data)
dev_dataset = extract_data(dev_data)

In [8]:
#initialising tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
#tokenization and encoding
def encode_samples(batch):
    questions_cleaned = [q.strip() for q in batch['question']]
    encodings = tokenizer(
        questions_cleaned,
        batch['context'],
        max_length=384,
        truncation=True,
        padding='max_length',
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    start_positions = np.array([ans['answer_start'] for ans in batch['answers']])
    end_positions = np.array([ans['answer_start'] + len(ans['text']) for ans in batch['answers']])

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions,
    })

    return encodings

In [10]:
encoded_train = train_dataset.map(encode_samples, batched=True)
encoded_dev = dev_dataset.map(encode_samples, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

training_args = TrainingArguments(
    output_dir='./outputs',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_dev
)

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,5.2096,5.0685
2,4.8526,4.83266


TrainOutput(global_step=10950, training_loss=5.1866908800656395, metrics={'train_runtime': 6872.2697, 'train_samples_per_second': 25.493, 'train_steps_per_second': 1.593, 'total_flos': 1.7167621364554752e+16, 'train_loss': 5.1866908800656395, 'epoch': 2.0})

In [16]:
def calculate_token_iou(predicted_span, actual_span):
    predicted_range = set(range(predicted_span['start'], predicted_span['end']))
    actual_range = set(range(actual_span['start'], actual_span['end']))

    intersection = predicted_range.intersection(actual_range)
    union = predicted_range.union(actual_range)

    return len(intersection) / len(union) if union else 0

In [20]:
from transformers import pipeline

qa_pipeline = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0
)

In [21]:
#example
context = 'Jack lives on Mars.'
question = 'Where does Jack live?'

In [22]:
prediction_output = qa_pipeline({
    'context': context,
    'question': question
})

In [23]:
#extracting predicted answer and span information
predicted_answer_text = prediction_output['answer']
predicted_start_pos = prediction_output['start']
predicted_end_pos = prediction_output['end']

In [24]:
#true answer for comparison
true_answer = 'Mars'
true_start_pos = context.find(true_answer)
true_end_pos = true_start_pos + len(true_answer)

In [25]:
predicted_span = {'start': predicted_start_pos, 'end': predicted_end_pos}
actual_span = {'start': true_start_pos, 'end': true_end_pos}

In [26]:
token_iou = calculate_token_iou(predicted_span, actual_span)

In [33]:
print('Predicted Answer:', predicted_answer_text)
print('True Answer:', true_answer)
print('IoU Score:', token_iou)

Predicted Answer: Mars.
True Answer: Mars
IoU Score: 0.8
