In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [2]:
import json

# Define the file paths
train_path = '/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'
dev_path = '/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json'

# Open and load the JSON files
with open(train_path, "r") as file:
    train = json.load(file)

with open(dev_path, "r") as file:
    dev = json.load(file)


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, load_metric, Dataset

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(question, context, answer_start_char, answer_end_char):
    inputs = tokenizer(
        question,
        context,
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset = inputs.pop("offset_mapping")
    sequence_ids = inputs.sequence_ids()

    context_start, context_end = -1, -1

    # Add logic to find the token indices for context start and context end using `sequence_ids``.
    for i, seq_id in enumerate(sequence_ids):
      if seq_id==1 and context_start== -1:
        context_start=i
      elif seq_id != 1 and context_start != -1:
        context_end =i
        break
    if context_end ==-1:
      context_end = len(sequence_ids)


    context_offsets = offset[context_start: context_end]

    # Create a mapping of charcter index to token index.
    charcter_pos_to_token_pos = {}
    for token_pos, (char_start, char_end) in enumerate(context_offsets):
        for char_pos in range(char_start, char_end):
          charcter_pos_to_token_pos[char_pos] = token_pos+context_start

    start_pos = charcter_pos_to_token_pos.get(answer_start_char, 0)
    end_pos = charcter_pos_to_token_pos.get(
        answer_end_char - 1,
        0 if start_pos == 0 else config['max_length'] - 1
    )

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos

    return inputs

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
def preprocess_data(examples):
    preprocessed_examples = []

    # Iterate through each article in the dataset
    for article in examples['data']:
        # Iterate through each paragraph in the article
        for paragraph in article['paragraphs']:
            context = paragraph['context']  # Extract the context text

            # Iterate through each question-answer (QA) pair in the paragraph
            for qa in paragraph['qas']:
                question = qa['question']  # Extract the question text
                answers = qa['answers']    # Extract the list of answers

                # For each answer, get the start and end positions
                for answer in answers:
                    start_char = answer['answer_start']
                    end_char = start_char + len(answer['text'])

                    # Preprocess and append the example
                    preprocessed_examples.append(preprocess_function(question, context, start_char, end_char))

    return preprocessed_examples

In [6]:
config = {
    "max_length": 384,  # You can adjust this value based on your requirements
    "doc_stride": 128,   # This is often used for splitting long contexts
    "batch_size": 16,    # Adjust based on your hardware capacity
    "epochs": 3,         # Number of training epochs
    "learning_rate": 3e-5, # Learning rate for the optimizer
}

In [7]:
preprocessed_train_data = preprocess_data(train)
preprocessed_dev_data = preprocess_data(dev)

train_dataset = Dataset.from_pandas(pd.DataFrame(preprocessed_train_data))
dev_dataset = Dataset.from_pandas(pd.DataFrame(preprocessed_dev_data))

In [8]:
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Token Level IoU
from transformers import EvalPrediction

def compute_token_level_iou(eval_pred: EvalPrediction):

    # Unpack the predictions and label_ids
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids

    # Convert labels to a NumPy array if it's a tuple
    if isinstance(labels, tuple):
        labels = np.array(labels)

    # Assuming predictions are logits for start and end positions
    # Split the predictions into start and end logits
    start_logits, end_logits = predictions
    # Convert logits to predicted start and end positions
    pred_starts = np.argmax(start_logits, axis=1)
    pred_ends = np.argmax(end_logits, axis=1)

    # Extract true start and end positions from labels
    # Assuming labels contain start and end positions
    # Depending on how labels are structured, you might need to adjust this
    true_starts = labels[:, 0]
    true_ends = labels[:, 1]

    # Compute IoU for each prediction
    iou_scores = []
    for pred_start, pred_end, true_start, true_end in zip(pred_starts, pred_ends, true_starts, true_ends):
        # Calculate intersection
        intersection_start = max(pred_start, true_start)
        intersection_end = min(pred_end, true_end)
        intersection = max(0, intersection_end - intersection_start + 1)

        # Calculate union
        union_start = min(pred_start, true_start)
        union_end = max(pred_end, true_end)
        union = union_end - union_start + 1

        # Compute IoU
        iou = intersection / union if union > 0 else 0.0
        iou_scores.append(iou)

    # Calculate the average IoU over all samples
    average_iou = np.mean(iou_scores)

    return {"token_level_IoU": average_iou}

In [12]:
import wandb

wandb.login(key='225dc77b79b3ab5078f63b1794f235d7b2ccca9a')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    run_name='Question_Answering',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_token_level_iou)



In [14]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mwangdeismail[0m ([33mwangdeismail-mpstme[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114631533332107, max=1.0…

Epoch,Training Loss,Validation Loss,Token Level Iou
1,1.2355,1.254838,0.5


TrainOutput(global_step=5475, training_loss=1.538253870576484, metrics={'train_runtime': 2051.7759, 'train_samples_per_second': 42.694, 'train_steps_per_second': 2.668, 'total_flos': 8583810682277376.0, 'train_loss': 1.538253870576484, 'epoch': 1.0})

In [15]:
# Inference pipeline

def answer_question(question, context):
    # Check the device of the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Prepare inputs and move to the correct device
    inputs = tokenizer(question, context, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start:answer_end])
    )

    return answer

# Example question
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."
print(answer_question(question, context))

paris


In [16]:
trainer.save_model()  # Save the model to a specific directory


In [17]:
import os

for dirname, _, filenames in os.walk('./results'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


./results/model.safetensors
./results/vocab.txt
./results/tokenizer_config.json
./results/training_args.bin
./results/config.json
./results/tokenizer.json
./results/special_tokens_map.json
./results/runs/Sep06_05-41-14_eab0a071fa89/events.out.tfevents.1725601277.eab0a071fa89.37.0
./results/runs/Sep06_05-55-52_eab0a071fa89/events.out.tfevents.1725602158.eab0a071fa89.37.1
./results/checkpoint-5475/model.safetensors
./results/checkpoint-5475/vocab.txt
./results/checkpoint-5475/tokenizer_config.json
./results/checkpoint-5475/training_args.bin
./results/checkpoint-5475/config.json
./results/checkpoint-5475/tokenizer.json
./results/checkpoint-5475/trainer_state.json
./results/checkpoint-5475/rng_state.pth
./results/checkpoint-5475/special_tokens_map.json
./results/checkpoint-5475/optimizer.pt
./results/checkpoint-5475/scheduler.pt
./results/checkpoint-1500/model.safetensors
./results/checkpoint-1500/vocab.txt
./results/checkpoint-1500/tokenizer_config.json
./results/checkpoint-1500/training_