# Fine-tuning DistilBERT for question answering

This guide describes fine-tuning DistilBERT on the Stanford Question Answering Dataset (SQuAD) for question answering using Kubeflow Trainer.

This guide is adapted from the Hugging Face question-answering task guide: https://huggingface.co/docs/transformers/en/tasks/question_answering

Pretrained DistilBERT: https://huggingface.co/docs/transformers/en/model_doc/distilbert

SQuAD dataset: https://huggingface.co/datasets/rajpurkar/squad

# Install the Kubeflow SDK and dependencies

In [1]:
# TODO: Change the version of SDK when we have the first release of Trainer SDK
!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk
!pip install "cloudpathlib[gs]" "transformers[torch]"

Collecting git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk
  Cloning https://github.com/kubeflow/trainer.git (to revision master) to /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se
  Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/trainer.git /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se
  Resolved https://github.com/kubeflow/trainer.git to commit 3781eda0e675c655d03bc4cb84cce4362f601e44
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


# Define the HuggingFace training script

We need to wrap our training script into a function to create the Kubeflow TrainJob.

In [2]:
def train_distilbert(args):
    """Fine-tune DistilBERT on a SQuAD subset and upload the output to GCS.

    All imports are kept inside the function so that it stays self-contained
    when it is passed as ``func`` to ``CustomTrainer``.

    Args:
        args: Mapping with two string entries:
            ``"MODEL_NAME"`` - local output directory of the HuggingFace
            ``Trainer`` and the object prefix used inside the bucket;
            ``"BUCKET"`` - name of the GCS bucket that receives the output.
    """
    import os

    from cloudpathlib import CloudPath
    from datasets import load_dataset
    import torch
    from transformers import (
        AutoModelForQuestionAnswering,
        AutoTokenizer,
        DefaultDataCollator,
        Trainer,
        TrainingArguments,
    )

    import torch.distributed as dist

    # Initialize distributed environment: NCCL on GPU, Gloo on CPU.
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)

    local_rank = int(os.getenv("LOCAL_RANK", 0))
    print(
        f"Distributed Training with WORLD_SIZE: {dist.get_world_size()}, "
        f"RANK: {dist.get_rank()}, LOCAL_RANK: {local_rank}."
    )

    # Download the dataset and tokenizer
    squad = load_dataset("squad", split="train[:5000]")

    # Use a fixed seed so that every rank builds the same train/test split.
    # An unseeded shuffle differs per process, which would let one rank's
    # evaluation examples end up in another rank's training shard.
    squad = squad.train_test_split(test_size=0.2, seed=42)

    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

    # Define the preprocessing function
    def preprocess_function(examples):
        """Tokenize question/context pairs and label answer token spans."""
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0).
            # Both answer boundaries must lie within the (possibly truncated)
            # context; otherwise the end label below would fall on the
            # separator token that follows the context.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    # Apply the preprocessing function to the dataset
    tokenized_squad = squad.map(
        preprocess_function,
        batched=True,
        remove_columns=squad["train"].column_names,
    )

    # Create a batch of examples using DefaultDataCollator
    data_collator = DefaultDataCollator()

    # Load the model
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

    # Define training hyperparameters
    training_args = TrainingArguments(
        output_dir=args["MODEL_NAME"],
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
    )

    # Prepare trainer with configuration
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_squad["train"],
        eval_dataset=tokenized_squad["test"],
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # By default the HuggingFace Trainer writes checkpoints only on the main
    # process (save_on_each_node=False), so only rank 0 has output to upload.
    if dist.get_rank() == 0:
        CloudPath(f'gs://{args["BUCKET"]}/{args["MODEL_NAME"]}').upload_from(args["MODEL_NAME"])

In [3]:
from kubeflow.trainer import TrainerClient, CustomTrainer

# Print the name, framework, trainer type, and the first three entrypoint
# tokens of each runtime returned by list_runtimes().
available_runtimes = TrainerClient().list_runtimes()
for r in available_runtimes:
    print(f"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}")
    print(f"Entrypoint: {r.trainer.entrypoint[:3]}")



Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer
Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']

Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer
Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']

Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer
Entrypoint: ['torchrun']

Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer
Entrypoint: ['torchrun']



In [4]:
# Destination of the training output: gs://<BUCKET>/<MODEL_NAME>.
BUCKET = "tmp-kftrainer"
MODEL_NAME = "qa-distilbert"
args = {"BUCKET": BUCKET, "MODEL_NAME": MODEL_NAME}

# Resources requested for each of the two training nodes.
node_resources = {
    "cpu": "3",
    "memory": "8Gi",
    "nvidia.com/gpu": 1,
}

# Wrap the training function, its arguments, and the packages it imports.
custom_trainer = CustomTrainer(
    func=train_distilbert,
    func_args=args,
    num_nodes=2,
    packages_to_install=["datasets", "transformers[torch]", "cloudpathlib[gs]"],
    resources_per_node=node_resources,
)

# Submit the TrainJob and keep the returned id for the cells below.
job_id = TrainerClient().train(trainer=custom_trainer)

In [5]:
# The train API generates a random TrainJob id and returns it; the id is
# reused below to inspect the job, read its logs, and delete it.
job_id

'ac43d22fc37e'

# Check the TrainJob details

Use the `list_jobs()` and `get_job()` APIs to get details about the created TrainJob and its steps.

In [12]:
# Summarize every TrainJob returned by list_jobs(): name, status, creation time.
all_jobs = TrainerClient().list_jobs()
for job in all_jobs:
    print(f"TrainJob: {job.name}, Status: {job.status}, Created at: {job.creation_timestamp}")

TrainJob: ac43d22fc37e, Status: Created, Created at: 2025-04-01 15:34:40+00:00


In [15]:
# Print each step of the TrainJob with its status and devices.
# In the output below there is one step per training node (node-0, node-1).
for c in TrainerClient().get_job(name=job_id).steps:
    print(f"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}")

Step: node-0, Status: Running, Devices: gpu x 1
Step: node-1, Status: Running, Devices: gpu x 1


# Show the TrainJob logs

Use the `get_job_logs()` API to retrieve the TrainJob logs.

In [16]:
# Fetch the logs of the TrainJob; the return value is assigned to `_` so the
# notebook does not echo it after the call.
_ = TrainerClient().get_job_logs(name=job_id, follow=True)

[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.
Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 542991.51 examples/s]
Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 505234.17 examples/s]
Map: 100%|██████████| 4000/4000 [00:02<00:00, 1966.78 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1873.66 examples/s]
[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 125/375 [01:01<01:45

# Inference

Download the model and run inference on some examples.

In [23]:
from cloudpathlib import CloudPath
from pathlib import Path

# Download from the same bucket and prefix that the training job uploaded to,
# so that changing BUCKET or MODEL_NAME above keeps this cell working.
_ = CloudPath(f"gs://{BUCKET}/{MODEL_NAME}").download_to(MODEL_NAME)

In [21]:
from transformers import pipeline

# Example question and the passage that contains its answer.
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

# Load the downloaded checkpoint. The "375" suffix matches the total number of
# training steps reported in the log above (125/375 after the first epoch);
# adjust it if you change the dataset size, batch size, epochs, or node count.
question_answerer = pipeline("question-answering", model=f"./{MODEL_NAME}/checkpoint-375")
# The last expression is displayed by the notebook as the prediction dict.
question_answerer(question=question, context=context)

Device set to use mps:0


{'score': 0.13226985931396484,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}

# Clean up

To delete the TrainJob you can use the `delete_job()` API and pass the generated `job_id`.

In [22]:
# Delete the TrainJob created above; the return value is assigned to `_` so
# the notebook does not display it.
_ = TrainerClient().delete_job(job_id)