In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

# 1. Load & flatten SQuAD JSON
raw = load_dataset(
    "json",
    data_files={"full": "f1_gp_qa.json"},
    field="data"
)["full"]

rows = []
for item in raw:
    for para in item["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            rows.append({
                "context":     ctx,
                "question":    qa["question"],
                "answer_start":qa["answers"][0]["answer_start"],
                "answer_text": qa["answers"][0]["text"],
            })

flat_ds = Dataset.from_list(rows)

# 2. Split into train/validation (80/20)
split = flat_ds.train_test_split(test_size=0.2, seed=42)
train_ds = split["train"]
val_ds   = split["test"]


tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# — 2) Define the robust prepare_features with fallback + clamp —
def prepare_features(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",      # only truncate the context
        stride=128,                    # optional: use sliding windows
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    cls_id = tokenizer.cls_token_id
    starts, ends = [], []

    # iterate over each window
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        sample_idx  = tokenized["overflow_to_sample_mapping"][i]
        start_char  = examples["answer_start"][sample_idx]
        answer_text = examples["answer_text"][sample_idx]
        end_char    = start_char + len(answer_text)

        # token window covers chars from offsets[0][0] to offsets[-1][1]
        window_start, window_end = offsets[0][0], offsets[-1][1]

        if not (window_start <= start_char < window_end):
            # answer fell outside this window → point at CLS
            raw_s = raw_e = tokenized["input_ids"][i].index(cls_id)
        else:
            # find exact token span inside this window
            raw_s = next(
                idx for idx, (st, ed) in enumerate(offsets)
                if st <= start_char < ed
            )
            raw_e = next(
                idx for idx, (st, ed) in reversed(list(enumerate(offsets)))
                if st < end_char <= ed
            )

        # clamp to [0, seq_len-1]
        seq_len = len(offsets)
        s = max(0, min(raw_s, seq_len - 1))
        e = max(0, min(raw_e, seq_len - 1))

        starts.append(s)
        ends.append(e)

    tokenized["start_positions"] = starts
    tokenized["end_positions"]   = ends
    tokenized.pop("offset_mapping")
    return tokenized

# — 3) Apply to train & validation splits —
train_tokenized = train_ds.map(
    prepare_features,
    batched=True,
    remove_columns=["context", "question", "answer_start", "answer_text"]
)
val_tokenized = val_ds.map(
    prepare_features,
    batched=True,
    remove_columns=["context", "question", "answer_start", "answer_text"]
)

print(f"▶️ Train examples: {len(train_tokenized)}")
print(f"▶️ Val   examples: {len(val_tokenized)}")

Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

▶️ Train examples: 4121
▶️ Val   examples: 1040


In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer
)

In [3]:
model_name = "roberta-base"

# 2) Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
training_args = TrainingArguments(
    output_dir=f"./{model_name}_qa_model",
    overwrite_output_dir=True,

    do_train=True,
    do_eval=True,

    # Legacy strategy names to match older transformers versions:
    eval_strategy="steps",   # run validation every eval_steps
    save_strategy="steps",   # checkpoint every save_steps
    eval_steps=500,
    save_steps=500,

    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,

    load_best_model_at_end=True,
    no_cuda=False,           # set True to force CPU
    use_cpu=False            # set True to force CPU
)

In [5]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [6]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.0001,1.6e-05
1000,0.0,9e-06
1500,0.0,8e-06


TrainOutput(global_step=1548, training_loss=0.040594189073809225, metrics={'train_runtime': 443.6308, 'train_samples_per_second': 27.868, 'train_steps_per_second': 3.489, 'total_flos': 2422808852645376.0, 'train_loss': 0.040594189073809225, 'epoch': 3.0})

In [7]:
# After trainer.train() completes:
trainer.save_model("./qa_roberta_checkpoint")
# This saves both:
#  - model weights → ./qa_model_checkpoint/pytorch_model.bin
#  - config & tokenizer files → ./qa_model_checkpoint/config.json, tokenizer files…


In [1]:
print(start_logits[0][:5], end_logits[0][:5])


NameError: name 'start_logits' is not defined