In [1]:
from datasets import load_dataset, Dataset

def flatten_squad(path):
    raw = load_dataset("json", data_files=path, field="data")["train"]
    rows = []
    for item in raw:
        for p in item["paragraphs"]:
            ctx = p["context"]
            for qa in p["qas"]:
                rows.append({
                    "context":      ctx,
                    "question":     qa["question"],
                    "answer_start": qa["answers"][0]["answer_start"],
                    "answer_text":  qa["answers"][0]["text"],
                })
    return Dataset.from_list(rows)

train_ds = flatten_squad("f1_gp_qa_new.json")
val_ds   = flatten_squad("f1_gp_qa_val.json")

In [2]:
from transformers import (
    RobertaTokenizerFast,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def prepare_features(examples):
    # 1) Tokenize without overflowing
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",     # only chop the context
        max_length=512,               # full capacity, no overflow
        padding="max_length",
        return_offsets_mapping=True   # we need this to align spans
    )

    starts, ends = [], []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        start_char = examples["answer_start"][i]
        end_char   = start_char + len(examples["answer_text"][i])

        # 2) Find token_start: first token whose span covers start_char
        token_start = 0
        while (
            token_start < len(offsets) 
            and offsets[token_start][1] <= start_char
        ):
            token_start += 1

        # 3) Find token_end: last token whose span covers end_char
        token_end = len(offsets) - 1
        while (
            token_end >= 0 
            and offsets[token_end][0] >= end_char
        ):
            token_end -= 1

        # 4) Clamp into valid range
        token_start = min(max(token_start, 0), len(offsets) - 1)
        token_end   = min(max(token_end,   0), len(offsets) - 1)

        starts.append(token_start)
        ends.append(token_end)

    tokenized["start_positions"] = starts
    tokenized["end_positions"]   = ends
    tokenized.pop("offset_mapping")

    return tokenized
train_tok = train_ds.map(
    prepare_features, batched=True, remove_columns=train_ds.column_names
)
val_tok = val_ds.map(
    prepare_features, batched=True, remove_columns=val_ds.column_names
)
print(f"▶️ Train examples: {len(train_tok)}")
print(f"▶️ Val   examples: {len(val_tok)}")

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

▶️ Train examples: 124
▶️ Val   examples: 124


In [3]:
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
args  = TrainingArguments(
    output_dir="./rob-f1-qa",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=25,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer
)
trainer.train()

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.81715
2,No log,2.565747
3,No log,2.442571
4,No log,2.199984
5,No log,1.972858
6,No log,1.796567
7,No log,1.642395
8,No log,1.585462
9,No log,1.682444
10,No log,1.670906


TrainOutput(global_step=775, training_loss=1.5853458527595765, metrics={'train_runtime': 413.152, 'train_samples_per_second': 7.503, 'train_steps_per_second': 1.876, 'total_flos': 810019945881600.0, 'train_loss': 1.5853458527595765, 'epoch': 25.0})

In [4]:
def compute_f1(prediction: str, reference: str) -> float:
    pred_tokens = prediction.split()
    ref_tokens  = reference.split()
    common = set(pred_tokens) & set(ref_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall    = len(common) / len(ref_tokens)
    return 2 * (precision * recall) / (precision + recall)

# 2) Evaluate EM, F1 on val_ds
def evaluate_on_val(trainer, val_ds):
    qa_pipe = pipeline(
        "question-answering",
        model=trainer.model,
        tokenizer=trainer.tokenizer,
        device=-1  # or device=0 if you want GPU
    )
    em_scores, f1_scores = [], []
    for ex in val_ds:
        out   = qa_pipe({"question": ex["question"], "context": ex["context"]})
        pred  = out["answer"].strip()
        truth = ex["answer_text"].strip()
        em    = 1.0 if pred == truth else 0.0
        f1    = compute_f1(pred, truth)
        em_scores.append(em)
        f1_scores.append(f1)

    accuracy = sum(em_scores) / len(em_scores)
    avg_f1    = sum(f1_scores) / len(f1_scores)
    return accuracy, avg_f1

In [5]:
import math
from itertools import product
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    pipeline
)

In [None]:
learning_rates = [1e-2, 1e-3, 1e-4]
epoch_list     = [20, 25, 30]
batch_size     = 4

best_loss = math.inf
best_cfg  = None

for lr, epochs in product(learning_rates, epoch_list):
    print(f"\n▶ Trial: lr={lr}, epochs={epochs}")

    args = TrainingArguments(
        output_dir=f"./rob-lr{lr}-ep{epochs}",
        overwrite_output_dir=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        num_train_epochs=epochs,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model_init=lambda: AutoModelForQuestionAnswering.from_pretrained("roberta-base"),
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,
    )

    trainer.train()
    metrics = trainer.evaluate()
    val_loss = metrics["eval_loss"]
    accuracy, f1 = evaluate_on_val(trainer, val_ds)

    print(f"→ Val loss: {val_loss:.4f}  EM/Accuracy: {accuracy:.3f}  F1: {f1:.3f}")

    if val_loss < best_loss:
        best_loss = val_loss
        best_cfg  = {"learning_rate": lr, "epochs": epochs, "EM": accuracy, "F1": f1}

print("\nBest config:", best_cfg, f"with eval_loss={best_loss:.4f}")


▶ Trial: lr=0.01, epochs=20


  trainer = Trainer(
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,6.238326
2,6.327900,6.238326
3,6.327900,6.238326
4,6.442000,6.238326
5,6.406400,6.238326
6,6.406400,6.238326
