In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    pipeline
)

In [2]:
# 1) Load & flatten JSON
# Read the "data" field of the JSON file and flatten the nested
# record -> paragraphs -> qas layout into one row per question.
raw_ds = load_dataset("json", data_files="f1_gp_qa.json", field="data")
split_name = list(raw_ds.keys())[0]
raw = raw_ds[split_name]
rows = []
for rec in raw:
    for para in rec["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            answers = qa["answers"]
            if not answers:
                # A question without any answer has no span to learn from;
                # indexing [0] here would raise IndexError.
                continue
            first_answer = answers[0]
            text = first_answer["text"]
            # `.get(key, default)` only falls back when the key is absent.
            # A key that is present but null (which may happen when only
            # some records carry "answer_start") must also fall back to
            # searching the context, otherwise None reaches the tokenizer
            # step and breaks the character-offset comparisons there.
            start = first_answer.get("answer_start")
            if start is None:
                start = ctx.find(text)  # -1 when the text is not in ctx
            rows.append({
                "context": ctx,
                "question": qa["question"],
                "answer_text": text,
                "answer_start": start
            })
flat = Dataset.from_list(rows)

In [3]:
# 2) Hold out 20% of the flattened rows for validation (fixed seed)
split = flat.train_test_split(seed=42, test_size=0.2)
train_ds, val_ds = split["train"], split["test"]

In [4]:
# 3) Drop hand-picked validation rows, identified by their position in val_ds
bad_indices = {12, 23, 27, 31, 37, 52, 60}


def _is_good_row(_, idx):
    """Keep a row unless its positional index is listed in bad_indices."""
    return idx not in bad_indices


good_val_ds = val_ds.filter(_is_good_row, with_indices=True)

Filter:   0%|          | 0/72 [00:00<?, ? examples/s]

In [5]:
# 4) Load the Longformer (4096-token) tokenizer and a QA model from one checkpoint
checkpoint = "allenai/longformer-base-4096"
tokenizer, model = (
    AutoTokenizer.from_pretrained(checkpoint),
    AutoModelForQuestionAnswering.from_pretrained(checkpoint),
)

Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def prepare_features(examples):
    """Tokenize a batch of QA examples and attach token-level answer labels.

    Args:
        examples: batch dict with parallel lists under "question",
            "context", "answer_text" and "answer_start" (character offset
            of the answer inside the context; negative or None when the
            answer could not be located).

    Returns:
        The tokenizer output with "offset_mapping" removed and with
        "start_positions" / "end_positions" added, one entry per tokenized
        window. A window that does not fully contain the answer is labelled
        (0, 0), i.e. the first token of the sequence.
    """
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        max_length=4096,
        truncation="only_second",
        stride=512,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    sample_map = tokenized["overflow_to_sample_mapping"]
    starts, ends = [], []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        sample_idx = sample_map[i]
        start_char = examples["answer_start"][sample_idx]
        if start_char is None or start_char < 0:
            # Answer was never located in the context: nothing to point at.
            starts.append(0)
            ends.append(0)
            continue
        end_char = start_char + len(examples["answer_text"][sample_idx])

        # Offsets of question, special and padding tokens are not context
        # character positions, so the search must be limited to the tokens
        # of the second sequence (sequence id 1).
        seq_ids = tokenized.sequence_ids(i)
        context_tokens = [k for k, sid in enumerate(seq_ids) if sid == 1]
        if not context_tokens:
            starts.append(0)
            ends.append(0)
            continue
        ctx_first, ctx_last = context_tokens[0], context_tokens[-1]

        # With overflow windows the answer may lie outside this window.
        if offsets[ctx_first][0] > start_char or offsets[ctx_last][1] < end_char:
            starts.append(0)
            ends.append(0)
            continue

        # Last context token that begins at or before the answer start.
        s = ctx_first
        while s <= ctx_last and offsets[s][0] <= start_char:
            s += 1
        starts.append(s - 1)

        # First context token that ends at or after the answer end.
        e = ctx_last
        while e >= ctx_first and offsets[e][1] >= end_char:
            e -= 1
        ends.append(e + 1)
    tokenized["start_positions"] = starts
    tokenized["end_positions"]   = ends
    tokenized.pop("offset_mapping")
    return tokenized

In [7]:
# 6) Run prepare_features over the train split and the filtered validation split
def _tokenize_split(ds):
    """Map prepare_features over ds in batches, dropping its original columns."""
    return ds.map(
        prepare_features,
        batched=True,
        remove_columns=ds.column_names,
    )


train_tok = _tokenize_split(train_ds)
val_tok = _tokenize_split(good_val_ds)
datasets = DatasetDict({"train": train_tok, "validation": val_tok})

Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [8]:
# 7) Training configuration; the best checkpoint is the one with the lowest eval_loss
# Evaluation / checkpoint / logging cadence and best-model selection.
_schedule = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
# Batch sizing, precision and optimizer hyper-parameters.
_optimization = dict(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=True,
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
)
training_args = TrainingArguments(
    output_dir="./lf_drop_bad",
    overwrite_output_dir=True,
    **_schedule,
    **_optimization,
)


In [None]:
# 8) Build the Trainer and fine-tune
# NOTE(review): the stray "trainer = Trainer(" line in this cell's output
# looks like the tail of a deprecation warning for the `tokenizer=` argument;
# recent transformers releases take the tokenizer as `processing_class=`.
# Confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    processing_class=tokenizer
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
