In [1]:
import os
import numpy as np
import pandas as pd
import evaluate

from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import torch

In [2]:
train_raw = load_dataset(
    "json",
    data_files="f1_train_new.json",
    field="data"
)["train"]

train_rows = []
for item in train_raw:
    for para in item["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            if not qa["answers"]:
                continue
            train_rows.append({
                "question":    qa["question"],
                "context":     ctx,
                "answer_text": qa["answers"][0]["text"]
            })
train_ds = Dataset.from_list(train_rows)

In [3]:
val_raw = load_dataset(
    "json",
    data_files="f1_test_new.json",
    field="data"
)["train"]

val_rows = []
for item in val_raw:
    for para in item["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            if not qa["answers"]:
                continue
            val_rows.append({
                "question":    qa["question"],
                "context":     ctx,
                "answer_text": qa["answers"][0]["text"]
            })
val_ds = Dataset.from_list(val_rows)

In [4]:
model_name = "google/flan-t5-base"
tokenizer  = T5TokenizerFast.from_pretrained(model_name)
model      = T5ForConditionalGeneration.from_pretrained(model_name)

In [5]:
max_input_length  = 512
max_target_length = 32

def preprocess_fn(examples):
    inputs = [
        f"question: {q}  context: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    mi = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        lbl = tokenizer(
            examples["answer_text"],
            max_length=32,
            truncation=True,
            padding="max_length"
        )
    # Here’s the critical bit: convert any possible numpy ints -> native Python ints
    mi_input_ids = [[int(x) for x in seq] for seq in mi["input_ids"]]
    mi_attention = [[int(x) for x in seq] for seq in mi["attention_mask"]]
    lbl_ids      = [[int(x) for x in seq] for seq in lbl["input_ids"]]

    return {
        "input_ids":      mi_input_ids,
        "attention_mask": mi_attention,
        "labels":         lbl_ids,
    }

In [6]:
train_proc = train_ds.map(
    preprocess_fn,
    batched=True,
    remove_columns=train_ds.column_names
)
val_proc = val_ds.map(
    preprocess_fn,
    batched=True,
    remove_columns=val_ds.column_names
)

Map:   0%|          | 0/124 [00:00<?, ? examples/s]



Map:   0%|          | 0/124 [00:00<?, ? examples/s]

In [7]:
train_proc.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])
val_proc  .set_format(type="python", columns=["input_ids", "attention_mask", "labels"])
model.config.label_pad_token_id = -100

In [8]:
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
def collate_fn(batch):
    # batch is a list of dicts each with keys: input_ids, attention_mask, labels
    input_ids = [torch.tensor(example["input_ids"],      dtype=torch.long) for example in batch]
    attention_mask = [torch.tensor(example["attention_mask"], dtype=torch.long) for example in batch]
    labels     = [torch.tensor(example["labels"],        dtype=torch.long) for example in batch]

    # pad to the max length in this batch
    batch_input_ids      = pad_sequence(input_ids,      batch_first=True, padding_value=tokenizer.pad_token_id)
    batch_attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    # pad labels with -100 so CrossEntropyLoss ignores them
    batch_labels         = pad_sequence(labels,       batch_first=True, padding_value=-100)

    return {
        "input_ids":      batch_input_ids,
        "attention_mask": batch_attention_mask,
        "labels":         batch_labels,
    }

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [10]:
bleu_metric  = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
chrf_metric  = evaluate.load("chrf")

In [11]:
import numpy as np
import torch

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # 1) If generate returned (preds, scores), grab the first element
    if isinstance(preds, tuple):
        preds = preds[0]

    # 2) Ensure preds is a NumPy array of ints
    if isinstance(preds, torch.Tensor):
        preds = preds.cpu().numpy()
    else:
        preds = np.array(preds)

    # 3) Clamp to valid token‐ID range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1).astype(int)

    # 4) Decode predictions
    decoded_preds = tokenizer.batch_decode(
        preds.tolist(),
        skip_special_tokens=True
    )

    # 5) Prepare labels the same way
    if isinstance(labels, tuple):  # sometimes returned as (labels, _)
        labels = labels[0]
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    else:
        labels = np.array(labels)

    # replace -100 with pad_token_id, then clamp
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = np.clip(labels, 0, tokenizer.vocab_size - 1).astype(int)

    decoded_labels = tokenizer.batch_decode(
        labels.tolist(),
        skip_special_tokens=True
    )

    # 6) Compute text‐similarity metrics
    bleu  = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )["bleu"]
    rouge = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )["rougeL"]
    chrf  = chrf_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["score"]

    return {"bleu": bleu, "rougeL": rouge, "chrf": chrf}

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_f1_qa",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    learning_rate=1e-4,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,

    # <— key flag so Trainer doesn't munge your columns
    remove_unused_columns=False,
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_proc,
    eval_dataset=val_proc,
    data_collator=collate_fn,    # our safe, explicit collator
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu,Rougel,Chrf
1,8.5732,0.476997,0.0,0.051075,5.319943
2,0.4289,0.182211,0.0,0.303763,15.701203
3,0.1979,0.166219,0.0,0.354839,25.609368
4,0.1483,0.148829,0.0,0.416667,25.136468
5,0.1363,0.147047,0.0,0.451613,25.687915
6,0.1193,0.146635,0.0,0.456989,22.064866
7,0.1123,0.153315,0.0,0.454301,25.088888
8,0.0978,0.150151,0.0,0.459677,22.888546
9,0.0979,0.154932,0.0,0.47043,21.990063
10,0.0874,0.159066,0.0,0.47043,21.888758


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=620, training_loss=0.9999293211967715, metrics={'train_runtime': 346.03, 'train_samples_per_second': 3.584, 'train_steps_per_second': 1.792, 'total_flos': 849099117035520.0, 'train_loss': 0.9999293211967715, 'epoch': 10.0})

In [15]:
trainer.save_model("./flan_t5_f1_qa")
tokenizer.save_pretrained("./flan_t5_f1_qa")

('./flan_t5_f1_qa/tokenizer_config.json',
 './flan_t5_f1_qa/special_tokens_map.json',
 './flan_t5_f1_qa/tokenizer.json')