In [1]:
import numpy as np
import pandas as pd
import evaluate

from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    pipeline
)

In [2]:
model_dir = "./flan_t5_f1_qa"
tokenizer = T5TokenizerFast.from_pretrained(model_dir)
model     = T5ForConditionalGeneration.from_pretrained(model_dir)

# Build a simple generation pipeline
qa_gen = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0  # change to -1 if no GPU
)

Device set to use cuda:0


In [4]:
raw_test = load_dataset(
    "json",
    data_files="f1_test_new.json",
    field="data"
)["train"]

test_rows = []
for item in raw_test:
    for para in item["paragraphs"]:
        ctx = para["context"]
        for qa in para["qas"]:
            # skip impossible if you have them
            if not qa["answers"]:
                continue
            test_rows.append({
                "question": qa["question"],
                "context":  ctx,
                "answer":   qa["answers"][0]["text"]
            })
test_ds = Dataset.from_list(test_rows)

In [5]:
preds, refs = [], []
for ex in test_ds:
    prompt = f"question: {ex['question']}  context: {ex['context']}"
    out = qa_gen(prompt, max_length=32, do_sample=False)[0]["generated_text"].strip()
    preds.append(out)
    refs.append(ex["answer"].strip())

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


In [6]:
em = sum(p == r for p, r in zip(preds, refs)) / len(refs)

# Load text‐similarity metrics
bleu  = evaluate.load("bleu").compute(predictions=preds, references=[[r] for r in refs])["bleu"]
rouge = evaluate.load("rouge").compute(predictions=preds, references=refs, use_stemmer=True)["rougeL"]
chrf  = evaluate.load("chrf").compute(predictions=preds, references=refs)["score"]

In [7]:
results = {
    "exact_match": em,
    "bleu":        bleu,
    "rougeL":      rouge,
    "chrf":        chrf
}
print("Test set performance:")
for k, v in results.items():
    print(f" • {k:12s}: {v:.3f}")

Test set performance:
 • exact_match : 0.468
 • bleu        : 0.000
 • rougeL      : 0.468
 • chrf        : 21.818


In [8]:
print("\nSample predictions:")
for i in range(5):
    print(f"Q: {test_ds[i]['question']}")
    print(f"GT: {refs[i]!r}")
    print(f"PR: {preds[i]!r}")
    print("---")


Sample predictions:
Q: Who won the Italian Grand Prix?
GT: 'Max Verstappen'
PR: 'Max Verstappen'
---
Q: Which driver finished in position 1?
GT: 'Max Verstappen'
PR: 'Sergio Perez'
---
Q: Which driver finished in position 2?
GT: 'Sergio Perez'
PR: 'Max Verstappen'
---
Q: Which driver finished in position 3?
GT: 'Carlos Sainz'
PR: 'Charles Leclerc'
---
Q: Which driver finished in position 4?
GT: 'Charles Leclerc'
PR: 'George Russell'
---
