In [1]:
from datasets import load_dataset, Dataset

def flatten_squad(path):
    raw = load_dataset("json", data_files=path, field="data")["train"]
    rows = []
    for item in raw:
        for p in item["paragraphs"]:
            ctx = p["context"]
            for qa in p["qas"]:
                rows.append({
                    "context":      ctx,
                    "question":     qa["question"],
                    "answer_start": qa["answers"][0]["answer_start"],
                    "answer_text":  qa["answers"][0]["text"],
                })
    return Dataset.from_list(rows)

train_ds = flatten_squad("f1_gp_qa_new.json")
val_ds   = flatten_squad("f1_gp_qa_val.json")

In [2]:
from transformers import (
    RobertaTokenizerFast,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def prepare_features(examples):
    # 1) Tokenize without overflowing
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",     # only chop the context
        max_length=512,               # full capacity, no overflow
        padding="max_length",
        return_offsets_mapping=True   # we need this to align spans
    )

    starts, ends = [], []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        start_char = examples["answer_start"][i]
        end_char   = start_char + len(examples["answer_text"][i])

        # 2) Find token_start: first token whose span covers start_char
        token_start = 0
        while (
            token_start < len(offsets) 
            and offsets[token_start][1] <= start_char
        ):
            token_start += 1

        # 3) Find token_end: last token whose span covers end_char
        token_end = len(offsets) - 1
        while (
            token_end >= 0 
            and offsets[token_end][0] >= end_char
        ):
            token_end -= 1

        # 4) Clamp into valid range
        token_start = min(max(token_start, 0), len(offsets) - 1)
        token_end   = min(max(token_end,   0), len(offsets) - 1)

        starts.append(token_start)
        ends.append(token_end)

    tokenized["start_positions"] = starts
    tokenized["end_positions"]   = ends
    tokenized.pop("offset_mapping")

    return tokenized
train_tok = train_ds.map(
    prepare_features, batched=True, remove_columns=train_ds.column_names
)
val_tok = val_ds.map(
    prepare_features, batched=True, remove_columns=val_ds.column_names
)
print(f"▶️ Train examples: {len(train_tok)}")
print(f"▶️ Val   examples: {len(val_tok)}")

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

▶️ Train examples: 124
▶️ Val   examples: 124


In [16]:
import evaluate
from transformers import pipeline

# 1) Load metrics
bleu_metric  = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
chrf_metric  = evaluate.load("chrf")

qa_pipe = pipeline(
    "question-answering",
    model=trainer.model,
    tokenizer=trainer.tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

def compute_metrics(_):
    preds, refs = [], []
    for ex in val_ds:
        out  = qa_pipe({"question": ex["question"], "context": ex["context"]})
        pred = out["answer"].strip()
        ref  = ex["answer_text"].strip()
        preds.append(pred)
        refs.append(ref)

    # BLEU: predictions=list[str], references=list[list[str]]
    bleu = bleu_metric.compute(
        predictions=preds,
        references=[[r] for r in refs]
    )["bleu"]

    # ROUGE: raw strings → returns dict of floats
    rouge_scores = rouge_metric.compute(
        predictions=preds,
        references=refs,
        use_stemmer=True
    )
    rouge_l = rouge_scores["rougeL"]  # already a float

    # chrF: raw strings
    chrf = chrf_metric.compute(
        predictions=preds,
        references=refs
    )["score"]

    return {
        "bleu":   bleu,
        "rougeL": rouge_l,
        "chrf":   chrf,
    }


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


In [19]:
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import torch

model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
training_args = TrainingArguments(
    output_dir="./roberta-qa-model",
    overwrite_output_dir=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",

    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,
    learning_rate=2e-4,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Rougel,Chrf
1,3.3798,1.884452,0.0,0.047485,5.222055
2,2.052,2.28696,0.0,0.047485,5.222055
3,1.8952,1.853586,0.0,0.047485,5.222055
4,3.8911,2.158793,0.0,0.047485,5.222055
5,2.0936,2.586417,0.0,0.047485,5.222055




TrainOutput(global_step=155, training_loss=2.6623558290543095, metrics={'train_runtime': 75.3409, 'train_samples_per_second': 49.376, 'train_steps_per_second': 12.344, 'total_flos': 162003989176320.0, 'train_loss': 2.6623558290543095, 'epoch': 5.0})

In [23]:
import pandas as pd

# 1) Inspect what metrics actually landed in your log history
logs = trainer.state.log_history
df   = pd.DataFrame(logs)
print(df.columns.tolist())  # look for 'eval_bleu', 'eval_rougeL', 'eval_chrf'

# 2) Keep only epoch‐end rows
df_epoch = df[df["epoch"].notnull()]

# 3) Select the correctly-named eval metrics
metrics_df = df_epoch[[
    "epoch",
    "loss",          # training loss
    "eval_loss",     # validation loss
    "eval_bleu",     # BLEU score
    "eval_rougeL",   # ROUGE-L score
    "eval_chrf"      # chrF score
]].rename(columns={
    "loss":        "train_loss",
    "eval_loss":   "val_loss",
    "eval_bleu":   "val_bleu",
    "eval_rougeL": "val_rougeL",
    "eval_chrf":   "val_chrf",
})

# 4) Save & show
metrics_df.to_csv("epoch_textsim_metrics.csv", index=False)
metrics_df

['loss', 'grad_norm', 'learning_rate', 'epoch', 'step', 'eval_loss', 'eval_bleu', 'eval_rougeL', 'eval_chrf', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'train_runtime', 'train_samples_per_second', 'train_steps_per_second', 'total_flos', 'train_loss']


Unnamed: 0,epoch,train_loss,val_loss,val_bleu,val_rougeL,val_chrf
0,1.0,3.3798,,,,
1,1.0,,1.884452,0.0,0.047485,5.222055
2,2.0,2.052,,,,
3,2.0,,2.28696,0.0,0.047485,5.222055
4,3.0,1.8952,,,,
5,3.0,,1.853586,0.0,0.047485,5.222055
6,4.0,3.8911,,,,
7,4.0,,2.158793,0.0,0.047485,5.222055
8,5.0,2.0936,,,,
9,5.0,,2.586417,0.0,0.047485,5.222055
