In [1]:
from datasets import load_dataset, Dataset

def flatten_squad(path):
    raw = load_dataset("json", data_files=path, field="data")["train"]
    rows = []
    for item in raw:
        for p in item["paragraphs"]:
            ctx = p["context"]
            for qa in p["qas"]:
                rows.append({
                    "context":      ctx,
                    "question":     qa["question"],
                    "answer_start": qa["answers"][0]["answer_start"],
                    "answer_text":  qa["answers"][0]["text"],
                })
    return Dataset.from_list(rows)

train_ds = flatten_squad("f1_train_new.json")
val_ds   = flatten_squad("f1_test_new.json")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
from transformers import (
    RobertaTokenizerFast,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def prepare_features(examples):
    # 1) Tokenize without overflowing
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",     # only chop the context
        max_length=512,               # full capacity, no overflow
        padding="max_length",
        return_offsets_mapping=True   # we need this to align spans
    )

    starts, ends = [], []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        start_char = examples["answer_start"][i]
        end_char   = start_char + len(examples["answer_text"][i])

        # 2) Find token_start: first token whose span covers start_char
        token_start = 0
        while (
            token_start < len(offsets)
            and offsets[token_start][1] <= start_char
        ):
            token_start += 1

        # 3) Find token_end: last token whose span covers end_char
        token_end = len(offsets) - 1
        while (
            token_end >= 0
            and offsets[token_end][0] >= end_char
        ):
            token_end -= 1

        # 4) Clamp into valid range
        token_start = min(max(token_start, 0), len(offsets) - 1)
        token_end   = min(max(token_end,   0), len(offsets) - 1)

        starts.append(token_start)
        ends.append(token_end)

    tokenized["start_positions"] = starts
    tokenized["end_positions"]   = ends
    tokenized.pop("offset_mapping")

    return tokenized
train_tok = train_ds.map(
    prepare_features, batched=True, remove_columns=train_ds.column_names
)
val_tok = val_ds.map(
    prepare_features, batched=True, remove_columns=val_ds.column_names
)
print(f"▶️ Train examples: {len(train_tok)}")
print(f"▶️ Val   examples: {len(val_tok)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

▶️ Train examples: 124
▶️ Val   examples: 124


In [3]:
import evaluate
from transformers import pipeline

# 1) Load metrics
bleu_metric  = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
chrf_metric  = evaluate.load("chrf")


def compute_metrics(_):
    preds, refs = [], []
    for ex in val_ds:
        out  = qa_pipe({"question": ex["question"], "context": ex["context"]})
        pred = out["answer"].strip()
        ref  = ex["answer_text"].strip()
        preds.append(pred)
        refs.append(ref)

    # BLEU: predictions=list[str], references=list[list[str]]
    bleu = bleu_metric.compute(
        predictions=preds,
        references=[[r] for r in refs]
    )["bleu"]

    # ROUGE: raw strings → returns dict of floats
    rouge_scores = rouge_metric.compute(
        predictions=preds,
        references=refs,
        use_stemmer=True
    )
    rouge_l = rouge_scores["rougeL"]  # already a float

    # chrF: raw strings
    chrf = chrf_metric.compute(
        predictions=preds,
        references=refs
    )["score"]

    return {
        "bleu":   bleu,
        "rougeL": rouge_l,
        "chrf":   chrf,
    }


In [4]:
# !pip install sacrebleu

In [21]:
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import torch

model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
training_args = TrainingArguments(
    output_dir="./roberta-qa-model",
    overwrite_output_dir=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",

    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=30,
    learning_rate=1e-4,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
qa_pipe = pipeline(
    "question-answering",
    model=trainer.model,
    tokenizer=trainer.tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Rougel,Chrf
1,2.185,1.951668,0.0,0.105063,10.527973
2,2.1497,6.030652,0.0,0.039891,7.203433
3,2.5186,3.175674,0.0,0.005376,1.000719




TrainOutput(global_step=186, training_loss=2.2844357234175487, metrics={'train_runtime': 160.4195, 'train_samples_per_second': 23.189, 'train_steps_per_second': 11.595, 'total_flos': 97202393505792.0, 'train_loss': 2.2844357234175487, 'epoch': 3.0})

In [24]:
import pandas as pd

# 1) Inspect what metrics actually landed in your log history
logs = trainer.state.log_history
df   = pd.DataFrame(logs)
print(df.columns.tolist())  # look for 'eval_bleu', 'eval_rougeL', 'eval_chrf'

# 2) Keep only epoch‐end rows
df_epoch = df[df["epoch"].notnull()]

# 3) Select the correctly-named eval metrics
metrics_df = df_epoch[[
    "epoch",
    "loss",          # training loss
    "eval_loss",     # validation loss
    "eval_bleu",     # BLEU score
    "eval_rougeL",   # ROUGE-L score
    "eval_chrf"      # chrF score
]].rename(columns={
    "loss":        "train_loss",
    "eval_loss":   "val_loss",
    "eval_bleu":   "val_bleu",
    "eval_rougeL": "val_rougeL",
    "eval_chrf":   "val_chrf",
})

# 4) Save & show
metrics_df.to_csv("epoch_textsim_metrics.csv", index=False)
metrics_df

['loss', 'grad_norm', 'learning_rate', 'epoch', 'step', 'eval_loss', 'eval_bleu', 'eval_rougeL', 'eval_chrf', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'train_runtime', 'train_samples_per_second', 'train_steps_per_second', 'total_flos', 'train_loss']


Unnamed: 0,epoch,train_loss,val_loss,val_bleu,val_rougeL,val_chrf
0,1.0,2.185,,,,
1,1.0,,1.951668,0.0,0.105063,10.527973
2,2.0,2.1497,,,,
3,2.0,,6.030652,0.0,0.039891,7.203433
4,3.0,2.5186,,,,
5,3.0,,3.175674,0.0,0.005376,1.000719
6,3.0,,,,,


In [25]:
model_dir = "roberta_f1_qa_model"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('roberta_f1_qa_model/tokenizer_config.json',
 'roberta_f1_qa_model/special_tokens_map.json',
 'roberta_f1_qa_model/vocab.json',
 'roberta_f1_qa_model/merges.txt',
 'roberta_f1_qa_model/added_tokens.json',
 'roberta_f1_qa_model/tokenizer.json')

In [26]:
# prompt: I want to download the folders in the runtime

from google.colab import files
import zipfile

def zip_folder(folder_path, zip_filename):
  """Zips a folder and its contents."""
  with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(folder_path):
      for file in files:
        zipf.write(os.path.join(root, file),
                   os.path.relpath(os.path.join(root, file),
                                   os.path.join(folder_path, '..')))
  return zip_filename

import os
zip_filename = zip_folder("roberta_f1_qa_model", "roberta_f1_qa_model.zip")

# Download the zip file
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Device set to use cuda:0


{'score': 0.0005066894227638841, 'start': 1361, 'end': 1370, 'answer': 'Alexander'}
