<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [3]:
!pip install datasets
!pip install evaluate
!pip install optuna

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [4]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import optuna
import numpy as np
import unicodedata






In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Constants**

In [6]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/Shareddrives/cebqa_roberta/xlmr"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [7]:
def timestamp(append):
  return datetime.datetime.now().strftime("%Y-%m-%d_%H")+"-"+str(append)

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [8]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.arrow:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

validation.arrow:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

test.arrow:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2763 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5526 [00:00<?, ? examples/s]

# **Prepare Dataset**

## Prepare tokenizer

## Tokenize

In [11]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and "text" in example["context"] and \
           example["context"]["text"] and example["answer"]["text"]


In [12]:
def tokenize_train_function(examples):
    context_text = [context.get("text", "").strip() for context in examples.get("context", [{}])]
    answer_text = examples.get("answer", [{}])
    question_text = [q.strip() for q in examples.get("question", [""])]

    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answer_text[sample_idx]
        start_char = answer["start"]
        end_char = answer["start"] + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [14]:
def normalize_text(examples):
    examples["context"] = [
        {
            "text": unicodedata.normalize("NFKC", ctx["text"]),
            "start": ctx["start"],
            "end": ctx["end"]
        }
        for ctx in examples["context"]
    ]

    examples["answer"] = [
        {
            "text": unicodedata.normalize("NFKC", ans["text"]),
            "start": ans["start"],
            "end": ans["end"]
        }
        for ans in examples["answer"]
    ]

    examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

    return examples


In [15]:
# Clean and tokenize the dataset
tokenized_train_dataset = dataset["train"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True, remove_columns=dataset["train"].column_names)


Filter:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

In [16]:
# Clean and tokenize the dataset
tokenized_validation_dataset = dataset["validation"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True, remove_columns=dataset["train"].column_names)


Filter:   0%|          | 0/2763 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

In [17]:
print(f"train: {len(tokenized_train_dataset)} validate: {len(tokenized_validation_dataset)} ")

train: 19340 validate: 2762 


## Dataset Splitting

In [18]:
train_dataset = tokenized_train_dataset
val_dataset = tokenized_validation_dataset
test_dataset = dataset["train"]

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")

train: 19340 
val: 2762 
test: 19340


# **Model Training**

In [19]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

## Compute Metrics

In [20]:
metric = load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    decoded_preds = [
        tokenizer.decode(input_ids[start : end+1])
        for input_ids, start, end in zip(val_dataset["input_ids"], start_preds, end_preds)
    ]

    decoded_labels = [
        tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True)
        for input_ids, start, end in zip(val_dataset["input_ids"], labels[0], labels[1])
    ]

    results = metric.compute(
        predictions=[{"prediction_text": pred, "id": str(i)} for i, pred in enumerate(decoded_preds)],
        references=[{"answers": {"text": [label], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]
    )

    return {
        "exact_match": results["exact_match"],
        "f1": results["f1"]
    }



Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [21]:
def finetune_xlmr(
    model_path = "xlm-roberta-base",
    checkpoint_path = None,
    learning_rate = 1e-5,
    batch_size = 32,
    num_train_epochs = 5,
    weight_decay = 0.01
    ):
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_path)

    # Early stopping parameters
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.0
    )

    batch_timestamp = timestamp(1)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=3,
        bf16=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train(
        resume_from_checkpoint=last_checkpoint if last_checkpoint else None
        )
    eval_results = trainer.evaluate()

    model.save_pretrained(get_model_directory(batch_timestamp))
    tokenizer.save_pretrained(get_tokenizer_directory(batch_timestamp))

    return eval_results

## Optuna

In [None]:

# Suggest values for hyperparameters in Optuna
# learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
# batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
# num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
# weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)

study = optuna.create_study(direction="maximize")
study.optimize(finetune_xlmr, n_trials=1)

# Get the best trial
best_trial = study.best_trial
# Print best trial number and its hyperparameters
print(f"Best Trial: {best_trial.number}")
print("Best Hyperparameters:", best_trial.params)
print(f"Best F1 Score: {best_trial.value:.4f}")

[I 2025-03-10 13:44:01,276] A new study created in memory with name: no-name-525be363-eb88-442a-80fd-1fb72ab389ac


2025-03-10_13-44-0


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)


Epoch,Training Loss,Validation Loss


## Start Training

In [85]:
finetune_xlmr()

Epoch,Training Loss,Validation Loss,Exact Match,F1
1,1.8525,1.788063,36.314265,58.415242
2,1.431,1.289813,48.58798,71.493604
3,1.1511,1.235409,50.760319,73.991438
4,1.2369,1.200842,52.49819,75.489066
5,0.9782,1.210706,52.389573,75.175364


{'eval_loss': 1.2008416652679443,
 'eval_exact_match': 52.49818971759594,
 'eval_f1': 75.48906626377777,
 'eval_runtime': 6.2361,
 'eval_samples_per_second': 442.906,
 'eval_steps_per_second': 13.951,
 'epoch': 5.0}

# **Evaluating the model**

## Evaluating

In [24]:
def evaluate(model_path, tokenizer_path, dataset):
  print(f"Started evaluation. \nModel: {model_path} \nTokenizer: {tokenizer_path}")
  model = AutoModelForQuestionAnswering.from_pretrained(model_path)
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

  qa_pipeline = pipeline("question-answering", model=model_path, tokenizer=tokenizer_path)

  print("Generated QA Pipeline.")
  print("Starting QA Pipeline batch.")
  qa_dataset = Dataset.from_dict({
    "question": [sample["question"] for sample in dataset],
    "context": [sample["context"]["text"] for sample in dataset]
  })

  model_outputs = qa_pipeline(qa_dataset)

  print(f"Batched QA done. {len(model_outputs)}")
  print(f"Computing metrics.")

  pred = [
      {
          'id': str(i+1),  # Convert ID to string
          'prediction_text': output['answer']
      }
      for i, output in enumerate(model_outputs)
  ]

  ref = [
      {
          'id': str(i+1),  # Convert ID to string
          'answers': {
              'text': row['answer']['text'] if isinstance(row['answer']['text'], list) else [row['answer']['text']],
              'answer_start': row['answer']['start'] if isinstance(row['answer']['start'], list) else [row['answer']['start']]
          }
      }
      for i, row in enumerate(dataset)
  ]

  # Load SQuAD metric
  metric = load("squad")

  # Compute metric
  res = metric.compute(predictions=pred, references=ref)
  print(f"Computing metrics done.")

  return res, pred, ref


In [25]:
eval_results = evaluate(
    model_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-03-12_21-1/model",
    tokenizer_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-03-12_21-1/tokenizer",
    dataset = test_dataset
)

Started evaluation. 
Model: /content/drive/Shareddrives/cebqa_roberta/xlmr/2025-03-12_21-1/model 
Tokenizer: /content/drive/Shareddrives/cebqa_roberta/xlmr/2025-03-12_21-1/tokenizer


Device set to use cuda:0


Generated QA Pipeline.
Starting QA Pipeline batch.




Batched QA done. 19340
Computing metrics.
Computing metrics done.


({'exact_match': 1.8200620475698035, 'f1': 19.487980388956395},
 [{'id': '1', 'prediction_text': 'masayran'},
  {'id': '2', 'prediction_text': 'Buena giingong nagpasiugda og paluwagan'},
  {'id': '3', 'prediction_text': 'maoy naghatag'},
  {'id': '4', 'prediction_text': 'Baylon alyas Reigner, 32, ulitawo,'},
  {'id': '5', 'prediction_text': 'dakbayan sa'},
  {'id': '6', 'prediction_text': 'Tejero, dakbayan sa Sugbo,'},
  {'id': '7', 'prediction_text': 'Alexander, 24,'},
  {'id': '8', 'prediction_text': '‘Federico’,'},
  {'id': '9', 'prediction_text': 'Sept. 14,'},
  {'id': '10', 'prediction_text': 'Heredia nga kadtong mga nag-operate'},
  {'id': '11', 'prediction_text': 'Minggoy'},
  {'id': '12', 'prediction_text': 'Atty. Jay Arvin Cantal,'},
  {'id': '13', 'prediction_text': 'Brgy.'},
  {'id': '14', 'prediction_text': 'napagan lang si Merlinda Enriquez,'},
  {'id': '15',
   'prediction_text': 'og cellphone sulod sa mall sa Barangay Poblacion,'},
  {'id': '16', 'prediction_text': 'Miye