<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta_ceb_body.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa-ceb

# **Libraries**

In [2]:
# !pip install datasets
# !pip install evaluate
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install accelerate transformers datasets evaluate



In [4]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer, RobertaPreTrainedModel, RobertaModel, RobertaTokenizerFast
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import numpy as np
import unicodedata
import torch
import torch.nn as nn

RuntimeError: Detected that PyTorch and torchvision were compiled with different CUDA major versions. PyTorch has CUDA Version=11.8 and torchvision has CUDA Version=12.4. Please reinstall the torchvision that matches your PyTorch install.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Constants**

In [3]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/roberta_ceb"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "logs"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"
MODEL_NAME = "dost-asti/RoBERTa-ceb-cased"

# **Utils**

In [4]:
def timestamp():
  return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [7]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **Prepare Dataset**

## Prepare tokenizer

In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

NameError: name 'RobertaTokenizerFast' is not defined

In [11]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and "text" in example["context"] and \
           example["context"]["text"] and example["answer"]["text"]

def decode_error(example):
  input_ids = example["input_ids"]
  start_positions = example["start_positions"]
  end_positions = example["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions+1]
  decoded_text = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
  return decoded_text.strip() == example["answer"]["text"]

In [13]:
def tokenize_train_function(examples):
    context_text = [context.get("text", "") for context in examples.get("context", [{}])]
    context_start_list = [context.get("start", "") for context in examples.get("context", [{}])]
    article_text = [article for article in examples.get("article_body", [""])]
    answer_text = examples.get("answer", [{}])
    question_text = [q for q in examples.get("question", [""])]

    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        article_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=False,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    # sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        # sample_idx = sample_map[i]
        answer = answer_text[i]
        start_char = context_start_list[i] + answer["start"]
        end_char = start_char + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [14]:
def normalize_text(examples):
    examples["context"] = [
        {
            "text": unicodedata.normalize("NFKC", ctx["text"]),
            "start": ctx["start"],
            "end": ctx["end"]
        }
        for ctx in examples["context"]
    ]

    examples["answer"] = [
        {
            "text": unicodedata.normalize("NFKC", ans["text"]),
            "start": ans["start"],
            "end": ans["end"]
        }
        for ans in examples["answer"]
    ]

    examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

    return examples


In [15]:
# Clean and tokenize the dataset
tokenized_train_dataset = dataset["train"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

NameError: name 'unicodedata' is not defined

In [13]:
# Clean and tokenize the dataset
tokenized_validation_dataset = dataset["validation"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Filter:   0%|          | 0/2763 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2762 [00:00<?, ? examples/s]

In [14]:
# Clean and tokenize the dataset
tokenized_test_dataset = dataset["test"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Filter:   0%|          | 0/5526 [00:00<?, ? examples/s]

Map:   0%|          | 0/5526 [00:00<?, ? examples/s]

Map:   0%|          | 0/5526 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5526 [00:00<?, ? examples/s]

## Dataset Splitting

In [15]:
train_dataset = tokenized_train_dataset
val_dataset = tokenized_validation_dataset
test_dataset = tokenized_test_dataset

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")

train: 15735 
val: 2258 
test: 4516


In [151]:
train_dataset[0]

{'id': '00778-015',
 'article_id': 778,
 'article_title': 'Rugby boys sa PUJ, susihon',
 'article_body': 'Nakig-alayon karon ang Abellana Police Station sa netizen nga ni-upload sa video diin nakita ang grupo sa mga lalaking menor nga ningsakay sa publikong sakyanan nga dunay rota nga 17B paingon sa Brgy. Sto. Gertrudes, dakbayan sa Sugbo samtang nagsige’g simhot sa cellophane nga dunay sulod nga solvent. Matod ni Police Major Emeniano Don Apechi Makring, hepe sa Abellana Police Station, nga gusto nilang masayran kon kanus-a o ang petsa sa pagkuha sa video sa rugby boys, sanglit sa post niini sa Facebook account, naghisgot kini’g skywalk dapit sa Robinsons Place sa Osmeña Blvd. apan taudtaod na kining gi-demolish. Iyang gipasabot nga matag adlaw ang ilang police station way hunong sa pag-rescue sa mga batang libud-suroy ug gipang turnover sa barangay diin nahisakop ang dapit. “Actually kining atong concern regarding sa mga batan-on nga gagamit og kining plastic bag nga gibutang ang ill

In [145]:
def test_decode(dataset, idx):
  input_ids = dataset[idx]["input_ids"]
  start_positions = dataset[idx]["start_positions"]
  end_positions = dataset[idx]["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions + 1]
  decoded_text = tokenizer.decode(
    predict_answer_tokens,
    skip_special_tokens=True,
  )

  return decoded_text.strip(), dataset[idx]["answer"]["text"]


idx = 2
print(test_decode(train_dataset, idx))
print(test_decode(val_dataset, idx))

error_id = []
for idx, train in enumerate(train_dataset.select(range(10))):
    decoded, orig = test_decode(train_dataset, idx)
    if decoded != orig:
        print("d: " + decoded)
        print("o: " + orig)
        print("\n")
        # print(f"idx: {train_dataset[idx]['id']}")
        # print(val_dataset[idx]['id'])
        # print(val_dataset[idx]['start_positions'], val_dataset[idx]['end_positions'])
        # if not (train_dataset[idx]['start_positions'] == 0 and train_dataset[idx]['end_positions']== 0):
        error_id.append(train_dataset[idx]['id'])

    # if idx == 100:
    #   break
print(len(error_id))

('naghatag sa compound og dugang proteksyon', 'naghatag sa compound og dugang proteksyon')
('si Sylvian Barnachea', 'si Sylvian Barnachea')
d: nakatala lang sila og usa ka kaso sa
o: nakatala lang sila og usa ka kaso.


d: ) ngadto sa hotel nga giingong
o: bagyong Ferdie (Bebinca)


d: 
o: Luzon


3


In [96]:
for i, item in enumerate(tokenized_validation_dataset.select(range(5))):  # Check first 3 samples
    tokens = tokenizer.convert_ids_to_tokens(item["input_ids"])
    decoded, orig = test_decode(tokenized_validation_dataset.select(range(5)), i)
    print(decoded)
    print(orig)
    print(f"Tokens [{item['id']}]: {tokens}")


 Jonneper Padil
Jonneper Padil
Tokens [01543-003]: ['<s>', 'Kin', 'sa', 'Ġang', 'Ġnagda', 'og', 'Ġnga', 'Ġmusic', 'Ġvideo', 'Ġsa', 'ĠH', 'ad', 'astr', 'agram', 'Ġmovie', 'Ġcontest', '?', '</s>', '</s>', 'ANG', 'Ġsiyudad', 'Ġsa', 'ĠH', 'ad', 'ano', ',', 'ĠKan', 'agawa', 'ĠPre', 'fect', 'ure', ',', 'ĠJapan', 'Ġmip', 'ahigayon', 'Ġsa', 'Ġilang', 'Ġ2', 'nd', 'ĠH', 'ad', 'astr', 'agram', 'Ġmovie', 'Ġcontest', 'Ġniadtong', 'ĠNob', 'i', 'yembre', 'Ġ6', ',', 'Ġ2023', '.', 'ĠMo', 'abot', 'Ġngadto', 'Ġsa', 'Ġ19', '5', 'Ġka', 'Ġentry', 'Ġnga', 'Ġgis', 'almot', 'Ġdiin', 'Ġang', 'Ġ', '"', 'B', 'ath', 'room', 'ĠOr', 'ch', 'estra', 'ĠIn', 'str', 'um', 'ental', '"', 'Ġmusic', 'Ġvideo', 'Ġsa', 'ĠJapan', 'ese', 'Ġmusic', 'ian', '-', 'fil', 'm', 'Ġmak', 'er', 'ĠJ', 'onn', 'ep', 'er', 'ĠPad', 'il', ',', 'Ġa', '.', 'Ġk', '.', 'Ġa', '.', 'Ġiw', 'apt', ',', 'Ġang', 'Ġgideklarar', 'Ġnga', 'Ġgrand', 'Ġpr', 'ix', 'Ġchampion', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

# **Model Training**

## Modifying the model

In [16]:
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = RobertaModel(config)  # Load BERT encoder
        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # Output layer (2 values: start and end logits)

        self.init_weights()  # Initialize weights

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, start_positions=None, end_positions=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state  # Get contextual embeddings

        logits = self.qa_outputs(sequence_output)  # Pass embeddings through linear layer
        start_logits, end_logits = logits.split(1, dim=-1)  # Split into start and end logits
        start_logits = start_logits.squeeze(-1)  # Remove extra dimension
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2  # Average the loss

        return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits}


## Compute Metrics

In [17]:
metric = load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    decoded_preds = [
        tokenizer.decode(input_ids[start : end+1])
        for input_ids, start, end in zip(val_dataset["input_ids"], start_preds, end_preds)
    ]

    decoded_labels = [
        tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True)
        for input_ids, start, end in zip(val_dataset["input_ids"], labels[0], labels[1])
    ]
    pred =[{"prediction_text": pred, "id": str(i)} for i, pred in enumerate(decoded_preds)],
    ref =[{"answers": {"text": [label], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]

    results = metric.compute(
        predictions=pred,
        references=ref
    )

    sentence_match_scores = [
        p['prediction_text'] in r['answers']['text'][0] for p, r in zip(pred, ref)
    ]


    # Compute average sentence match score
    avg_sentence_match = np.mean(sentence_match_scores)

    res = {
        "exact_match": results["exact_match"],
        "f1": results["f1"],
        "sentence_match": float(avg_sentence_match ) * 100
    }

    return res



Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

## Finetuning

In [8]:
def finetune_xlmr(
    model_path = MODEL_NAME,
    checkpoint_path = None,
    learning_rate = 1e-5,
    batch_size = 16,
    num_train_epochs = 10,
    weight_decay = 0.01
    ):
    model = RobertaForQuestionAnswering.from_pretrained(model_path)

    # Early stopping parameters
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.1
    )

    batch_timestamp = timestamp()
    print(batch_timestamp)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=2,
        # bf16=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train(
        resume_from_checkpoint=last_checkpoint if last_checkpoint else None
        )
    eval_results = trainer.evaluate()

    res_model_path = get_model_directory(batch_timestamp)
    res_tokenized_path = get_tokenizer_directory(batch_timestamp)

    model.save_pretrained(res_model_path)
    tokenizer.save_pretrained(res_tokenized_path)

    return model, tokenizer, eval_results, res_model_path, res_tokenized_path

## Start Training

In [9]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")

CUDA is available! Training on GPU.


In [10]:
model, tokenizer, eval_results, res_model_path, res_tokenized_path = finetune_xlmr(
    batch_size = 16,
    num_train_epochs = 10
)
print(eval_results)
print(res_model_path)
print(res_tokenized_path)

NameError: name 'RobertaForQuestionAnswering' is not defined

In [20]:
for sp, ep in zip(start_positions, end_positions):
    if sp < 0 or ep < 0 or sp >= len(inputs["input_ids"][0]) or ep >= len(inputs["input_ids"][0]):
        print(f"❌ Invalid start/end position detected: start={sp}, end={ep}, max_len={len(inputs['input_ids'][0])}")


NameError: name 'start_positions' is not defined

# **Evaluating the model**

## Normalizing predicted answer

In [None]:
def normalize_text(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

## Evaluating

In [None]:
def evaluate(model, tokenizer, dataset, model_outputs=None):
  print(f"Started evaluation.")

  if model_outputs is None:
      qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

      print("Generated QA Pipeline.")
      print("Starting QA Pipeline batch.")
      qa_dataset = Dataset.from_dict({
        "question": [sample["question"] for sample in dataset],
        "context": [sample["context"]["text"] for sample in dataset]
      })

      model_outputs = qa_pipeline(qa_dataset)

  print(f"Batched QA done. {len(model_outputs)}")
  print(f"Computing metrics.")

  pred = [
      {
          'id': str(i+1),  # Convert ID to string
          'prediction_text': normalize_text(output['answer'])
      }
      for i, output in enumerate(model_outputs)
  ]

  ref = [
      {
          'id': str(i+1),  # Convert ID to string
          'answers': {
              'text': normalize_text(row['answer']['text']) if isinstance(row['answer']['text'], list) else ([normalize_text(row['answer']['text'])]),
              'answer_start': row['answer']['start'] if isinstance(row['answer']['start'], list) else [row['answer']['start']]
          }
      }
      for i, row in enumerate(dataset)
  ]

  # Load SQuAD metric
  metric = load("squad")

  # Compute metric
  res = metric.compute(predictions=pred, references=ref)
  print(f"Computing metrics done.")

  # Sentence match
  # int(pred_normalized in truth_normalized or truth_normalized in pred_normalized)

  return res, pred, ref, model_outputs


In [None]:
useExisting = False
# if useExisting or (model == None and tokenizer == None):
#     print("Loading Previous")
#     model_path = DRIVE_ROOT+"/2025-03-13_03-58/model"
#     tokenizer_path = DRIVE_ROOT+"/2025-03-13_03-58/tokenizer"
#     model = model = BertForQuestionAnswering.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

eval_results = evaluate(
    model = model,
    tokenizer = tokenizer,
    dataset = test_dataset
)

Device set to use cuda:0


Started evaluation.
Generated QA Pipeline.
Starting QA Pipeline batch.
Batched QA done. 5526
Computing metrics.
Computing metrics done.


## **Reminder**

1. Save Data to [spreadsheet](https://docs.google.com/spreadsheets/d/1Xc3-6yVMMLoXCqId-YQFyOvgQfemIQ8P2uLYC6Jszeo/edit?gid=0#gid=0)
2. Save WANDB chart to [Google Drive](https://drive.google.com/drive/u/0/folders/1inDiei-xuRlofFPJmVj8OS6pmyQdGQ2z) of the corresponding model.
3. Change runtime after.

In [None]:
res, pred, ref, model_outputs = eval_results

display(pd.DataFrame(res, index=[0]))




Unnamed: 0,exact_match,f1
0,22.366992,49.084805


In [None]:
pred_answers = [ans["prediction_text"] for ans in pred]
ref_answers = [ans["answers"]["text"][0] for ans in ref]

df = pd.DataFrame({
    "Predicted Answer": pred_answers,
    "Reference Answer": ref_answers
})


display(df)

Unnamed: 0,Predicted Answer,Reference Answer
0,cebu sa biyernes disyembre 1 niasoy,si imee niu
1,wa na maabti,dali nga nakasibat
2,south road properties srp,south road properties srp new business and hou...
3,usa ka juling pamugas nga mikanit ngadto sa la...,nagsugod sa balay sa usa ka juling pamugas nga...
4,molurang ang dautang panahon,kon molurang ang dautang panahon
...,...,...
5521,laing taga motorbanca nga ningsalmot ra usab s...,ang mga sakay gidali sa pagtabang sa laing tag...
5522,usa ka drug personality,ginadiling drugas
5523,duha so far nga out of town contingents,canlaon city sa negros oriental ug lungsod sa ...
5524,alas 2 sa kaadlawon sa lunes agusto 26 2024,udto sa domingo agusto 25 2024 hangtod nga nat...
