<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta_ceb_body.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa-ceb (`dost-asti/RoBERTa-ceb-cased`)

# **Libraries**

In [None]:
# Install the Hugging Face `datasets` and `evaluate` packages that the import cell below relies on.
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer, RobertaPreTrainedModel, RobertaModel, RobertaTokenizerFast
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import numpy as np
import unicodedata
import torch
import torch.nn as nn

In [None]:
# Mount Google Drive at /content/drive; DRIVE_ROOT (see Constants) points inside this mount.
# `drive` is already imported in the imports cell, so this repeated import is harmless.
from google.colab import drive
drive.mount('/content/drive')

# **Constants**

In [None]:
# Dataset id passed to load_dataset().
CEBQA_DATASET = "jhoannarica/cebquad"
# Google Drive folder under which every run gets its own <timestamp> subfolder.
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/roberta_ceb"
# Per-run subfolder names; the get_*_directory helpers join them onto DRIVE_ROOT/<timestamp>.
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "logs"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"
# Checkpoint id used by both the tokenizer's and the model's from_pretrained() calls.
MODEL_NAME = "dost-asti/RoBERTa-ceb-cased"

# **Utils**

In [None]:
def timestamp():
  """Return the current local time as a ``YYYY-MM-DD_HH-MM`` string (used to name a run folder)."""
  now = datetime.datetime.now()
  return format(now, "%Y-%m-%d_%H-%M")

def get_output_directory(batch_timestamp):
  """Return the path of the training-output folder for the run named ``batch_timestamp``."""
  run_root = f"{DRIVE_ROOT}/{batch_timestamp}"
  return f"{run_root}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  """Return the path of the logging folder for the run named ``batch_timestamp``."""
  run_root = f"{DRIVE_ROOT}/{batch_timestamp}"
  return f"{run_root}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  """Return the path where the fine-tuned model of run ``batch_timestamp`` is saved."""
  run_root = f"{DRIVE_ROOT}/{batch_timestamp}"
  return f"{run_root}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  """Return the path where the tokenizer of run ``batch_timestamp`` is saved."""
  run_root = f"{DRIVE_ROOT}/{batch_timestamp}"
  return f"{run_root}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [None]:
# Load the dataset named by CEBQA_DATASET; its "train", "validation" and "test" splits are used below.
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.arrow:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

validation.arrow:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

test.arrow:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2763 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5526 [00:00<?, ? examples/s]

# **Prepare Dataset**

## Prepare tokenizer

In [None]:
# Fast tokenizer matching the pretrained checkpoint; the tokenization step below requests
# offset mappings and overflow mappings from it.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/569k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/249k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [None]:
def filter_incomplete_examples(example):
    """Tell whether a single example is complete enough to train or evaluate on.

    An example is kept only when it has a non-empty ``question``, a non-empty
    ``context["text"]`` and a non-empty ``answer["text"]``.

    Args:
        example: Mapping for one dataset row.

    Returns:
        bool: ``True`` to keep the row, ``False`` to drop it. A strict ``bool``
        is returned (not the last truthy/falsy operand), and a missing or
        ``None`` ``context``/``answer`` yields ``False`` instead of raising.
    """
    context = example.get("context") or {}
    answer = example.get("answer") or {}
    return bool(
        example.get("question")
        and context.get("text")
        and answer.get("text")
    )


In [None]:
def tokenize_train_function(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Questions and contexts are stripped and tokenized as (question, context)
    pairs, padded to 512 tokens. Only the context is truncated; a context that
    does not fit is split into overlapping windows (stride 128), so one example
    can produce several features.

    For every feature, ``start_positions`` / ``end_positions`` hold the token
    indices of the answer span. They are ``0`` / ``0`` when the answer is not
    fully contained in that feature's context window (or the feature has no
    context tokens at all).

    Args:
        examples: Batch mapping with ``question`` (list of str), ``context``
            (list of dicts with ``text``) and ``answer`` (list of dicts with
            ``text`` and character offset ``start``).

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added. When overflow produced more features than input rows, every
        other input column is repeated per feature so that all returned
        columns have the same length.
    """
    # NOTE(review): the context is stripped before tokenizing while answer["start"] is used
    # unchanged, so this assumes the stored offsets are relative to the stripped text - confirm.
    context_text = [context.get("text", "").strip() for context in examples.get("context", [{}])]
    answer_text = examples.get("answer", [{}])
    question_text = [q.strip() for q in examples.get("question", [""])]

    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answer_text[sample_idx]
        start_char = answer["start"]
        end_char = answer["start"] + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)
        seq_len = len(sequence_ids)

        # Find the start and end of the context (tokens whose sequence id is 1)
        idx = 0
        while idx < seq_len and sequence_ids[idx] != 1:
            idx += 1
        if idx == seq_len:
            # No context tokens in this feature (e.g. whitespace-only context): nothing to label.
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = idx
        while idx < seq_len and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Every example yields at least one feature, so a different count means some context
    # overflowed into extra windows. Repeat the original columns per feature in that case;
    # otherwise the returned batch would mix columns of different lengths.
    if len(sample_map) != len(question_text):
        for column in examples.keys():
            if column not in inputs:
                values = examples[column]
                inputs[column] = [values[sample_idx] for sample_idx in sample_map]

    return inputs


In [None]:
def normalize_text(examples):
    """Apply Unicode NFKC normalization to the text fields of a batch, in place.

    ``context`` and ``answer`` entries are rebuilt as dicts holding exactly
    ``text`` (normalized), ``start`` and ``end``; ``question`` strings are
    normalized directly. The same ``examples`` mapping is returned.

    NOTE(review): ``start``/``end`` are copied through unchanged although NFKC
    can change a string's length - verify the offsets still line up.
    """
    def _nfkc(value):
        return unicodedata.normalize("NFKC", value)

    def _normalized_span(span):
        # Keep only text/start/end, in that order.
        return {"text": _nfkc(span["text"]), "start": span["start"], "end": span["end"]}

    for column in ("context", "answer"):
        examples[column] = [_normalized_span(span) for span in examples[column]]

    examples["question"] = list(map(_nfkc, examples["question"]))

    return examples


In [None]:
# Training split: drop incomplete rows, NFKC-normalize the text, then tokenize and label.
tokenized_train_dataset = (
    dataset["train"]
    .filter(filter_incomplete_examples)
    .map(normalize_text, batched=True)
    .map(tokenize_train_function, batched=True)
)


Filter:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

In [None]:
# Validation split: same cleaning and tokenization steps as the training split.
tokenized_validation_dataset = (
    dataset["validation"]
    .filter(filter_incomplete_examples)
    .map(normalize_text, batched=True)
    .map(tokenize_train_function, batched=True)
)


Filter:   0%|          | 0/2763 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

## Dataset Splitting

In [None]:
# The loaded dataset already provides train/validation/test splits, so these are plain aliases.
train_dataset = tokenized_train_dataset
val_dataset = tokenized_validation_dataset
# The test split stays untokenized: evaluate() feeds its raw question/context text to a QA pipeline.
test_dataset = dataset["test"]

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")

train: 19340 
val: 2762 
test: 5526


In [None]:
def test_decode(dataset, idx):
  """Decode the labelled answer span of one tokenized row, for a visual sanity check.

  Returns a tuple ``(decoded_span, reference_answer_text)`` where the first item is the
  text decoded from ``input_ids[start_positions : end_positions + 1]`` of row ``idx``.
  """
  first = dataset[idx]["start_positions"]
  last = dataset[idx]["end_positions"]
  span_ids = dataset[idx]["input_ids"][first : last + 1]
  decoded_span = tokenizer.decode(span_ids)
  reference = dataset[idx]["answer"]["text"]
  return decoded_span, reference

# Sanity check: for row `idx`, the decoded label span should match the reference answer text.
idx = 1
print(test_decode(train_dataset, idx))
print(test_decode(val_dataset, idx))

(' CDL', 'CDL')
('Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan.', 'Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan.')


# **Model Training**

## Modifying the model

In [None]:
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
    """RoBERTa encoder with a linear span head for extractive question answering.

    The encoder is stored as ``self.roberta`` so its parameter names
    (``roberta.*``) match the keys of pretrained RoBERTa checkpoints. When the
    encoder was stored under ``self.bert``, ``from_pretrained`` could not match
    any encoder weight and reported all of them as newly initialized
    (``roberta.bert.embeddings...`` in the loading warning), so fine-tuning
    started from a randomly initialized encoder.

    ``forward`` returns a dict with ``loss`` (``None`` when no labels are
    given), ``start_logits`` and ``end_logits``.
    """

    def __init__(self, config):
        super().__init__(config)
        # No pooling layer: only the per-token hidden states are used by the QA head.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # 2 values per token: start and end logits

        self.post_init()  # Initialize weights

    @property
    def bert(self):
        """Backward-compatible alias for the encoder, formerly stored as ``self.bert``."""
        return self.roberta

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, start_positions=None, end_positions=None):
        """Compute start/end logits and, when labels are given, the span loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            token_type_ids: Optional token type ids passed to the encoder.
            start_positions: Optional gold start token index per row.
            end_positions: Optional gold end token index per row.

        Returns:
            dict with keys ``loss``, ``start_logits`` and ``end_logits``. The
            loss is the mean of the start and end cross-entropy losses.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state  # Get contextual embeddings

        logits = self.qa_outputs(sequence_output)  # Pass embeddings through linear layer
        start_logits, end_logits = logits.split(1, dim=-1)  # Split into start and end logits
        start_logits = start_logits.squeeze(-1)  # Remove extra dimension
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            # Labels outside the sequence are clamped to an index the loss ignores,
            # instead of triggering an out-of-range class error.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2  # Average the loss

        return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits}


## Compute Metrics

In [None]:
metric = load("squad")

def compute_metrics(eval_pred):
    """Compute SQuAD exact-match and F1 for the validation set during training.

    The highest-scoring start and end token of each row are decoded back to
    text and compared with the text decoded from the gold start/end positions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions[0]`` /
            ``predictions[1]`` are the start/end logits and ``labels[0]`` /
            ``labels[1]`` the gold start/end positions.

    Returns:
        dict with ``exact_match`` and ``f1``.

    NOTE(review): token ids are read from the global ``val_dataset``, so this
    only gives meaningful results when the Trainer evaluates on that dataset
    in the same row order.
    """
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    # Read the column once instead of once per decoded list.
    all_input_ids = val_dataset["input_ids"]

    def _decode_span(input_ids, start, end):
        # Special tokens are skipped for predictions and labels alike, so a predicted
        # span that touches padding or sentence markers is compared on its text only.
        return tokenizer.decode(input_ids[start : end + 1], skip_special_tokens=True)

    decoded_preds = [
        _decode_span(input_ids, start, end)
        for input_ids, start, end in zip(all_input_ids, start_preds, end_preds)
    ]

    decoded_labels = [
        _decode_span(input_ids, start, end)
        for input_ids, start, end in zip(all_input_ids, labels[0], labels[1])
    ]

    results = metric.compute(
        predictions=[{"prediction_text": pred, "id": str(i)} for i, pred in enumerate(decoded_preds)],
        references=[{"answers": {"text": [label], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]
    )

    return {
        "exact_match": results["exact_match"],
        "f1": results["f1"]
    }



Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

## Finetuning

In [None]:
def finetune_xlmr(
    model_path = MODEL_NAME,
    checkpoint_path = None,
    learning_rate = 2e-5,
    batch_size = 16,
    num_train_epochs = 10,
    weight_decay = 0.01,
    bf16 = None
    ):
    """Fine-tune the QA model on ``train_dataset`` and save the result to Google Drive.

    Despite its name, this loads the ``RobertaForQuestionAnswering`` class
    defined in this notebook from ``model_path``. Each call creates a new
    timestamped run folder under ``DRIVE_ROOT`` for checkpoints, logs, the
    final model and the tokenizer.

    Args:
        model_path: Checkpoint id or path to start from.
        checkpoint_path: Optional folder of an earlier run's training output;
            training resumes from the last checkpoint found there.
        learning_rate: Peak learning rate (cosine schedule, 10% warmup).
        batch_size: Per-device batch size for training and evaluation.
        num_train_epochs: Maximum number of epochs (early stopping may end sooner).
        weight_decay: Weight decay passed to the Trainer.
        bf16: Whether to train in bfloat16. ``None`` (default) enables it only
            when a CUDA device reporting bf16 support is available, so the
            function also runs on hardware without bf16.

    Returns:
        Tuple ``(model, tokenizer, eval_results, res_model_path, res_tokenized_path)``.
    """
    model = RobertaForQuestionAnswering.from_pretrained(model_path)

    if bf16 is None:
        # A hard-coded bf16=True fails on hardware without bf16 support.
        bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Early stopping parameters
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.1
    )

    batch_timestamp = timestamp()
    print(batch_timestamp)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None
    if checkpoint_path and last_checkpoint is None:
        # Make the silent fallback visible: nothing to resume from in that folder.
        print(f"No checkpoint found in {checkpoint_path}; training starts from {model_path}.")

    # Evaluate and save once per epoch; keep the checkpoint with the best eval F1.
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=3,
        bf16=bf16,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    # Train and evaluate the model
    trainer.train(resume_from_checkpoint=last_checkpoint)
    eval_results = trainer.evaluate()

    res_model_path = get_model_directory(batch_timestamp)
    res_tokenized_path = get_tokenizer_directory(batch_timestamp)

    model.save_pretrained(res_model_path)
    tokenizer.save_pretrained(res_tokenized_path)

    return model, tokenizer, eval_results, res_model_path, res_tokenized_path

## Start Training

In [None]:
# Run fine-tuning with the default hyperparameters.
# Note: this rebinds the global `tokenizer` to the one returned by finetune_xlmr().
model, tokenizer, eval_results, res_model_path, res_tokenized_path = finetune_xlmr()
print(eval_results)
print(res_model_path)
print(res_tokenized_path)

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/437M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at dost-asti/RoBERTa-ceb-cased and are newly initialized: ['roberta.bert.embeddings.LayerNorm.bias', 'roberta.bert.embeddings.LayerNorm.weight', 'roberta.bert.embeddings.position_embeddings.weight', 'roberta.bert.embeddings.token_type_embeddings.weight', 'roberta.bert.embeddings.word_embeddings.weight', 'roberta.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.bert.encoder.layer.0.attention.output.dense.bias', 'roberta.bert.encoder.layer.0.attention.output.dense.weight', 'roberta.bert.encoder.layer.0.attention.self.key.bias', 'roberta.bert.encoder.layer.0.attention.self.key.weight', 'roberta.bert.encoder.layer.0.attention.self.query.bias', 'roberta.bert.encoder.layer.0.attention.self.query.weight', 'roberta.bert.encoder.layer.0.attention.self.value.bias', 'roberta.bert.encoder.layer.0.attention.self.value.weight', 'r

2025-03-15_12-04


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjtlagumbay[0m ([33mjtlagumbay-university-of-the-philippines[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,3.0086,2.984667,6.118755,26.050375
2,2.8011,2.749356,8.291093,28.547775
3,2.4226,2.61478,12.889211,35.470159
4,2.4328,2.474117,15.206372,39.659751
5,2.0029,2.495304,17.559739,42.32937
6,1.8037,2.515841,17.813179,43.06963
7,1.7687,2.567949,17.813179,42.799845
8,1.6343,2.64655,18.175235,42.975827
9,1.5017,2.678159,17.921796,43.179835
10,1.4739,2.710049,17.958001,43.330662


{'eval_loss': 2.7100493907928467, 'eval_exact_match': 17.95800144822592, 'eval_f1': 43.33066150858458, 'eval_runtime': 6.5879, 'eval_samples_per_second': 419.257, 'eval_steps_per_second': 26.26, 'epoch': 10.0}
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/roberta_ceb/2025-03-15_12-04/model
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/roberta_ceb/2025-03-15_12-04/tokenizer


# **Evaluating the model**

## Normalizing predicted answer

In [None]:
def normalize_text(text):
    """Lowercase a string, replace runs of non-word characters with one space, and trim it.

    Articles are not removed. NOTE(review): this reuses the name of the batch
    NFKC ``normalize_text(examples)`` defined earlier in the notebook and shadows
    it once this cell has run, so re-running the dataset-preparation cells
    afterwards would call this function instead.
    """
    lowered = text.lower()
    # Punctuation and other special characters become spaces.
    without_punct = re.sub(r'\W+', ' ', lowered)
    # Collapse any remaining whitespace runs, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', without_punct)
    return collapsed.strip()

## Evaluating

In [None]:
def evaluate(model, tokenizer, dataset, model_outputs=None):
  """Run the QA model over ``dataset`` and score it with the SQuAD metric.

  Args:
    model: Question-answering model handed to the pipeline.
    tokenizer: Tokenizer handed to the pipeline.
    dataset: Iterable of rows with ``question``, ``context["text"]`` and
      ``answer`` (``text`` and ``start``, each a single value or a list).
    model_outputs: Optional precomputed pipeline outputs (dicts with an
      ``answer`` key); when given, the pipeline is not run again.

  Returns:
    Tuple ``(res, pred, ref, model_outputs)``: the metric result, the
    normalized predictions, the normalized references, and the pipeline
    outputs as a list.
  """
  def _as_list(value):
    # Accept both single values and lists for answer text / start.
    return value if isinstance(value, list) else [value]

  print(f"Started evaluation.")

  if model_outputs is None:
      qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

      print("Generated QA Pipeline.")
      print("Starting QA Pipeline batch.")
      qa_dataset = Dataset.from_dict({
        "question": [sample["question"] for sample in dataset],
        "context": [sample["context"]["text"] for sample in dataset]
      })

      model_outputs = qa_pipeline(qa_dataset)

  # A single result may come back as one dict rather than a list of dicts.
  if isinstance(model_outputs, dict):
      model_outputs = [model_outputs]

  print(f"Batched QA done. {len(model_outputs)}")
  print(f"Computing metrics.")

  pred = [
      {
          'id': str(i+1),  # Convert ID to string
          'prediction_text': normalize_text(output['answer'])
      }
      for i, output in enumerate(model_outputs)
  ]

  ref = [
      {
          'id': str(i+1),  # Convert ID to string
          'answers': {
              # Normalize each reference answer on its own; calling normalize_text
              # on a whole list would fail because a list has no .lower().
              'text': [normalize_text(text) for text in _as_list(row['answer']['text'])],
              'answer_start': _as_list(row['answer']['start'])
          }
      }
      for i, row in enumerate(dataset)
  ]

  # Load SQuAD metric
  metric = load("squad")

  # Compute metric
  res = metric.compute(predictions=pred, references=ref)
  print(f"Computing metrics done.")

  # Sentence match
  # int(pred_normalized in truth_normalized or truth_normalized in pred_normalized)

  return res, pred, ref, model_outputs


In [None]:
# NOTE(review): the reload logic below is commented out, so `useExisting` currently has no
# effect. The snippet also refers to BertForQuestionAnswering, which this notebook does not
# import, and would need the model class defined above instead.
useExisting = False
# if useExisting or (model == None and tokenizer == None):
#     print("Loading Previous")
#     model_path = DRIVE_ROOT+"/2025-03-13_03-58/model"
#     tokenizer_path = DRIVE_ROOT+"/2025-03-13_03-58/tokenizer"
#     model = model = BertForQuestionAnswering.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Score the fine-tuned model on the untokenized test split.
# Note: this rebinds `eval_results` (previously the Trainer's metrics dict) to the
# (res, pred, ref, model_outputs) tuple returned by evaluate().
eval_results = evaluate(
    model = model,
    tokenizer = tokenizer,
    dataset = test_dataset
)

Device set to use cuda:0


Started evaluation.
Generated QA Pipeline.
Starting QA Pipeline batch.
Batched QA done. 5526
Computing metrics.
Computing metrics done.


## **Reminder**

1. Save the evaluation results to the [spreadsheet](https://docs.google.com/spreadsheets/d/1Xc3-6yVMMLoXCqId-YQFyOvgQfemIQ8P2uLYC6Jszeo/edit?gid=0#gid=0).
2. Save the WANDB chart to the [Google Drive](https://drive.google.com/drive/u/0/folders/1inDiei-xuRlofFPJmVj8OS6pmyQdGQ2z) folder of the corresponding model.
3. Change the Colab runtime afterwards.

In [None]:
# Unpack the evaluate() result and show the test-set exact-match / F1 scores as a one-row table.
res, pred, ref, model_outputs = eval_results

display(pd.DataFrame(res, index=[0]))




Unnamed: 0,exact_match,f1
0,22.366992,49.084805


In [None]:
# Put each normalized prediction next to its (first) normalized reference answer for inspection.
pred_answers = [ans["prediction_text"] for ans in pred]
ref_answers = [ans["answers"]["text"][0] for ans in ref]

df = pd.DataFrame({
    "Predicted Answer": pred_answers,
    "Reference Answer": ref_answers
})


display(df)

Unnamed: 0,Predicted Answer,Reference Answer
0,cebu sa biyernes disyembre 1 niasoy,si imee niu
1,wa na maabti,dali nga nakasibat
2,south road properties srp,south road properties srp new business and hou...
3,usa ka juling pamugas nga mikanit ngadto sa la...,nagsugod sa balay sa usa ka juling pamugas nga...
4,molurang ang dautang panahon,kon molurang ang dautang panahon
...,...,...
5521,laing taga motorbanca nga ningsalmot ra usab s...,ang mga sakay gidali sa pagtabang sa laing tag...
5522,usa ka drug personality,ginadiling drugas
5523,duha so far nga out of town contingents,canlaon city sa negros oriental ug lungsod sa ...
5524,alas 2 sa kaadlawon sa lunes agusto 26 2024,udto sa domingo agusto 25 2024 hangtod nga nat...
