<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta_ceb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa-ceb

# **Libraries**

In [1]:
!pip install datasets
!pip install evaluate
!pip install -U datasets huggingface_hub fsspec


Collecting fsspec
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [89]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer, RobertaPreTrainedModel, RobertaModel, RobertaTokenizerFast, BertPreTrainedModel, BertModel, BertTokenizerFast
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import numpy as np
import unicodedata
import torch
import torch.nn as nn

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Constants**

In [75]:
CEBQA_DATASET = "jhoannarica/cebquad_split"
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new_split/roberta_ceb"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "logs"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"
MODEL_NAME = "dost-asti/BERT-ceb-cased"

# **Utils**

In [5]:
def timestamp():
  return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [76]:
dataset = load_dataset(CEBQA_DATASET)

# **Prepare Dataset**

## Prepare tokenizer

In [77]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/569k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/249k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/76.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

In [78]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and \
           example["context"] and example["answer"]

def decode_error(example):
  input_ids = example["input_ids"]
  start_positions = example["start_positions"]
  end_positions = example["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions+1]
  return tokenizer.decode(predict_answer_tokens).strip() == example["answer"]

In [80]:
def tokenize_train_function(examples):
    context_text = [context for context in examples.get("context", [{}])]
    answer_text = examples.get("answer", [{}])
    question_text = [q.strip() for q in examples.get("question", [""])]
    answer_start = examples.get("answer_start", [0])
    context_start_list = examples.get("context_start", [0])
    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        context_text,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answer_text[sample_idx]
        start_char = answer_start[sample_idx]
        end_char = answer_start[sample_idx] + len(answer)
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [81]:
def normalize_text(examples):
    examples["context"] = [unicodedata.normalize("NFKC", context) for context in examples["context"]]

    examples["article_body"] = [unicodedata.normalize("NFKC", body) for body in examples["article_body"]]

    examples["answer"] =  [unicodedata.normalize("NFKC", answer) for answer in examples["answer"]]

    examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

    return examples


In [82]:
# Clean and tokenize the dataset
tokenized_train_dataset = dataset["train"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Map:   0%|          | 0/19300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19300 [00:00<?, ? examples/s]

In [83]:
# Clean and tokenize the dataset
tokenized_validation_dataset = dataset["validation"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2732 [00:00<?, ? examples/s]

In [84]:
# Clean and tokenize the dataset
tokenized_test_dataset = dataset["test"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Map:   0%|          | 0/5596 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5596 [00:00<?, ? examples/s]

## Dataset Splitting

In [85]:
train_dataset = tokenized_train_dataset
val_dataset = tokenized_validation_dataset
test_dataset = tokenized_test_dataset

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")

train: 18221 
val: 2577 
test: 5291


In [86]:
def test_decode(dataset, idx):
  input_ids = dataset[idx]["input_ids"]
  start_positions = dataset[idx]["start_positions"]
  end_positions = dataset[idx]["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions + 1]
  return tokenizer.decode(predict_answer_tokens).strip(), dataset[idx]["answer"]

idx = 1
print(test_decode(train_dataset, idx))
print(test_decode(val_dataset, idx))

('grave misconduct', 'grave misconduct')
('Gelou Agan alyas Jomarie', 'Gelou Agan alyas Jomarie')


# **Model Training**

## Modifying the model

In [87]:
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = RobertaModel(config)  # Load BERT encoder
        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # Output layer (2 values: start and end logits)

        self.init_weights()  # Initialize weights

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, start_positions=None, end_positions=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state  # Get contextual embeddings

        logits = self.qa_outputs(sequence_output)  # Pass embeddings through linear layer
        start_logits, end_logits = logits.split(1, dim=-1)  # Split into start and end logits
        start_logits = start_logits.squeeze(-1)  # Remove extra dimension
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2  # Average the loss

        return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits}


In [90]:
class BertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)  # Load BERT encoder
        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # Output layer (2 values: start and end logits)

        self.init_weights()  # Initialize weights

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, start_positions=None, end_positions=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state  # Get contextual embeddings

        logits = self.qa_outputs(sequence_output)  # Pass embeddings through linear layer
        start_logits, end_logits = logits.split(1, dim=-1)  # Split into start and end logits
        start_logits = start_logits.squeeze(-1)  # Remove extra dimension
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2  # Average the loss

        return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits}


## Compute Metrics

In [91]:
metric = load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    decoded_preds = [
        tokenizer.decode(input_ids[start : end+1])
        for input_ids, start, end in zip(val_dataset["input_ids"], start_preds, end_preds)
    ]

    decoded_labels = [
        tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True)
        for input_ids, start, end in zip(val_dataset["input_ids"], labels[0], labels[1])
    ]

    pred = [{"prediction_text": pred.strip(), "id": str(i)} for i, pred in enumerate(decoded_preds)]
    ref = [{"answers": {"text": [label.strip()], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]

    results = metric.compute(
        predictions=pred,
        references=ref
    )

    sentence_match_scores = [
        p['prediction_text'].strip() in r['answers']['text'][0].strip() for p, r in zip(pred, ref)
    ]

    avg_sentence_match = np.mean(sentence_match_scores)


    res = {
        "exact_match": results["exact_match"],
        "f1": results["f1"],
        "sentence_match": float(avg_sentence_match ) * 100
    }

    print(res)
    return res



## Finetuning

In [92]:
def finetune_xlmr(
    model_path = MODEL_NAME,
    checkpoint_path = None,
    learning_rate = 1e-5,
    batch_size = 16,
    num_train_epochs = 10,
    weight_decay = 0.01
    ):
    model = BertForQuestionAnswering.from_pretrained(model_path)

    # Early stopping parameters
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.1
    )

    batch_timestamp = timestamp()
    print(batch_timestamp)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=3,
        bf16=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train(
        resume_from_checkpoint=last_checkpoint if last_checkpoint else None
        )
    print("evaluating")
    eval_results = trainer.evaluate()

    res_model_path = get_model_directory(batch_timestamp)
    res_tokenized_path = get_tokenizer_directory(batch_timestamp)

    model.save_pretrained(res_model_path)
    tokenizer.save_pretrained(res_tokenized_path)

    return model, tokenizer, eval_results, res_model_path, res_tokenized_path

## Start Training

In [100]:
model, tokenizer, eval_results, res_model_path, res_tokenized_path = finetune_xlmr(
    num_train_epochs = 2
)
print(eval_results)
print(res_model_path)
print(res_tokenized_path)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dost-asti/BERT-ceb-cased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attent

2025-05-16_14-49


Epoch,Training Loss,Validation Loss,Exact Match,F1,Sentence Match
1,2.8777,2.865024,6.092355,27.599378,32.634847
2,2.6754,2.775015,7.489329,28.289305,35.506403


{'exact_match': 6.092355452076057, 'f1': 27.599377971992077, 'sentence_match': 32.6348467209934}
{'exact_match': 7.4893286767559175, 'f1': 28.289305404230856, 'sentence_match': 35.50640279394645}
evaluating


{'exact_match': 7.4893286767559175, 'f1': 28.289305404230856, 'sentence_match': 35.50640279394645}
{'eval_loss': 2.775015115737915, 'eval_exact_match': 7.4893286767559175, 'eval_f1': 28.289305404230856, 'eval_sentence_match': 35.50640279394645, 'eval_runtime': 6.1573, 'eval_samples_per_second': 418.525, 'eval_steps_per_second': 26.31, 'epoch': 2.0}
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new_split/roberta_ceb/2025-05-16_14-49/model
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new_split/roberta_ceb/2025-05-16_14-49/tokenizer


# **Evaluating the model**

## Normalizing predicted answer

In [94]:
def normalize_row(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

## Evaluating

In [97]:
def evaluate(model, tokenizer, dataset, model_outputs=None):
  print(f"Started evaluation.")

  if model_outputs is None:
      qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

      print("Generated QA Pipeline.")
      print("Starting QA Pipeline batch.")
      qa_dataset = Dataset.from_dict({
        "question": [sample["question"] for sample in dataset],
        "context": [sample["article_body"] for sample in dataset]
      })

      model_outputs = qa_pipeline(qa_dataset)

  print(f"Batched QA done. {len(model_outputs)}")
  print(f"Computing metrics.")

  pred = [
      {
          'id': str(i+1),  # Convert ID to string
          'prediction_text': normalize_row(output['answer'])
      }
      for i, output in enumerate(model_outputs)
  ]

  ref = [
      {
          'id': str(i+1),  # Convert ID to string
          'answers': {
              'text': normalize_row(row['answer']) if isinstance(row['answer'], list) else ([normalize_row(row['answer'])]),
              'answer_start': row['answer_start'] if isinstance(row['answer_start'], list) else [row['answer_start']]
          }
      }
      for i, row in enumerate(dataset)
  ]

  # Load SQuAD metric
  metric = load("squad")

  # Compute metric
  res = metric.compute(predictions=pred, references=ref)
  print(f"Computing metrics done.")


  # Sentence match
  sentence_match_scores = [
      p['prediction_text'] in r['answers']['text'][0] for p, r in zip(pred, ref)
  ]

  # Compute average sentence match score
  avg_sentence_match = np.mean(sentence_match_scores)

  # Combine results
  res["sentence_match"] = float(avg_sentence_match ) * 100

  return res, pred, ref, model_outputs


In [101]:
useExisting = False
# if useExisting or (model == None and tokenizer == None):
#     print("Loading Previous")
#     model_path = DRIVE_ROOT+"/2025-03-13_03-58/model"
#     tokenizer_path = DRIVE_ROOT+"/2025-03-13_03-58/tokenizer"
#     model = model = BertForQuestionAnswering.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

eval_results = evaluate(
    model = model,
    tokenizer = tokenizer,
    dataset = test_dataset
)

Device set to use cuda:0


Started evaluation.
Generated QA Pipeline.
Starting QA Pipeline batch.




Batched QA done. 5291
Computing metrics.
Computing metrics done.


## **Reminder**

1. Save Data to [spreadsheet](https://docs.google.com/spreadsheets/d/1Xc3-6yVMMLoXCqId-YQFyOvgQfemIQ8P2uLYC6Jszeo/edit?gid=0#gid=0)
2. Save WANDB chart to [Google Drive](https://drive.google.com/drive/u/0/folders/1inDiei-xuRlofFPJmVj8OS6pmyQdGQ2z) of the corresponding model.
3. Change runtime after.

In [102]:
res, pred, ref, model_outputs = eval_results

display(pd.DataFrame(res, index=[0]))




Unnamed: 0,exact_match,f1,sentence_match
0,2.173502,7.418077,2.967303


In [68]:
pred_answers = [ans["prediction_text"] for ans in pred]
ref_answers = [ans["answers"]["text"][0] for ans in ref]

df = pd.DataFrame({
    "Predicted Answer": pred_answers,
    "Reference Answer": ref_answers
})


display(df)

Unnamed: 0,Predicted Answer,Reference Answer
0,p1 000,50 gramos
1,p1 000,p340 000
2,p1 000,gammy
3,p1 000,25
4,p1 000,enan
...,...,...
5286,pitogo nakadawat og p300 000,land transportation franchising and regulatory...
5287,land transportation franchising and regulatory...,impulse irrational thinking misjudgment poor d...
5288,elopre manilag iii,duha
5289,pitogo nakadawat og p300 000,elopre manilag iii
