<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [3]:
!pip install datasets
!pip install evaluate
!pip install optuna



In [4]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import optuna
import numpy as np





In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Constants**

In [6]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/Shareddrives/cebqa_roberta/xlmr"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [7]:
BATCH_TIMESTAMP = ""
def timestamp(append):
  return datetime.datetime.now().strftime("%Y-%m-%d_%H")+"-"+str(append)

def get_output_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{OUTPUT_DIRECTORY}"

def get_logs_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{LOGS_DIRECTORY}"

def get_model_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{MODEL_DIRECTORY}"

def get_tokenizer_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{TOKENIZER_DIRECTORY}"

In [63]:
timestamp(2)

'2025-03-08_07-2'

# **Loading Dataset**

## Access dataset

In [8]:
dataset = load_dataset(CEBQA_DATASET)

In [9]:
dataset["train"][120]

{'id': '01529-002',
 'article_id': 1529,
 'article_title': 'Tourist van nahulog kay driver nakatulog',
 'article_body': 'Nangalandig sa emergency room sa Badian District Hospital sa Brgy. Poblacion, Badian, habagatang Sugbo, ang upat ka mga turista ug drayber sa van nga ilang gisakyan human naaksidente sa alas 3:40 sa kaadlawon sa Biyernes, Nobiyembre 17, 2023, sa Brgy. Poblacion. Ang drayber nakatulog kay hayan lapoy pa kini sa iyang kapin sa 100 ka kilometro nga biyahe sa amihanang Sugbo. Hinuon minor injuries lang ang naangkon sa mga biktima busa nakagawas ra dayon sa ospital human matambali ug mahiling. Basi sa nakuhang kasayuran sa Superbalita sa Cebu gikan sa kasaligang tinubdan, nailhan ang mga biktima nga turista nga puro taga San Antonio, Tondo, Manila, nga sila si Antonietto Avila Libunao, 64, minyo; iyang asawa nga si Carmen Pacione; Lorence Pacis Paclibon , 40, minyo; ug anak niini nga si Pacomios Pacis Paclibon, 5. Samtang ang drayber nga naangol giila nga si Emeniano Jorg

In [10]:
# Initialize variables to track the longest article
longest_article = None
max_length = 0

# Iterate through each article in the train dataset
for article in dataset["train"]:
    # Concatenate article_body and context
    combined_text = article["article_body"] + article["question"]

    # Calculate the length of the combined text
    combined_length = len(combined_text)

    # Update if this article is the longest found so far
    if combined_length > max_length:
        max_length = combined_length
        longest_article = article

# Print the longest article and its length
print(f"Longest combined article length: {max_length}")
print(f"Longest article: {longest_article}")


Longest combined article length: 5911
Longest article: {'id': '00127-003', 'article_id': 127, 'article_title': 'Senado tensiyonado atol sa pag-imbestigar ni Balderas', 'article_body': 'Puno sa tensyon atol sa imbestigasyon sa Senado sa gi-raid nga ilegal nga Pogo hub niadtong Lunes, Septiyembre 9, 2024, tungod kay ang mga magbabalaod nangasuko ni dismissed Bamban, Tarlac Mayor Aretha Balderas, kinsa nagdumili sa pagtubag sa ilang mga pangutana. Samtang ang mga magbabalaod nangutana kung giunsa niya ug ang pipila ka mga miyembro sa iyang pamilya mibiya sa nasod, si Balderas nagdumili sa paghingalan sa tawo nga nagpahigayon sa ilang pag-ikyas, tungod sa kahadlok sa iyang kinabuhi. Gisuwat hinuon niya ang ngalan sa tawo sa usa ka papel sa hangyo ni Senate President Pro Tempore Stanford Baldomar. Gihangyo ni Balderas ang mga magbabalaod nga dili isulti og kusog ang ngalan. "Do not tell the senators what to do with the information. Pinagbibigyan ka namin isulat sa papel," matod ni Senate Co

# **Prepare Dataset**

## Prepare tokenizer

In [11]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

In [24]:
print(tokenizer.model_max_length)


512


## Tokenize

In [12]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and "text" in example["context"] and \
           example["context"]["text"] and example["answer"]["text"]


In [13]:
def tokenize_function(examples):
    context_text = [context.get("text", "") for context in examples.get("context", [{}])]
    question_text = examples.get("question", [""])

    tokenized_examples = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    # Lists to store calculated start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        start_token = 0
        end_token = 0
        sample_index = sample_mapping[i]
        answer = examples["answer"][sample_index]

        # Handle missing or empty answers
        if len(answer["text"]) == 0:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Get the answer's start and end character positions
        start_char = answer["start"]
        end_char = answer["end"]

        # Get the sequence IDs to identify the context part
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Identify the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # Check if the answer is out of the bounds of the context
        if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Find start and end tokens for the answer
        start_token = next(
            (idx for idx, offset in enumerate(offsets)
            if offset[0] <= start_char <= offset[1]),
            None
        )
        end_token = next(
            (idx for idx, offset in enumerate(offsets)
            if offset[0] <= end_char <= offset[1]),
            None
        )

        if start_token is None:
            raise ValueError("Start character position not found in token offsets.")

        if end_token is None:
            raise ValueError("Start character position not found in token offsets.")

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add start and end positions to the tokenized examples
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Tokenize the dataset
tokenized_dataset = dataset.filter(filter_incomplete_examples).map(tokenize_function, batched=True)


Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset["train"].features

{'id': Value(dtype='string', id=None),
 'article_id': Value(dtype='int64', id=None),
 'article_title': Value(dtype='string', id=None),
 'article_body': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'context': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'answer': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'context_start': Value(dtype='int64', id=None),
 'context_end': Value(dtype='int64', id=None),
 'answer_start': Value(dtype='int64', id=None),
 'answer_end': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'offset_mapping': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'overflow_to_

## Dataset Splitting

In [15]:
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]

train_dataset

Dataset({
    features: ['id', 'article_id', 'article_title', 'article_body', 'question', 'context', 'answer', 'context_start', 'context_end', 'answer_start', 'answer_end', 'input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
    num_rows: 19340
})

# **Model Training**

## Load Pre-Trained RoBERTa

In [16]:
model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Early Stopping

In [17]:
# Early stopping parameters
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum change in the metric to qualify as an improvement
)

## Training Argument

In [18]:

# Function to normalize text for comparison
def normalize_text(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Function to compute F1 score
def compute_f1(pred, truth):
    pred_tokens = normalize_text(pred).split()
    truth_tokens = normalize_text(truth).split()

    # Calculate common tokens
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if len(common_tokens) == 0:
        return 0

    # Precision and Recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    # F1 score
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# Function to compute Exact Match (EM)
def compute_exact_match(pred, truth):
    return int(normalize_text(pred) == normalize_text(truth))

# Function to compute Sentence Match
def compute_sentence_match(pred, truth):
    pred_normalized = normalize_text(pred)
    truth_normalized = normalize_text(truth)
    return int(pred_normalized in truth_normalized or truth_normalized in pred_normalized)

In [19]:
def postprocess_qa_predictions(examples, start_logits, end_logits, tokenizer):
    """
    Convert model logits into readable answers.
    """
    predictions = []

    for i in range(len(start_logits)):
        start_idx = np.argmax(start_logits[i])  # Best start index
        end_idx = np.argmax(end_logits[i])  # Best end index

        # Ensure valid span
        if start_idx >= len(examples["input_ids"][i]) or end_idx >= len(examples["input_ids"][i]):
            predictions.append("")
            continue

        if start_idx > end_idx:  # If invalid prediction
            predictions.append("")
            continue

        # Decode the predicted answer
        input_ids = examples["input_ids"][i]
        answer_tokens = input_ids[start_idx : end_idx + 1]
        prediction = tokenizer.decode(answer_tokens, skip_special_tokens=True)
        predictions.append(prediction)

    return predictions

In [20]:
metric = load("squad")

def compute_metrics(eval_pred):
    # print(f"\n\neval pred")
    # print(eval_pred)
    logits, labels = eval_pred
    # print(f"\n\nlogits")
    # print(logits)
    # print(f"\n\nlabels")
    # print(labels)
    start_logits, end_logits = logits
    # print(f"\n\nstart_logits")
    # print(start_logits)
    # print(f"\n\nend_logits")
    # print(end_logits)
    # Convert logits to text predictions
    predictions = postprocess_qa_predictions(val_dataset, start_logits, end_logits, tokenizer)

    # Format references correctly
    references = [
        {"id": str(i), "answers": {"text": [data["answer"]["text"]], "answer_start": [data["answer"]["start"]]}}
        for i, data in enumerate(val_dataset.select)
    ]

    print(references)

    # Compute F1
    return metric.compute(predictions=[{"id": str(i), "prediction_text": pred} for i, pred in enumerate(predictions)], references=references)



In [100]:
val_dataset[0]

{'id': '01543-003',
 'article_id': 1543,
 'article_title': 'Japanese music video, nakig-collab sa Pinoy artists, midaog na sab',
 'article_body': 'ANG siyudad sa Hadano, Kanagawa Prefecture, Japan mipahigayon sa ilang 2nd Hadastragram movie contest niadtong Nobiyembre 6, 2023. Moabot ngadto sa 195 ka entry nga gisalmot diin ang "Bathroom Orchestra Instrumental" music video sa Japanese musician-film maker Jonneper Padil, a. k. a. iwapt, ang gideklarar nga grand prix champion. Ang maong music video adunay collaboration sa Filipino musician nga naglakip nilang Erlinda Leones ug Randy Lepasana, drummer sa OPM band nga Neocolours. Ang maong awit nga gi-compose ni iwapt gi-record sa Manila niadtong 2010. Ang iyang music video nag-promote sa Hadano City nga nagpakita sa inila nga underground spring water sa maong dapit. Gi-showcase sab ang ilang nature-rich parks ug observatories, sikat nila nga delicacies, ug ang dapit sa Hadano diin gipahigayon ang 2020 Tokyo Olympics.',
 'question': 'Kinsa

In [21]:
def objective(trial):
    BATCH_TIMESTAMP = timestamp(trial.number)
    # Suggest values for hyperparameters
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(),
        logging_steps=10,
        save_total_limit=3,
        bf16=True  # Best for A100
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)

    model.save_pretrained(get_model_directory())
    tokenizer.save_pretrained(get_output_directory())

    # Optimize based on F1 Score (maximize it)
    return eval_results["eval_f1"]

In [84]:
eval_results

{'eval_loss': 1.9270097017288208,
 'eval_runtime': 4.4091,
 'eval_samples_per_second': 626.435,
 'eval_steps_per_second': 19.732,
 'epoch': 3.0}

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

# Get the best trial
best_trial = study.best_trial
# Print best trial number and its hyperparameters
print(f"Best Trial: {best_trial.number}")
print("Best Hyperparameters:", best_trial.params)
print(f"Best F1 Score: {best_trial.value:.4f}")

[I 2025-03-08 10:15:14,461] A new study created in memory with name: no-name-46c06be6-4b60-466b-8d5f-8d4ff5f0d155
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjtlagumbay[0m ([33mjtlagumbay-university-of-the-philippines[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


# **Evaluating the model**

## Evaluating

## Inference

In [40]:
try:
  model
  tokenizer
except:
    model_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/model"
    tokenizer_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/tokenizer"

    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [42]:
test_dataset = dataset["test"].shuffle(seed=60).select(range(50))

# List to store results
results_list = []

# Iterate through each sample in test_dataset
for sample in test_dataset:
    question = sample["question"]
    context = sample["article_body"]
    expected_answer = sample["answer"]["text"] if sample["answer"]["text"] else "N/A"  # Handle empty answers

    # Get model prediction
    model_output = qa_pipeline(question=question, context=context)
    actual_answer = model_output["answer"]

    # Append results
    results_list.append({
        "Question": question,
        "Expected Answer": expected_answer,
        "Actual Answer": actual_answer
    })

for result in results_list:
    expected = result["Expected Answer"]
    actual = result["Actual Answer"]

    result["F1 Score"] = compute_f1(actual, expected)
    result["Exact Match"] = compute_exact_match(actual, expected)
    result["Sentence Match"] = compute_sentence_match(actual, expected)


# Convert to DataFrame for better visualization
df = pd.DataFrame(results_list)

# Display as a table
display(df)

Unnamed: 0,Question,Expected Answer,Actual Answer,F1 Score,Exact Match,Sentence Match
0,Pila ang kita nga nakolekta sa Kapitolyo gikan...,P303 milyunes,P628 milyunes,0.5,0,0
1,Unsa ang gibuhat sa biktima sa dili pa siya gi...,nagpalit lang og ice water,"Rhamsontal Eprol Labor Melgarejo,",0.0,0,0
2,Unsa ang gibuhat sa Mandaue City Government ar...,pag-integrate sa mental health care ngadto sa ...,community-based mental health initiative,0.210526,0,0
3,Kinsa ang gipanglantawan nga mobisita sa econo...,Russian President Bumagat Dioyo,children’s hospital,0.0,0,0
4,Unsa ang nahitabo kang Inchak Lydwena tungod s...,nilupad lusot sa windshield,nilupad lusot sa windshield,1.0,1,1
5,Unsang petsa ug oras nahitabo ang buy-bust sa ...,"alas 8:10 sa Biyernes Santo sa gabii, Marso 29...",alas 10:55 sa Miyerkules Santo,0.470588,0,0
6,Nganong gihimo ni Baliwan ang krimen?,tungod usab sa iyang selos,tungod usab sa iyang selos,1.0,1,1
7,Unsa ang gibutyag ni Acabal bahin sa ilang kol...,ila usab nga gipauswag ang ilang koleksyon sa ...,nagbutang na sila og pukot sa mga utlanan sa sapa,0.275862,0,0
8,Kinsa ang mayor sa Lapu-Lapu?,Junard Abalos,Junard Abalos,1.0,1,1
9,Unsa ang gibalaod ni Espiras nga mahitabo sa e...,domino effect,dili na mangayo og usbaw sa plitehan.,0.0,0,0


In [43]:
avg_f1 = df["F1 Score"].mean()
avg_em = df["Exact Match"].mean()
avg_sm = df["Sentence Match"].mean()

print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Exact Match: {avg_em:.4f}")
print(f"Average Sentence Match: {avg_sm:.4f}")

Average F1 Score: 0.4332
Average Exact Match: 0.3200
Average Sentence Match: 0.4000
