<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [34]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [36]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **Constants**

In [3]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [4]:
BATCH_TIMESTAMP = ""
def timestamp():
  return datetime.datetime.now().strftime("%Y-%m-%d_%H")

def get_output_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{OUTPUT_DIRECTORY}"

def get_logs_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{LOGS_DIRECTORY}"

def get_model_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{MODEL_DIRECTORY}"

def get_tokenizer_directory():
  return f"{DRIVE_ROOT}/{BATCH_TIMESTAMP}/{TOKENIZER_DIRECTORY}"

In [5]:
timestamp()

'2025-02-25_01'

# **Loading Dataset**

## Access dataset

In [6]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.arrow:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

validation.arrow:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

test.arrow:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2763 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5526 [00:00<?, ? examples/s]

In [6]:
dataset["train"][120]

{'id': '01529-002',
 'article_id': 1529,
 'article_title': 'Tourist van nahulog kay driver nakatulog',
 'article_body': 'Nangalandig sa emergency room sa Badian District Hospital sa Brgy. Poblacion, Badian, habagatang Sugbo, ang upat ka mga turista ug drayber sa van nga ilang gisakyan human naaksidente sa alas 3:40 sa kaadlawon sa Biyernes, Nobiyembre 17, 2023, sa Brgy. Poblacion. Ang drayber nakatulog kay hayan lapoy pa kini sa iyang kapin sa 100 ka kilometro nga biyahe sa amihanang Sugbo. Hinuon minor injuries lang ang naangkon sa mga biktima busa nakagawas ra dayon sa ospital human matambali ug mahiling. Basi sa nakuhang kasayuran sa Superbalita sa Cebu gikan sa kasaligang tinubdan, nailhan ang mga biktima nga turista nga puro taga San Antonio, Tondo, Manila, nga sila si Antonietto Avila Libunao, 64, minyo; iyang asawa nga si Carmen Pacione; Lorence Pacis Paclibon , 40, minyo; ug anak niini nga si Pacomios Pacis Paclibon, 5. Samtang ang drayber nga naangol giila nga si Emeniano Jorg

In [7]:
# Initialize variables to track the longest article
longest_article = None
max_length = 0

# Iterate through each article in the train dataset
for article in dataset["train"]:
    # Concatenate article_body and context
    combined_text = article["article_body"] + article["question"]

    # Calculate the length of the combined text
    combined_length = len(combined_text)

    # Update if this article is the longest found so far
    if combined_length > max_length:
        max_length = combined_length
        longest_article = article

# Print the longest article and its length
print(f"Longest combined article length: {max_length}")
print(f"Longest article: {longest_article}")


Longest combined article length: 5911
Longest article: {'id': '00127-003', 'article_id': 127, 'article_title': 'Senado tensiyonado atol sa pag-imbestigar ni Balderas', 'article_body': 'Puno sa tensyon atol sa imbestigasyon sa Senado sa gi-raid nga ilegal nga Pogo hub niadtong Lunes, Septiyembre 9, 2024, tungod kay ang mga magbabalaod nangasuko ni dismissed Bamban, Tarlac Mayor Aretha Balderas, kinsa nagdumili sa pagtubag sa ilang mga pangutana. Samtang ang mga magbabalaod nangutana kung giunsa niya ug ang pipila ka mga miyembro sa iyang pamilya mibiya sa nasod, si Balderas nagdumili sa paghingalan sa tawo nga nagpahigayon sa ilang pag-ikyas, tungod sa kahadlok sa iyang kinabuhi. Gisuwat hinuon niya ang ngalan sa tawo sa usa ka papel sa hangyo ni Senate President Pro Tempore Stanford Baldomar. Gihangyo ni Balderas ang mga magbabalaod nga dili isulti og kusog ang ngalan. "Do not tell the senators what to do with the information. Pinagbibigyan ka namin isulat sa papel," matod ni Senate Co

# **Prepare Dataset**

## Prepare tokenizer

In [8]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
# special_tokens = {"additional_special_tokens": ["<q>", "<c>"]}

# # Add the new tokens to the tokenizer's vocabulary
# tokenizer.add_special_tokens(special_tokens)

# special_tokens = tokenizer.special_tokens_map

# Print each special token and its corresponding ID
# for token_name, token_value in special_tokens.items():
#     token_id = tokenizer.convert_tokens_to_ids(token_value)
#     print(f"{token_name}: '{token_value}' -> {token_id}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
print(tokenizer.model_max_length)


512


## Tokenize

In [9]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and "text" in example["context"] and \
           example["context"]["text"] and example["answer"]["text"]

# print(len(dataset["train"]))
# print(len(dataset.filter(filter_incomplete_examples)["train"]))
# print(len(dataset["test"]))
# print(len(dataset.filter(filter_incomplete_examples)["test"]))
# print(len(dataset["validation"]))
# print(len(dataset.filter(filter_incomplete_examples)["validation"]))

In [10]:
# # Tokenize function
# def tokenize_function(data):
#     input = f"<q>{data['question']}<c>{data['context']['text']}"
#     return tokenizer(input, padding="max_length", truncation=True)

# # Apply tokenization to the dataset
# tokenized_dataset = dataset.map(tokenize_function, batched=False)

def tokenize_function(examples):
    context_text = [context.get("text", "") for context in examples.get("context", [{}])]
    question_text = examples.get("question", [""])

    tokenized_examples = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    # Lists to store calculated start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        start_token = 0
        end_token = 0
        sample_index = sample_mapping[i]
        answer = examples["answer"][sample_index]

        # Handle missing or empty answers
        if len(answer["text"]) == 0:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Get the answer's start and end character positions
        start_char = answer["start"]
        end_char = answer["end"]

        # Get the sequence IDs to identify the context part
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Identify the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # Check if the answer is out of the bounds of the context
        if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Find start and end tokens for the answer
        start_token = next(
            (idx for idx, offset in enumerate(offsets)
            if offset[0] <= start_char <= offset[1]),
            None
        )
        end_token = next(
            (idx for idx, offset in enumerate(offsets)
            if offset[0] <= end_char <= offset[1]),
            None
        )

        if start_token is None:
            raise ValueError("Start character position not found in token offsets.")

        if end_token is None:
            raise ValueError("Start character position not found in token offsets.")

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add start and end positions to the tokenized examples
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Tokenize the dataset
tokenized_dataset = dataset.filter(filter_incomplete_examples).map(tokenize_function, batched=True)


Filter:   0%|          | 0/19340 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2763 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5526 [00:00<?, ? examples/s]

Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Map:   0%|          | 0/5526 [00:00<?, ? examples/s]

In [11]:
tokenized_dataset["train"].features

{'id': Value(dtype='string', id=None),
 'article_id': Value(dtype='int64', id=None),
 'article_title': Value(dtype='string', id=None),
 'article_body': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'context': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'answer': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'context_start': Value(dtype='int64', id=None),
 'context_end': Value(dtype='int64', id=None),
 'answer_start': Value(dtype='int64', id=None),
 'answer_end': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'offset_mapping': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'overflow_to_

## Dataset Splitting

In [12]:
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]

train_dataset

Dataset({
    features: ['id', 'article_id', 'article_title', 'article_body', 'question', 'context', 'answer', 'context_start', 'context_end', 'answer_start', 'answer_end', 'input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
    num_rows: 19340
})

# **Model Training**

## Load Pre-Trained RoBERTa

In [13]:
model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Early Stopping

In [14]:
# Early stopping parameters
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum change in the metric to qualify as an improvement
)

## Training Argument

In [15]:
BATCH_TIMESTAMP = timestamp()
training_args = TrainingArguments(
    output_dir = get_output_directory(),
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir=get_logs_directory(),
    logging_steps=10,
    save_total_limit=2,
)

## Trainer

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback]
)


## Saving the fine-tuned model

In [17]:
trainer.train()
model.save_pretrained(get_model_directory())
tokenizer.save_pretrained(get_output_directory())



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjtlagumbay[0m ([33mjtlagumbay-university-of-the-philippines[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,2.2811,2.235628
2,1.7748,1.860058
3,1.7744,1.698218


KeyboardInterrupt: 

In [21]:
model.save_pretrained(get_model_directory())
tokenizer.save_pretrained(get_tokenizer_directory())

('/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr/2025-02-25_01/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr/2025-02-25_01/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr/2025-02-25_01/tokenizer/sentencepiece.bpe.model',
 '/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr/2025-02-25_01/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr/2025-02-25_01/tokenizer/tokenizer.json')

# **Evaluating the model**

## Evaluating

In [38]:
metric_f1 = load("squad")
def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    # Extract text answers
    pred_answers = [p["answer"] for p in predictions]
    true_answers = [a["text"][0] if len(a["text"]) > 0 else "" for a in labels]

    # Compute F1 and Exact Match (EM)
    results = metric_f1.compute(predictions=pred_answers, references=true_answers)

    # Compute Sentence Match (exact string match)
    sentence_match = np.mean([pred == true for pred, true in zip(pred_answers, true_answers)])

    results["sentence_match"] = sentence_match
    return results

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [39]:
results = trainer.evaluate()
eval_results = trainer.evaluate()
custom_metrics = compute_metrics(eval_results)

# Print results
print("Evaluation Results:")
print(f"F1 Score: {custom_metrics['f1']:.4f}")
print(f"Exact Match (EM): {custom_metrics['exact_match']:.4f}")
print(f"Sentence Match: {custom_metrics['sentence_match']:.4f}")


Epoch,Training Loss,Validation Loss
1,2.2811,2.235628
2,1.7748,1.860058
3,1.7035,1.730978


ValueError: not enough values to unpack (expected 2, got 1)

## Inference

In [22]:
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [31]:
test_dataset = dataset["test"].shuffle(seed=60).select(range(50))

# List to store results
results_list = []

# Iterate through each sample in test_dataset
for sample in test_dataset:
    question = sample["question"]
    context = sample["article_body"]
    expected_answer = sample["answer"]["text"] if sample["answer"]["text"] else "N/A"  # Handle empty answers

    # Get model prediction
    model_output = qa_pipeline(question=question, context=context)
    actual_answer = model_output["answer"]

    # Append results
    results_list.append({
        "Question": question,
        "Expected Answer": expected_answer,
        "Actual Answer": actual_answer
    })

# Convert to DataFrame for better visualization
df = pd.DataFrame(results_list)

# Display as a table
display(df)

Unnamed: 0,Question,Expected Answer,Actual Answer
0,Pila ang kita nga nakolekta sa Kapitolyo gikan...,P303 milyunes,P628 milyunes
1,Unsa ang gibuhat sa biktima sa dili pa siya gi...,nagpalit lang og ice water,nagpalit lang og ice water
2,Unsa ang gibuhat sa Mandaue City Government ar...,pag-integrate sa mental health care ngadto sa ...,"kritikal nga papel isip triage area,"
3,Kinsa ang gipanglantawan nga mobisita sa econo...,Russian President Bumagat Dioyo,Russian President Bumagat Dioyo
4,Unsa ang nahitabo kang Inchak Lydwena tungod s...,nilupad lusot sa windshield,nilupad lusot sa windshield
5,Unsang petsa ug oras nahitabo ang buy-bust sa ...,"alas 8:10 sa Biyernes Santo sa gabii, Marso 29...","Marso 29, 2024"
6,Nganong gihimo ni Baliwan ang krimen?,tungod usab sa iyang selos,tungod usab sa iyang selos
7,Unsa ang gibutyag ni Acabal bahin sa ilang kol...,ila usab nga gipauswag ang ilang koleksyon sa ...,"gipauswag ang ilang koleksyon sa basura,"
8,Kinsa ang mayor sa Lapu-Lapu?,Junard Abalos,Junard Abalos
9,Unsa ang gibalaod ni Espiras nga mahitabo sa e...,domino effect,dili na mangayo og usbaw sa plitehan.
