<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: XLM-RoBERTa (`xlm-roberta-base`)

# **Libraries**

In [55]:
!pip install datasets



In [74]:
import pandas as pd
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback
from huggingface_hub import login
import datetime
from google.colab import drive
# Mount Google Drive; DRIVE_ROOT (where run outputs are written) lives under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): this cell repeats the import and Drive mount already performed in the
# imports cell above; it looks redundant on a fresh Restart & Run All — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

# **Constants**

In [99]:
# Dataset identifier passed to `load_dataset`.
CEBQA_DATASET = "jhoannarica/cebquad"
# Root folder on the mounted Google Drive; each run writes into DRIVE_ROOT/<BATCH_TIMESTAMP>/.
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa/xlmr"
# Leaf folder names appended to the per-run folder by the get_*_directory helpers.
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [98]:
# Name of the per-run sub-folder read by the get_*_directory helpers at call time.
# Starts empty and is reassigned from timestamp() right before TrainingArguments is built.
BATCH_TIMESTAMP = ""
def timestamp():
  """Return the current date and hour as a string shaped like ``YYYY-MM-DD_HH``."""
  pattern = "%Y-%m-%d_%H"
  current_moment = datetime.datetime.now()
  return current_moment.strftime(pattern)

def get_output_directory():
  """Build ``DRIVE_ROOT/BATCH_TIMESTAMP/OUTPUT_DIRECTORY`` from the current module-level values."""
  path_parts = (DRIVE_ROOT, BATCH_TIMESTAMP, OUTPUT_DIRECTORY)
  return "{}/{}/{}".format(*path_parts)

def get_logs_directory():
  """Build ``DRIVE_ROOT/BATCH_TIMESTAMP/LOGS_DIRECTORY`` from the current module-level values."""
  path_parts = (DRIVE_ROOT, BATCH_TIMESTAMP, LOGS_DIRECTORY)
  return "{}/{}/{}".format(*path_parts)

def get_model_directory():
  """Build ``DRIVE_ROOT/BATCH_TIMESTAMP/MODEL_DIRECTORY`` from the current module-level values."""
  path_parts = (DRIVE_ROOT, BATCH_TIMESTAMP, MODEL_DIRECTORY)
  return "{}/{}/{}".format(*path_parts)

def get_tokenizer_directory():
  """Build ``DRIVE_ROOT/BATCH_TIMESTAMP/TOKENIZER_DIRECTORY`` from the current module-level values."""
  path_parts = (DRIVE_ROOT, BATCH_TIMESTAMP, TOKENIZER_DIRECTORY)
  return "{}/{}/{}".format(*path_parts)

In [87]:
# Sanity check: display the string timestamp() produces for the current hour.
timestamp()

'2025-02-19_07'

# **Loading Dataset**

## Access dataset

In [58]:
# Load the dataset identified by CEBQA_DATASET; the result is indexed by split name below.
dataset = load_dataset(CEBQA_DATASET)

In [None]:
# Peek at a single training record to see which fields each example carries.
dataset["train"][120]

{'id': '01529-002',
 'article_id': 1529,
 'article_title': 'Tourist van nahulog kay driver nakatulog',
 'article_body': 'Nangalandig sa emergency room sa Badian District Hospital sa Brgy. Poblacion, Badian, habagatang Sugbo, ang upat ka mga turista ug drayber sa van nga ilang gisakyan human naaksidente sa alas 3:40 sa kaadlawon sa Biyernes, Nobiyembre 17, 2023, sa Brgy. Poblacion. Ang drayber nakatulog kay hayan lapoy pa kini sa iyang kapin sa 100 ka kilometro nga biyahe sa amihanang Sugbo. Hinuon minor injuries lang ang naangkon sa mga biktima busa nakagawas ra dayon sa ospital human matambali ug mahiling. Basi sa nakuhang kasayuran sa Superbalita sa Cebu gikan sa kasaligang tinubdan, nailhan ang mga biktima nga turista nga puro taga San Antonio, Tondo, Manila, nga sila si Antonietto Avila Libunao, 64, minyo; iyang asawa nga si Carmen Pacione; Lorence Pacis Paclibon , 40, minyo; ug anak niini nga si Pacomios Pacis Paclibon, 5. Samtang ang drayber nga naangol giila nga si Emeniano Jorg

In [60]:
# Find the training record whose article body plus question is longest.
# Length is measured in characters, not tokens.
# Initialize variables to track the longest article
longest_article = None
max_length = 0

# Iterate through each record in the train split
for article in dataset["train"]:
    # Concatenate article_body and question
    # (note: tokenize_function later feeds context["text"], not article_body, to the tokenizer)
    combined_text = article["article_body"] + article["question"]

    # Calculate the character length of the combined text
    combined_length = len(combined_text)

    # Update if this record is the longest found so far (ties keep the earliest one)
    if combined_length > max_length:
        max_length = combined_length
        longest_article = article

# Print the longest record and its combined character length
print(f"Longest combined article length: {max_length}")
print(f"Longest article: {longest_article}")


Longest combined article length: 5911
Longest article: {'id': '00127-003', 'article_id': 127, 'article_title': 'Senado tensiyonado atol sa pag-imbestigar ni Balderas', 'article_body': 'Puno sa tensyon atol sa imbestigasyon sa Senado sa gi-raid nga ilegal nga Pogo hub niadtong Lunes, Septiyembre 9, 2024, tungod kay ang mga magbabalaod nangasuko ni dismissed Bamban, Tarlac Mayor Aretha Balderas, kinsa nagdumili sa pagtubag sa ilang mga pangutana. Samtang ang mga magbabalaod nangutana kung giunsa niya ug ang pipila ka mga miyembro sa iyang pamilya mibiya sa nasod, si Balderas nagdumili sa paghingalan sa tawo nga nagpahigayon sa ilang pag-ikyas, tungod sa kahadlok sa iyang kinabuhi. Gisuwat hinuon niya ang ngalan sa tawo sa usa ka papel sa hangyo ni Senate President Pro Tempore Stanford Baldomar. Gihangyo ni Balderas ang mga magbabalaod nga dili isulti og kusog ang ngalan. "Do not tell the senators what to do with the information. Pinagbibigyan ka namin isulat sa papel," matod ni Senate Co

# **Prepare Dataset**

## Prepare tokenizer

In [61]:
# Tokenizer for the same "xlm-roberta-base" checkpoint that the model cell loads.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
# special_tokens = {"additional_special_tokens": ["<q>", "<c>"]}

# # Add the new tokens to the tokenizer's vocabulary
# tokenizer.add_special_tokens(special_tokens)

# special_tokens = tokenizer.special_tokens_map

# Print each special token and its corresponding ID
# for token_name, token_value in special_tokens.items():
#     token_id = tokenizer.convert_tokens_to_ids(token_value)
#     print(f"{token_name}: '{token_value}' -> {token_id}")

In [None]:
# Show the tokenizer's maximum sequence length; tokenize_function truncates to 512 tokens.
print(tokenizer.model_max_length)


512


## Tokenize

In [62]:
def filter_incomplete_examples(example):
    """Return True only for examples that have a question, a context and an answer.

    An example is kept when ``example["question"]``, ``example["context"]["text"]``
    and ``example["answer"]["text"]`` are all present and non-empty. Missing keys,
    or ``context``/``answer`` values that are not dicts, mark the example as
    incomplete instead of raising.

    Args:
        example: A single (non-batched) dataset row.

    Returns:
        bool: True when the example is complete, False otherwise.
    """
    def has_text(field):
        # The field must exist, be a dict, and carry a non-empty "text" entry.
        if field not in example:
            return False
        value = example[field]
        return isinstance(value, dict) and bool(value.get("text"))

    has_question = "question" in example and bool(example["question"])
    return has_question and has_text("context") and has_text("answer")

# print(len(dataset["train"]))
# print(len(dataset.filter(filter_incomplete_examples)["train"]))
# print(len(dataset["test"]))
# print(len(dataset.filter(filter_incomplete_examples)["test"]))
# print(len(dataset["validation"]))
# print(len(dataset.filter(filter_incomplete_examples)["validation"]))

In [63]:
# # Tokenize function
# def tokenize_function(data):
#     input = f"<q>{data['question']}<c>{data['context']['text']}"
#     return tokenizer(input, padding="max_length", truncation=True)

# # Apply tokenization to the dataset
# tokenized_dataset = dataset.map(tokenize_function, batched=False)

def _first_context_token(offsets, context_range, char_pos, half_open=False):
    """Return the index of the first context token whose span covers ``char_pos``.

    Only indices in ``context_range`` are examined. Question, special and padding
    tokens carry offsets that are not positions in the context string, so they
    must never be matched.

    Args:
        offsets: Sequence of ``(start, end)`` character spans, one per token.
        context_range: Iterable of token indices that belong to the context.
        char_pos: Character position to locate.
        half_open: When True, prefer a token with ``start <= char_pos < end`` over
            one that merely ends at ``char_pos``; the inclusive match is used only
            as a fallback.

    Returns:
        The matching token index, or None when no context token covers ``char_pos``.
    """
    inclusive_match = None
    for idx in context_range:
        token_start, token_end = offsets[idx]
        if not token_start <= char_pos <= token_end:
            continue
        if not half_open or char_pos < token_end:
            return idx
        if inclusive_match is None:
            # Token only touches char_pos with its end; remember it as a fallback.
            inclusive_match = idx
    return inclusive_match


def tokenize_function(examples, max_length=512, stride=128):
    """Tokenize a batch of question/context pairs and label the answer span.

    Each question is paired with ``context["text"]``; only the context is truncated,
    and long contexts overflow into extra windows that overlap by ``stride`` tokens.
    For every produced window the answer's character span is mapped to token
    indices. Windows with an empty answer, or whose context tokens do not fully
    cover the answer, are labelled ``(0, 0)``.

    NOTE(review): assumes ``answer["start"]``/``answer["end"]`` are character
    offsets into ``context["text"]`` (with ``end`` exclusive) — TODO confirm against
    the dataset.

    Args:
        examples: Batched rows providing ``question``, ``context`` and ``answer``.
        max_length: Maximum number of tokens per window (every window is padded to it).
        stride: Number of overlapping tokens between consecutive windows.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions`` added.

    Raises:
        ValueError: If an in-bounds answer boundary cannot be mapped to a context token.
    """
    context_text = [context.get("text", "") for context in examples.get("context", [{}])]
    question_text = examples.get("question", [""])

    tokenized_examples = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=max_length,     # Limit input length
        stride=stride,             # Overlap between sliding windows
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    # Lists to store calculated start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # A window may come from any example in the batch; look up its answer.
        answer = examples["answer"][sample_mapping[i]]

        # Handle missing or empty answers
        if len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Get the answer's start and end character positions
        start_char = answer["start"]
        end_char = answer["end"]

        # Sequence ids are None for special/padding tokens, 0 for the question
        # and 1 for the context.
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Identify the first and last context token of this window
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # The answer is not fully inside this window: label it as "no answer".
        if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Search the context tokens only. Scanning from index 0 would let the
        # question tokens (whose offsets refer to the question string) and the
        # (0, 0) special tokens match first and produce wrong labels.
        context_range = range(context_start, context_end + 1)
        start_token = _first_context_token(offsets, context_range, start_char, half_open=True)
        end_token = _first_context_token(offsets, context_range, end_char)

        if start_token is None:
            raise ValueError("Start character position not found in token offsets.")

        if end_token is None:
            raise ValueError("End character position not found in token offsets.")

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add start and end positions to the tokenized examples
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Tokenize the dataset: drop incomplete examples first, then tokenize in batches.
# NOTE(review): the original columns are kept alongside the tokenizer output; if any
# example overflowed into several windows the row counts would presumably stop lining
# up — confirm, or pass remove_columns to map().
tokenized_dataset = dataset.filter(filter_incomplete_examples).map(tokenize_function, batched=True)


Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

In [64]:
# Inspect the schema after tokenization (original columns plus tokenizer outputs and labels).
tokenized_dataset["train"].features

{'id': Value(dtype='string', id=None),
 'article_id': Value(dtype='int64', id=None),
 'article_title': Value(dtype='string', id=None),
 'article_body': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'context': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'answer': {'end': Value(dtype='int64', id=None),
  'start': Value(dtype='int64', id=None),
  'text': Value(dtype='string', id=None)},
 'context_start': Value(dtype='int64', id=None),
 'context_end': Value(dtype='int64', id=None),
 'answer_start': Value(dtype='int64', id=None),
 'answer_end': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'offset_mapping': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'overflow_to_

## Dataset Splitting

In [65]:
# No new split is computed here: pick the tokenized "train" and "validation" splits as-is.
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]

# Display the training split summary (columns and row count).
train_dataset

Dataset({
    features: ['id', 'article_id', 'article_title', 'article_body', 'question', 'context', 'answer', 'context_start', 'context_end', 'answer_start', 'answer_end', 'input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
    num_rows: 19340
})

# **Model Training**

## Load Pre-Trained XLM-RoBERTa

In [66]:
# Load the "xlm-roberta-base" checkpoint into the question-answering model class.
model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Early Stopping

In [75]:
# Early stopping parameters
# The callback is handed to the Trainer below; with eval_strategy="epoch" in the
# training arguments, one "evaluation" corresponds to one epoch.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Number of evaluations with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum change in the metric to qualify as an improvement
)

## Training Arguments

In [103]:
# Stamp this run: the get_*_directory helpers read BATCH_TIMESTAMP at call time, so
# every path built from here on lands in a folder named after the current date and hour.
BATCH_TIMESTAMP = timestamp()
training_args = TrainingArguments(
    output_dir = get_output_directory(),
    eval_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # upper bound; the early-stopping callback may end training sooner
    weight_decay=0.01,
    logging_dir=get_logs_directory(),
    logging_steps=10,
    save_total_limit=2,
)

## Trainer

In [104]:
# Wire the model, training arguments, data splits and early stopping into a Trainer.
# NOTE(review): no compute_metrics is supplied, so evaluation presumably reports the
# loss only — confirm whether exact-match / F1 should be added.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback]
)


## Training and saving the fine-tuned model

In [None]:
# Fine-tune the model; TrainingArguments points checkpoints at get_output_directory().
trainer.train()
# Persist the final weights and the tokenizer in their own run-specific folders.
# The tokenizer goes to get_tokenizer_directory() (not the checkpoint folder) so the
# saved artefacts match the MODEL_DIRECTORY / TOKENIZER_DIRECTORY layout.
model.save_pretrained(get_model_directory())
tokenizer.save_pretrained(get_tokenizer_directory())

# **Evaluating the model**

## Evaluating

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the validation split) and show the metrics.
results = trainer.evaluate()
print(results)

## Inference

In [None]:
# Extractive QA inference: the model scores every token as a possible answer start and
# answer end, so its output exposes `start_logits` / `end_logits` (there is no single
# `logits` tensor and no "class" to predict).
question = "Your question here"
context = "Your context here"

model.eval()  # disable dropout so repeated runs give the same prediction
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",  # never cut the question, only the context
    max_length=512,
).to(model.device)  # the model may have been moved to the GPU during training
outputs = model(**inputs)

# Most likely start / end token indices for the single example in the batch.
start_index = outputs.start_logits.argmax(dim=-1).item()
end_index = outputs.end_logits.argmax(dim=-1).item()

# An end before the start means no valid span was found; report an empty answer.
if end_index >= start_index:
    answer_ids = inputs["input_ids"][0, start_index:end_index + 1]
else:
    answer_ids = []
predicted_answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
print(f"Predicted answer: {predicted_answer!r}")