<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta_body.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [1]:
!pip install evaluate
!pip install -U datasets huggingface_hub fsspec

# !pip install optuna

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.31.2-py3-none-any.whl.metadata (13 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.31.2-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.2/484.2 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py

In [2]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
# import optuna
import numpy as np
import unicodedata
from collections import defaultdict


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Constants**

In [4]:
CEBQA_DATASET = "jhoannarica/cebquad_split"
BERT_MODEL = "xlm-roberta-large"
DRIVE_ROOT = "/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "logs"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [5]:
def timestamp():
  return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [6]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/533 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/6.10M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/11.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19300 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5597 [00:00<?, ? examples/s]

# **Prepare Dataset**

## Prepare tokenizer

## Tokenize

In [7]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained(BERT_MODEL)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

In [8]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "article_body" in example and example["answer"]

def filter_by_token_length(example):
    # Tokenize the concatenated question + article_body
    tokens = tokenizer(example["question"], example["article_body"], truncation=False)
    return len(tokens["input_ids"]) <= 512

def decode_error(example):
  input_ids = example["input_ids"]
  start_positions = example["start_positions"]
  end_positions = example["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions+1]
  return tokenizer.decode(predict_answer_tokens) == example["answer"]

In [9]:
def tokenize_train_function(examples):
    article_text = [article for article in examples.get("article_body", [""])]
    context_text = [context for context in examples.get("context", ["{}"])]
    answer_text = examples.get("answer", [""])
    answer_start = examples.get("answer_start", [0])
    context_start_list = examples.get("context_start", [0])
    question_text = [q for q in examples.get("question", [""])]
    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        article_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=False,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    # sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        answer = answer_text[i]
        context = context_text[i]
        article = article_text[i]
        start_char = int(context_start_list[i]) + int(answer_start[i])
        end_char = start_char + len(answer)


        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions


    return inputs


In [10]:
def normalize_text(examples):
    examples["context"] = [unicodedata.normalize("NFKC", context) for context in examples["context"]]

    examples["article_body"] = [unicodedata.normalize("NFKC", body) for body in examples["article_body"]]

    examples["answer"] =  [unicodedata.normalize("NFKC", answer) for answer in examples["answer"]]

    examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

    return examples


In [11]:
dataset["train"]

Dataset({
    features: ['id', 'article_id', 'article_title', 'article_body', 'question', 'context', 'answer', 'context_start', 'context_end', 'answer_start', 'answer_end'],
    num_rows: 19300
})

In [12]:
# Clean and tokenize the dataset
tokenized_train_dataset = dataset["train"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)



Filter:   0%|          | 0/19300 [00:00<?, ? examples/s]

Map:   0%|          | 0/19300 [00:00<?, ? examples/s]

Map:   0%|          | 0/19300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19300 [00:00<?, ? examples/s]

In [13]:
# Clean and tokenize the dataset
tokenized_validation_dataset = dataset["validation"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Filter:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2732 [00:00<?, ? examples/s]

In [14]:
# Clean and tokenize the dataset
tokenized_test_dataset = dataset["test"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True)\
  .filter(decode_error)


Filter:   0%|          | 0/5597 [00:00<?, ? examples/s]

Map:   0%|          | 0/5596 [00:00<?, ? examples/s]

Map:   0%|          | 0/5596 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5596 [00:00<?, ? examples/s]

In [15]:
print(tokenized_train_dataset[0])
print(tokenized_train_dataset[1])
print(tokenized_train_dataset[2])
print(tokenized_train_dataset[3])

{'id': '00022-001', 'article_id': 22, 'article_title': 'Brody Abando, Cedie Abangan ‘gitaktak’ sa ilang serbisyo', 'article_body': 'Ang buhatan sa Ombudsman mi-dismiss sa mayor sa Mandaue City ug Cebu City sa ilang serbisyo. Si Mandaue City Mayor Brody Abando gitaktak sa iyang serbisyo human nasuta nga sad-an sa kasong grave misconduct sa dihang mitugot nga makapadayon sa operasyon ang usa ka cement batching plant nga way business ug environmental permits. Samtang si suspended Cebu City Mayor Cedie Abangan, nasuta sab sa Ombudsman nga sad-an sa nepotism ug grave misconduct human gitugotan ang duha ka igsuon sa iyang asawa nga makatrabaho sa Cebu City Hall. Ang kaso ni Abando base sa reklamo nga giduso niadtong Oktubre 2022 sa dihang ang mga reklamante nga sila si Princesa Acuba Acuna ug Lita Ada mibutyag nga way gihimo ang mayor ngadto sa mga nahimong kalapasan sa Suprea Phils. Development Corp. ’s nga matod pa naghatag og peligro sa panglawas ug kinaiyahan. Ang planta sa Suprea nga na

In [16]:
print(f"train: {len(tokenized_train_dataset)} validate: {len(tokenized_validation_dataset)} test: {len(tokenized_test_dataset)} ")

train: 14436 validate: 2025 test: 4168 


## Dataset Splitting

In [17]:
train_dataset = tokenized_train_dataset
val_dataset = tokenized_validation_dataset
test_dataset = tokenized_test_dataset

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")
# train_dataset[0]
# len(val_dataset)


train: 14436 
val: 2025 
test: 4168


In [18]:
def test_decode(dataset, idx):
  input_ids = dataset[idx]["input_ids"]
  start_positions = dataset[idx]["start_positions"]
  end_positions = dataset[idx]["end_positions"]
  predict_answer_tokens = input_ids[start_positions : end_positions+1]
  return tokenizer.decode(predict_answer_tokens), dataset[idx]["answer"]

# idx = 12
# print(test_decode(train_dataset, idx))
# print(test_decode(val_dataset, idx))
error_id = []
for idx, train in enumerate(val_dataset.select(range(10))):
    decoded, orig = test_decode(val_dataset, idx)
    print(decoded)
    print(orig)
    # if decoded != orig:
        # print(f"idx: {train_dataset[idx]['id']}")
        # print(val_dataset[idx]['id'])
        # print(val_dataset[idx]['start_positions'], val_dataset[idx]['end_positions'])
        # if not (val_dataset[idx]['start_positions'] == 0 and val_dataset[idx]['end_positions']== 0):
        #     error_id.append(val_dataset[idx]['id'])

    # if idx == 100:
    #   break
# print(len(error_id))

# error_id = []
# for idx, train in enumerate(train_dataset):
#     decoded, orig = test_decode(train_dataset, idx)
#     if decoded != orig:
#         # print(f"idx: {train_dataset[idx]['id']}")
#         # print(val_dataset[idx]['id'])
#         # print(val_dataset[idx]['start_positions'], val_dataset[idx]['end_positions'])
#         # if not (train_dataset[idx]['start_positions'] == 0 and train_dataset[idx]['end_positions']== 0):
#         error_id.append(train_dataset[idx]['id'])

#     # if idx == 100:
#     #   break
# print(len(error_id))

Purok 7, Brgy. Africa Sur, lungsod sa Loon
Purok 7, Brgy. Africa Sur, lungsod sa Loon
Gelou Agan alyas Jomarie
Gelou Agan alyas Jomarie
Philippine Drug Enforcement Agency (PDEA) Bohol Provincial Office uban sa Bohol Maritime Police, Loon Municipal Police Station ug Coast Guard Intelligence Unit-Bohol
Philippine Drug Enforcement Agency (PDEA) Bohol Provincial Office uban sa Bohol Maritime Police, Loon Municipal Police Station ug Coast Guard Intelligence Unit-Bohol
9 ka mga putos
9 ka mga putos
Lowell Aganan, ug iyang pag-umangkon nga si Emeterio Aganan
Lowell Aganan, ug iyang pag-umangkon nga si Emeterio Aganan
Republic Act 9165 o Comprehensive Dangerous Drugs Act of 2002
Republic Act 9165 o Comprehensive Dangerous Drugs Act of 2002
Ason Agapay
Ason Agapay
Gelou Agan alyas Jomarie
Gelou Agan alyas Jomarie
12 gramos
12 gramos
Intelligence Unit
Intelligence Unit


In [19]:
print(len(error_id))

0


In [20]:
print(error_id)

[]


# **Model Training**

## Compute Metrics

In [None]:
metric = load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    decoded_preds = [
        tokenizer.decode(input_ids[start : end+1])
        for input_ids, start, end in zip(val_dataset["input_ids"], start_preds, end_preds)
    ]

    decoded_labels = [
        tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True)
        for input_ids, start, end in zip(val_dataset["input_ids"], labels[0], labels[1])
    ]
    pred = [{"prediction_text": pred, "id": str(i)} for i, pred in enumerate(decoded_preds)]
    ref = [{"answers": {"text": [label], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]

    results = metric.compute(
        predictions=pred,
        references=ref
    )
    sentence_match_scores = [
        p['prediction_text'] in r['answers']['text'][0] for p, r in zip(pred, ref)
    ]

    # Compute average sentence match score
    avg_sentence_match = np.mean(sentence_match_scores)

    res = {
        "exact_match": results["exact_match"],
        "f1": results["f1"],
        "sentence_match": float(avg_sentence_match ) * 100
    }

    return res



Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

## Finetuning

In [None]:
def finetune_xlmr(
    model_path = BERT_MODEL,
    checkpoint_path = None,
    learning_rate = 1e-5,
    batch_size = 8,
    num_train_epochs = 1,
    weight_decay = 0.01
    ):
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_path)

    # Early stopping parameters
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.1
    )

    batch_timestamp = timestamp()
    print(batch_timestamp)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=2,
        bf16=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

      # Train and evaluate the model
    trainer.train(
        resume_from_checkpoint=last_checkpoint if last_checkpoint else None
        )
    eval_results = trainer.evaluate()

    res_model_path = get_model_directory(batch_timestamp)
    res_tokenized_path = get_tokenizer_directory(batch_timestamp)

    model.save_pretrained(res_model_path)
    tokenizer.save_pretrained(res_tokenized_path)

    return model, tokenizer, eval_results, res_model_path, res_tokenized_path

## Optuna

In [None]:

# Suggest values for hyperparameters in Optuna
# learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
# batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
# num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
# weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)

study = optuna.create_study(direction="maximize")
study.optimize(finetune_xlmr, n_trials=1)

# Get the best trial
best_trial = study.best_trial
# Print best trial number and its hyperparameters
print(f"Best Trial: {best_trial.number}")
print("Best Hyperparameters:", best_trial.params)
print(f"Best F1 Score: {best_trial.value:.4f}")

[I 2025-03-10 13:44:01,276] A new study created in memory with name: no-name-525be363-eb88-442a-80fd-1fb72ab389ac


2025-03-10_13-44-0


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)


Epoch,Training Loss,Validation Loss


## Start Training

In [None]:
model, tokenizer, eval_results, res_model_path, res_tokenized_path = finetune_xlmr(
    num_train_epochs  = 5,
    batch_size = 16,
)
print(eval_results)
print(res_model_path)
print(res_tokenized_path)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2025-05-06_07-03




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjhoanna-bposeats[0m ([33mjhoanna-bposeats-bposeats[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Exact Match,F1,Sentence Match
1,2.1535,1.81181,37.679012,56.86319,59.111111
2,1.5557,1.376453,48.148148,68.050134,67.802469
3,1.3532,1.309681,51.555556,70.65507,68.246914
4,1.1014,1.312955,51.901235,70.942949,69.037037


Epoch,Training Loss,Validation Loss,Exact Match,F1,Sentence Match
1,2.1535,1.81181,37.679012,56.86319,59.111111
2,1.5557,1.376453,48.148148,68.050134,67.802469
3,1.3532,1.309681,51.555556,70.65507,68.246914
4,1.1014,1.312955,51.901235,70.942949,69.037037
5,0.8683,1.318668,51.901235,71.301167,69.135802


{'eval_loss': 1.3186675310134888, 'eval_exact_match': 51.901234567901234, 'eval_f1': 71.3011672171845, 'eval_sentence_match': 69.1358024691358, 'eval_runtime': 4.491, 'eval_samples_per_second': 450.899, 'eval_steps_per_second': 28.279, 'epoch': 5.0}
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-05-06_07-03/model
/content/drive/MyDrive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-05-06_07-03/tokenizer


# **Evaluating the model**

## Normalizing predicted answer

In [21]:
def normalize_row(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

## Evaluating

In [22]:
def evaluate(model, tokenizer, dataset, use_llama = False, model_outputs=None):
  print(f"Started evaluation.")

  if model_outputs is None:
      if use_llama:

        qa_pipeline = pipeline(
            task="text-generation",
            model="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
            )

        print("Generated LLaMA text-generation pipeline.")
        print("Starting QA generation...")

        prompts = [
            f"""Answer the question based on the context.

              Context:
              {sample["article_body"]}

              Question:
              {sample["question"]}

              Answer:
              """
              for sample in dataset
        ]
        print(prompts)

        # Batched generation (can be memory-intensive)
        model_outputs = qa_pipeline(prompts, max_new_tokens=100)

      else:
        qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

        print("Generated QA Pipeline.")
        print("Starting QA Pipeline batch.")
        qa_dataset = Dataset.from_dict({
          "question": [sample["question"] for sample in dataset],
          "context": [sample["article_body"] for sample in dataset]
        })

        model_outputs = qa_pipeline(qa_dataset)

  print(f"Batched QA done. {len(model_outputs)}")
  print(f"Computing metrics.")
  print("Before pred")
  print(model_outputs[0])
  pred = [
      {
          'id': str(i+1),  # Convert ID to string
          'prediction_text': normalize_row(output['answer'])
      }
      for i, output in enumerate(model_outputs)
  ]
  print("Before ref")
  ref = [
      {
          'id': str(i+1),  # Convert ID to string
          'answers': {
              'text': normalize_row(row['answer']) if isinstance(row['answer'], list) else ([normalize_row(row['answer'])]),
              'answer_start': row['answer_start'] if isinstance(row['answer_start'], list) else [row['answer_start']]
          }
      }
      for i, row in enumerate(dataset)
  ]

  # Load SQuAD metric
  metric = load("squad")

  # Compute metric
  res = metric.compute(predictions=pred, references=ref)
  print(f"Computing metrics done.")

  # Sentence match
  sentence_match_scores = [
      p['prediction_text'] in r['answers']['text'][0] for p, r in zip(pred, ref)
  ]

  # Compute average sentence match score
  avg_sentence_match = np.mean(sentence_match_scores)

  # Combine results
  res["sentence_match"] = float(avg_sentence_match ) * 100

  return res, pred, ref, model_outputs


In [23]:
# useExisting = False
# if useExisting or (model == None and tokenizer == None):
#     print("Loading Previous")
#     model_path = DRIVE_ROOT+"/2025-04-01_05-56/model"
#     tokenizer_path = DRIVE_ROOT+"/2025-04-01_05-56/tokenizer"
#     model = AutoModelForQuestionAnswering.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

eval_results = evaluate(
    model = XLMRobertaForQuestionAnswering.from_pretrained(BERT_MODEL),
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(BERT_MODEL),
    dataset = test_dataset,
    use_llama = True
)

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started evaluation.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Device set to use cuda:0


Generated LLaMA text-generation pipeline.
Starting QA generation...


Token indices sequence length is longer than the specified maximum sequence length for this model (2050 > 2048). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


KeyboardInterrupt: 

## **Reminder**

1. Save Data to [spreadsheet](https://docs.google.com/spreadsheets/d/1Xc3-6yVMMLoXCqId-YQFyOvgQfemIQ8P2uLYC6Jszeo/edit?gid=0#gid=0)
2. Save WANDB chart to [Google Drive](https://drive.google.com/drive/u/0/folders/1inDiei-xuRlofFPJmVj8OS6pmyQdGQ2z) of the corresponding model.
3. Change runtime after.

In [None]:
res, pred, ref, model_outputs = eval_results

display(pd.DataFrame(res, index=[0]))




Unnamed: 0,exact_match,f1,sentence_match
0,0.119962,5.472313,1.247601


In [None]:
pred_answers = [ans["prediction_text"] for ans in pred]
ref_answers = [ans["answers"]["text"][0] for ans in ref]

df = pd.DataFrame({
    "Predicted Answer": pred_answers,
    "Reference Answer": ref_answers
})


display(df)

Unnamed: 0,Predicted Answer,Reference Answer
0,talisay ubos sa kamanduan sa ilang,50 gramos
1,talisay ubos sa kamanduan sa ilang,p340 000
2,talisay ubos sa kamanduan sa ilang,gammy
3,talisay ubos sa kamanduan sa ilang,25
4,talisay ubos sa kamanduan sa ilang,enan
...,...,...
4163,transport group nga sv3 niingon,land transportation franchising and regulatory...
4164,transport group nga sv3 niingon,impulse irrational thinking misjudgment poor d...
4165,transport group nga sv3 niingon,duha
4166,transport group nga sv3 niingon,elopre manilag iii
