<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [1]:
!pip install datasets
!pip install evaluate
!pip install optuna

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import optuna
import numpy as np
import unicodedata






In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Constants**

In [3]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/Shareddrives/cebqa_roberta/xlmr"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [5]:
def timestamp(append):
  return datetime.datetime.now().strftime("%Y-%m-%d_%H")+"-"+str(append)

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [6]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.arrow:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

validation.arrow:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

test.arrow:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2763 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5526 [00:00<?, ? examples/s]

In [7]:
# Initialize variables to track the longest article
longest_article = None
max_length = 0

# Iterate through each article in the train dataset
for article in dataset["train"]:
    # Concatenate article_body and context
    combined_text = article["context"]["text"] + article["question"]

    # Calculate the length of the combined text
    combined_length = len(combined_text)

    # Update if this article is the longest found so far
    if combined_length > max_length:
        max_length = combined_length
        longest_article = article

# Print the longest article and its length
print(f"Longest combined article length: {max_length}")
print(f"Longest article: {longest_article}")


Longest combined article length: 1295
Longest article: {'id': '01467-009', 'article_id': 1467, 'article_title': 'Trabaho sa CBRT project sa N. Bacalso hayan malangay', 'article_body': 'Hayan malangay ang pagtrabaho sa Cebu Bus Rapid Transit (CBRT) project subay sa N. Bacalso Avenue, dakbayan sa Sugbo tungod sa pagdumili sa Traffic Management and Coordination Committee (TMCC) sa pag-aprubar sa pagbutang og mga board-up nga gikinahanglan sa pagsugod sa ilang pagtrabaho sa dapit. Ang CBRT project contractor nangayo og clearance gikan sa TMCC alang sa pagbutang og board-up gikan sa eskina sa dalan P. Aguinaldo Emelito paingon sa eskina sa dalan Leon Kilat ug N. Bacalso Avenue atol sa TMCC meeting niadtong Huwebes sa hapon, Nobiyembre 23, 2023. Ang board-up maoy gamiton sa pagkoral sa bahin sa dalan sa ilang pagtrabaho sa proyekto sa naasoy nga dapit. Si Cebu City Councilor Jeff Abayon, TMCC chairman, niingon sa usa ka pamahayag sa Biyernes, Nobiyembre 24, 2023, nga ang komite nagduha-duha 

# **Prepare Dataset**

## Prepare tokenizer

In [8]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

## Tokenize

In [9]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    return "question" in example and example["question"] and \
           "context" in example and "text" in example["context"] and \
           example["context"]["text"] and example["answer"]["text"]


In [22]:
def tokenize_train_function(examples):
    context_text = [context.get("text", "").strip() for context in examples.get("context", [{}])]
    answer_text = examples.get("answer", [{}])
    question_text = [q.strip() for q in examples.get("question", [""])]

    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answer_text[sample_idx]
        start_char = answer["start"]
        end_char = answer["start"] + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [21]:
def tokenize_validation_function(examples):
    context_text = [context.get("text", "").strip() for context in examples.get("context", [{}])]
    answer_text = examples.get("answer", [{}])
    question_text = [q.strip() for q in examples.get("question", [""])]

    start_positions = []
    end_positions = []

    inputs = tokenizer(
        question_text,
        context_text,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [12]:
def normalize_text(examples):
    examples["context"] = [
        {
            "text": unicodedata.normalize("NFKC", ctx["text"]),
            "start": ctx["start"],
            "end": ctx["end"]
        }
        for ctx in examples["context"]
    ]

    examples["answer"] = [
        {
            "text": unicodedata.normalize("NFKC", ans["text"]),
            "start": ans["start"],
            "end": ans["end"]
        }
        for ans in examples["answer"]
    ]

    examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

    return examples


In [50]:
# Clean and tokenize the dataset
tokenized_train_dataset = dataset["train"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True, remove_columns=dataset["train"].column_names)


In [53]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 19340
})

In [51]:
# Clean and tokenize the dataset
tokenized_validation_dataset = dataset["validation"].filter(filter_incomplete_examples) \
  .map(normalize_text, batched=True) \
  .map(tokenize_train_function, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

In [52]:
tokenized_validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 2762
})

In [62]:
inputs = tokenized_train_dataset[0]
start = inputs["start_positions"]
end = inputs["end_positions"]
print(start, end)
labeled_answer = tokenizer.decode(inputs["input_ids"][start : end + 1])
print(labeled_answer)

51 64
kon kanus-a o ang petsa sa pagkuha sa video


## Dataset Splitting

In [54]:
train_dataset = tokenized_train_dataset.select(range(10))
val_dataset = tokenized_validation_dataset.select(range(2))
test_dataset = dataset["train"]

print(f"train: {train_dataset.num_rows} \nval: {val_dataset.num_rows} \ntest: {test_dataset.num_rows}")

train: 10 
val: 2 
test: 19340


# **Model Training**

## Load Pre-Trained RoBERTa

In [17]:
model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Early Stopping

In [18]:
# Early stopping parameters
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

## Training Argument

In [75]:
metric = load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_preds = np.argmax(predictions[0], axis=1)
    end_preds = np.argmax(predictions[1], axis=1)

    decoded_preds = [
        tokenizer.decode(input_ids[start : end+1])
        for input_ids, start, end in zip(val_dataset["input_ids"], start_preds, end_preds)
    ]

    decoded_labels = [
        tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True)
        for input_ids, start, end in zip(val_dataset["input_ids"], labels[0], labels[1])
    ]

    results = metric.compute(
        predictions=[{"prediction_text": pred, "id": str(i)} for i, pred in enumerate(decoded_preds)],
        references=[{"answers": {"text": [label], "answer_start": [0]}, "id": str(i)} for i, label in enumerate(decoded_labels)]
    )

    return {
        "exact_match": results["exact_match"],
        "f1": results["f1"]
    }



In [None]:
val_dataset[0]

{'id': '01543-003',
 'article_id': 1543,
 'article_title': 'Japanese music video, nakig-collab sa Pinoy artists, midaog na sab',
 'article_body': 'ANG siyudad sa Hadano, Kanagawa Prefecture, Japan mipahigayon sa ilang 2nd Hadastragram movie contest niadtong Nobiyembre 6, 2023. Moabot ngadto sa 195 ka entry nga gisalmot diin ang "Bathroom Orchestra Instrumental" music video sa Japanese musician-film maker Jonneper Padil, a. k. a. iwapt, ang gideklarar nga grand prix champion. Ang maong music video adunay collaboration sa Filipino musician nga naglakip nilang Erlinda Leones ug Randy Lepasana, drummer sa OPM band nga Neocolours. Ang maong awit nga gi-compose ni iwapt gi-record sa Manila niadtong 2010. Ang iyang music video nag-promote sa Hadano City nga nagpakita sa inila nga underground spring water sa maong dapit. Gi-showcase sab ang ilang nature-rich parks ug observatories, sikat nila nga delicacies, ug ang dapit sa Hadano diin gipahigayon ang 2020 Tokyo Olympics.',
 'question': 'Kinsa

In [77]:
def finetune_xlmr(
    model_path = model,
    checkpoint_path = None,
    learning_rate = 1e-5,
    batch_size = 32,
    num_train_epochs = 5,
    weight_decay = 0.01
    ):
    batch_timestamp = timestamp(1)

    last_checkpoint = get_last_checkpoint(checkpoint_path) if checkpoint_path else None

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=3,
        bf16=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model_path,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train(
        resume_from_checkpoint=last_checkpoint if last_checkpoint else None
        )
    eval_results = trainer.evaluate()

    model.save_pretrained(get_model_directory(batch_timestamp))
    tokenizer.save_pretrained(get_tokenizer_directory(batch_timestamp))

    return eval_results

## Optuna

In [None]:

# Suggest values for hyperparameters in Optuna
# learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
# batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
# num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
# weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)

study = optuna.create_study(direction="maximize")
study.optimize(finetune_xlmr, n_trials=1)

# Get the best trial
best_trial = study.best_trial
# Print best trial number and its hyperparameters
print(f"Best Trial: {best_trial.number}")
print("Best Hyperparameters:", best_trial.params)
print(f"Best F1 Score: {best_trial.value:.4f}")

[I 2025-03-10 13:44:01,276] A new study created in memory with name: no-name-525be363-eb88-442a-80fd-1fb72ab389ac


2025-03-10_13-44-0


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)


Epoch,Training Loss,Validation Loss


## Start Training

In [72]:
finetune_xlmr()

Epoch,Training Loss,Validation Loss,Exact Match,F1
1,No log,5.515625,0.0,33.333333


[30 15] [54 52]
['fecture, Japan mipahigayon sa ilang 2nd Hadastragram movie contest niadtong Nobiyembre 6,', 'leyado sa katawhan?</s></s> “Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan. So, kinahanglan magpakonsensya']
['Jonneper Padil', 'Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan.']


[30 15] [54 52]
['fecture, Japan mipahigayon sa ilang 2nd Hadastragram movie contest niadtong Nobiyembre 6,', 'leyado sa katawhan?</s></s> “Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan. So, kinahanglan magpakonsensya']
['Jonneper Padil', 'Kita, dili amo sa katawhan. Kita, sulugoon sa katawhan.']
{'eval_loss': 5.515625, 'eval_exact_match': 0.0, 'eval_f1': 33.333333333333336, 'eval_runtime': 0.0327, 'eval_samples_per_second': 61.101, 'eval_steps_per_second': 30.551, 'epoch': 1.0}


{'eval_loss': 5.515625,
 'eval_exact_match': 0.0,
 'eval_f1': 33.333333333333336,
 'eval_runtime': 0.0327,
 'eval_samples_per_second': 61.101,
 'eval_steps_per_second': 30.551,
 'epoch': 1.0}

# **Evaluating the model**

## Evaluating

In [None]:

# Function to normalize text for comparison
def normalize_text(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Function to compute F1 score
def compute_f1(pred, truth):
    pred_tokens = normalize_text(pred).split()
    truth_tokens = normalize_text(truth).split()

    # Calculate common tokens
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if len(common_tokens) == 0:
        return 0

    # Precision and Recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    # F1 score
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# Function to compute Exact Match (EM)
def compute_exact_match(pred, truth):
    return int(normalize_text(pred) == normalize_text(truth))

# Function to compute Sentence Match
def compute_sentence_match(pred, truth):
    pred_normalized = normalize_text(pred)
    truth_normalized = normalize_text(truth)
    return int(pred_normalized in truth_normalized or truth_normalized in pred_normalized)

## Inference

In [65]:
# try:
#   model
#   tokenizer
# except:
#   print("none")

model_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/model"
tokenizer_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/tokenizer"

model_best = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer_best = AutoTokenizer.from_pretrained(tokenizer_path)


qa_pipeline = pipeline("question-answering", model=model_best, tokenizer=tokenizer_best)


Device set to use cuda:0


In [66]:
context = "Matod ni Police Major Emeniano Don Apechi Makring, hepe sa Abellana Police Station, nga gusto nilang masayran kon kanus-a o ang petsa sa pagkuha sa video sa rugby boys, sanglit sa post niini sa Facebook account, naghisgot kini’g skywalk dapit sa Robinsons Place sa Osmeña Blvd."
query = "Unsa ang gustong masayran sa Abellana Police Station mahitungod sa video?"
results = qa_pipeline(question=query, context=context)

results

{'score': 0.48524928092956543,
 'start': 110,
 'end': 153,
 'answer': 'kon kanus-a o ang petsa sa pagkuha sa video'}

In [67]:
metric = load("squad")
answer = "kon kanus-a o ang petsa sa pagkuha sa video"
pred = {'id': '1', 'prediction_text': results['answer']}
ref = {'id': '1', "answers": {'text':[answer], 'answer_start':[0]}}
print(pred)
print(ref)
res = metric.compute(predictions=[pred], references=[ref])
print(res)

{'id': '1', 'prediction_text': 'kon kanus-a o ang petsa sa pagkuha sa video'}
{'id': '1', 'answers': {'text': ['kon kanus-a o ang petsa sa pagkuha sa video'], 'answer_start': [0]}}
{'exact_match': 100.0, 'f1': 100.0}


In [None]:
test_dataset = Dataset.from_dict({
    "question": [sample["question"] for sample in dataset["test"]],
    "context": [sample["context"]["text"] for sample in dataset["test"]]
})
model_outputs = qa_pipeline(test_dataset)

results_list = []
for sample, model_output in zip(dataset["test"], model_outputs):
  # print(sample)
    expected_answer = sample["answer"]["text"] if sample["answer"]["text"] else "N/A"  # Handle empty answers
    actual_answer = model_output["answer"]

    results_list.append({
        "Question": sample["question"],
        "Expected Answer": expected_answer,
        "Actual Answer": actual_answer
    })

for result in results_list:
    expected = result["Expected Answer"]
    actual = result["Actual Answer"]

    result["F1 Score"] = compute_f1(actual, expected)
    result["Exact Match"] = compute_exact_match(actual, expected)
    result["Sentence Match"] = compute_sentence_match(actual, expected)


# Convert to DataFrame for better visualization
df = pd.DataFrame(results_list)

# Display as a table
display(df)


Unnamed: 0,Question,Expected Answer,Actual Answer,F1 Score,Exact Match,Sentence Match
0,Kinsa ang ig-agaw nga giatake sa sawa?,si Imee Niu,Imee Niu.,0.800000,0,1
1,Unsay gibuhat ni Brent Idica pagkahuman sa ins...,dali nga nakasibat,nisumbong sa kapulisan,0.000000,0,0
2,Unsa nga mga distrito ang giila sa dakbayan?,South Road Properties (SRP) New Business and H...,South Road Properties (SRP),0.347826,0,1
3,Unsa ang giingong hinungdan sa sunog?,nagsugod sa balay sa usa ka Juling Pamugas nga...,Juling Pamugas,0.333333,0,1
4,Kanus-a mobalik sa normal nga operasyon ang mg...,kon molurang ang dautang panahon,panahon.,0.333333,0,1
...,...,...,...,...,...,...
5521,Unsa ang gibuhat sa laing taga motorbanca sa k...,ang mga sakay gidali sa pagtabang sa laing tag...,ningsalmot ra usab sa Seaborne Procession.,0.125000,0,0
5522,Unsa ang giingon ni Pagulayan nga motivo sa pa...,ginadiling drugas,anggulo sa ginadiling drugas,0.666667,0,1
5523,Asa gikan ang duha ka out-of-town contingents?,Canlaon City (sa Negros Oriental) ug lungsod s...,Canlaon City (sa Negros Oriental),0.555556,0,1
5524,Kanus-a nagsugod ug natapos ang street dancing...,"udto sa Domingo, Agusto 25, 2024 hangtod nga n...","alas 2 sa kaadlawon sa Lunes, Agusto 26, 2024,",0.533333,0,1


In [None]:
avg_f1 = df["F1 Score"].mean()
avg_em = df["Exact Match"].mean()
avg_sm = df["Sentence Match"].mean()

print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Exact Match: {avg_em:.4f}")
print(f"Average Sentence Match: {avg_sm:.4f}")

Average F1 Score: 0.5589
Average Exact Match: 0.3444
Average Sentence Match: 0.6927
