<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/reader/cebqa_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CebQA Reader Component**
Pretrained model: RoBERTa

# **Libraries**

In [1]:
!pip install datasets
!pip install evaluate
!pip install optuna

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
import pandas as pd
from datasets import Dataset, load_dataset
from evaluate import load
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForQuestionAnswering, TrainingArguments, Trainer, XLMRobertaTokenizerFast, EarlyStoppingCallback, pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from huggingface_hub import login
import datetime
from google.colab import drive
from IPython.display import display
from sklearn.metrics import f1_score
import re
import optuna
import numpy as np





In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Constants**

In [5]:
CEBQA_DATASET = "jhoannarica/cebquad"
DRIVE_ROOT = "/content/drive/Shareddrives/cebqa_roberta/xlmr"
OUTPUT_DIRECTORY = "training_output"
LOGS_DIRECTORY = "LOGS"
MODEL_DIRECTORY = "model"
TOKENIZER_DIRECTORY = "tokenizer"

# **Utils**

In [6]:
def timestamp(append):
  return datetime.datetime.now().strftime("%Y-%m-%d_%H")+"-"+str(append)

def get_output_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{OUTPUT_DIRECTORY}"

def get_logs_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{LOGS_DIRECTORY}"

def get_model_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{MODEL_DIRECTORY}"

def get_tokenizer_directory(batch_timestamp):
  return f"{DRIVE_ROOT}/{batch_timestamp}/{TOKENIZER_DIRECTORY}"

# **Loading Dataset**

## Access dataset

In [7]:
dataset = load_dataset(CEBQA_DATASET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.arrow:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

validation.arrow:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

test.arrow:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2763 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5526 [00:00<?, ? examples/s]

In [None]:
dataset["train"][120]

{'id': '01529-002',
 'article_id': 1529,
 'article_title': 'Tourist van nahulog kay driver nakatulog',
 'article_body': 'Nangalandig sa emergency room sa Badian District Hospital sa Brgy. Poblacion, Badian, habagatang Sugbo, ang upat ka mga turista ug drayber sa van nga ilang gisakyan human naaksidente sa alas 3:40 sa kaadlawon sa Biyernes, Nobiyembre 17, 2023, sa Brgy. Poblacion. Ang drayber nakatulog kay hayan lapoy pa kini sa iyang kapin sa 100 ka kilometro nga biyahe sa amihanang Sugbo. Hinuon minor injuries lang ang naangkon sa mga biktima busa nakagawas ra dayon sa ospital human matambali ug mahiling. Basi sa nakuhang kasayuran sa Superbalita sa Cebu gikan sa kasaligang tinubdan, nailhan ang mga biktima nga turista nga puro taga San Antonio, Tondo, Manila, nga sila si Antonietto Avila Libunao, 64, minyo; iyang asawa nga si Carmen Pacione; Lorence Pacis Paclibon , 40, minyo; ug anak niini nga si Pacomios Pacis Paclibon, 5. Samtang ang drayber nga naangol giila nga si Emeniano Jorg

In [8]:
# Initialize variables to track the longest article
longest_article = None
max_length = 0

# Iterate through each article in the train dataset
for article in dataset["train"]:
    # Concatenate article_body and context
    combined_text = article["context"]["text"] + article["question"]

    # Calculate the length of the combined text
    combined_length = len(combined_text)

    # Update if this article is the longest found so far
    if combined_length > max_length:
        max_length = combined_length
        longest_article = article

# Print the longest article and its length
print(f"Longest combined article length: {max_length}")
print(f"Longest article: {longest_article}")


Longest combined article length: 1295
Longest article: {'id': '01467-009', 'article_id': 1467, 'article_title': 'Trabaho sa CBRT project sa N. Bacalso hayan malangay', 'article_body': 'Hayan malangay ang pagtrabaho sa Cebu Bus Rapid Transit (CBRT) project subay sa N. Bacalso Avenue, dakbayan sa Sugbo tungod sa pagdumili sa Traffic Management and Coordination Committee (TMCC) sa pag-aprubar sa pagbutang og mga board-up nga gikinahanglan sa pagsugod sa ilang pagtrabaho sa dapit. Ang CBRT project contractor nangayo og clearance gikan sa TMCC alang sa pagbutang og board-up gikan sa eskina sa dalan P. Aguinaldo Emelito paingon sa eskina sa dalan Leon Kilat ug N. Bacalso Avenue atol sa TMCC meeting niadtong Huwebes sa hapon, Nobiyembre 23, 2023. Ang board-up maoy gamiton sa pagkoral sa bahin sa dalan sa ilang pagtrabaho sa proyekto sa naasoy nga dapit. Si Cebu City Councilor Jeff Abayon, TMCC chairman, niingon sa usa ka pamahayag sa Biyernes, Nobiyembre 24, 2023, nga ang komite nagduha-duha 

# **Prepare Dataset**

## Prepare tokenizer

In [9]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
print(tokenizer.model_max_length)


512


## Tokenize

In [15]:
def filter_incomplete_examples(example):
    # Ensure both "question" and "context" exist and are non-empty
    # return "question" in example and example["question"] and \
    #        "context" in example and "text" in example["context"] and \
    #        example["context"]["text"] and example["answer"]["text"]

    return "question" in example and example["question"] and \
          "article_body" in example and example["article_body"] and \
          example["answer"]["text"]


In [18]:
def tokenize_function(examples):
    # context_text = [context.get("text", "") for context in examples.get("context", [{}])]
    article_body = examples.get("article_body", [""])
    question_text = examples.get("question", [""])

    tokenized_examples = tokenizer(
        question_text,
        # context_text,
        article_body,
        truncation="only_second",  # Truncate only the context
        max_length=512,            # Limit input length
        stride=128,                # Add a sliding window
        # return_overflowing_tokens=True,  # Handle long contexts
        return_offsets_mapping=True,
        padding="max_length"
    )

    # sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    # offset_mapping = tokenized_examples["offset_mapping"]

    # Lists to store calculated start and end positions
    start_positions = []
    end_positions = []

    # for i, offsets in enumerate(offset_mapping):
    for i in range(len(tokenized_examples['input_ids'])):
        start_token = 0
        end_token = 0
        answer = examples["answer"][i]

        # Handle missing or empty answers
        if len(answer["text"]) == 0:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Get the answer's start and end character positions
        start_char = answer["start"]
        end_char = answer["end"]

        # Get the sequence IDs to identify the context part
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Identify the start and end of the context
        # context_start = sequence_ids.index(1)
        # context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        article_body_start = sequence_ids.index(1)
        article_body_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # # Check if the answer is out of the bounds of the context
        # if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
        #     start_positions.append(start_token)
        #     end_positions.append(end_token)
        #     continue

        # # Find start and end tokens for the answer
        # start_token = next(
        #     (idx for idx, offset in enumerate(offsets)
        #     if offset[0] <= start_char <= offset[1]),
        #     None
        # )
        # end_token = next(
        #     (idx for idx, offset in enumerate(offsets)
        #     if offset[0] <= end_char <= offset[1]),
        #     None
        # )

        # Check if the answer is out of the bounds of the context
        if start_char < tokenized_examples.offset_mapping[i][article_body_start][0] or end_char > tokenized_examples.offset_mapping[i][article_body_end][1]:
            start_positions.append(start_token)
            end_positions.append(end_token)
            continue

        # Find start and end tokens for the answer
        start_token = next(
            (idx for idx, offset in enumerate(tokenized_examples.offset_mapping[i])
            if offset[0] <= start_char <= offset[1]),
            None
        )
        end_token = next(
            (idx for idx, offset in enumerate(tokenized_examples.offset_mapping[i])
            if offset[0] <= end_char <= offset[1]),
            None
        )

        if start_token is None:
            raise ValueError("Start character position not found in token offsets.")

        if end_token is None:
            raise ValueError("Start character position not found in token offsets.")

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add start and end positions to the tokenized examples
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Tokenize the dataset
tokenized_dataset = dataset.filter(filter_incomplete_examples).map(tokenize_function, batched=True)


Map:   0%|          | 0/19340 [00:00<?, ? examples/s]

Map:   0%|          | 0/2762 [00:00<?, ? examples/s]

Map:   0%|          | 0/5526 [00:00<?, ? examples/s]

In [19]:
tokenized_dataset["train"][0]

{'id': '00778-015',
 'article_id': 778,
 'article_title': 'Rugby boys sa PUJ, susihon',
 'article_body': 'Nakig-alayon karon ang Abellana Police Station sa netizen nga ni-upload sa video diin nakita ang grupo sa mga lalaking menor nga ningsakay sa publikong sakyanan nga dunay rota nga 17B paingon sa Brgy. Sto. Gertrudes, dakbayan sa Sugbo samtang nagsige’g simhot sa cellophane nga dunay sulod nga solvent. Matod ni Police Major Emeniano Don Apechi Makring, hepe sa Abellana Police Station, nga gusto nilang masayran kon kanus-a o ang petsa sa pagkuha sa video sa rugby boys, sanglit sa post niini sa Facebook account, naghisgot kini’g skywalk dapit sa Robinsons Place sa Osmeña Blvd. apan taudtaod na kining gi-demolish. Iyang gipasabot nga matag adlaw ang ilang police station way hunong sa pag-rescue sa mga batang libud-suroy ug gipang turnover sa barangay diin nahisakop ang dapit. “Actually kining atong concern regarding sa mga batan-on nga gagamit og kining plastic bag nga gibutang ang ill

## Dataset Splitting

In [21]:
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]

val_dataset

Dataset({
    features: ['id', 'article_id', 'article_title', 'article_body', 'question', 'context', 'answer', 'context_start', 'context_end', 'answer_start', 'answer_end', 'input_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
    num_rows: 2762
})

# **Model Training**

## Load Pre-Trained RoBERTa

In [22]:
model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Early Stopping

In [23]:
# Early stopping parameters
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum change in the metric to qualify as an improvement
)

## Training Argument

In [24]:

# Function to normalize text for comparison
def normalize_text(text):
    """Lowercase and remove punctuation, articles, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Function to compute F1 score
def compute_f1(pred, truth):
    pred_tokens = normalize_text(pred).split()
    truth_tokens = normalize_text(truth).split()

    # Calculate common tokens
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if len(common_tokens) == 0:
        return 0

    # Precision and Recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    # F1 score
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# Function to compute Exact Match (EM)
def compute_exact_match(pred, truth):
    return int(normalize_text(pred) == normalize_text(truth))

# Function to compute Sentence Match
def compute_sentence_match(pred, truth):
    pred_normalized = normalize_text(pred)
    truth_normalized = normalize_text(truth)
    return int(pred_normalized in truth_normalized or truth_normalized in pred_normalized)

In [25]:
def postprocess_qa_predictions(examples, start_logits, end_logits, tokenizer):
    """
    Convert model logits into readable answers.
    """
    predictions = []

    for i in range(len(start_logits)):
        start_idx = np.argmax(start_logits[i])  # Best start index
        end_idx = np.argmax(end_logits[i])  # Best end index

        # Ensure valid span
        if start_idx >= len(examples["input_ids"][i]) or end_idx >= len(examples["input_ids"][i]):
            predictions.append("")
            continue

        if start_idx > end_idx:  # If invalid prediction
            predictions.append("")
            continue

        # Decode the predicted answer
        input_ids = examples["input_ids"][i]
        answer_tokens = input_ids[start_idx : end_idx + 1]
        prediction = tokenizer.decode(answer_tokens, skip_special_tokens=True)
        predictions.append(prediction)

    return predictions

In [26]:
metric = load("squad")

def compute_metrics(pred):
    squad_labels = pred.label_ids
    squad_preds = pred.predictions.argmax(-1)

    # Calculate Exact Match (EM)
    em = sum([1 if p == l else 0 for p, l in zip(squad_preds, squad_labels)]) / len(squad_labels)

    # Calculate F1-score
    f1 = f1_score(squad_labels, squad_preds, average='macro')

    return {
        'exact_match': em,
        'f1': f1
    }


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [None]:
val_dataset[0]

{'id': '01543-003',
 'article_id': 1543,
 'article_title': 'Japanese music video, nakig-collab sa Pinoy artists, midaog na sab',
 'article_body': 'ANG siyudad sa Hadano, Kanagawa Prefecture, Japan mipahigayon sa ilang 2nd Hadastragram movie contest niadtong Nobiyembre 6, 2023. Moabot ngadto sa 195 ka entry nga gisalmot diin ang "Bathroom Orchestra Instrumental" music video sa Japanese musician-film maker Jonneper Padil, a. k. a. iwapt, ang gideklarar nga grand prix champion. Ang maong music video adunay collaboration sa Filipino musician nga naglakip nilang Erlinda Leones ug Randy Lepasana, drummer sa OPM band nga Neocolours. Ang maong awit nga gi-compose ni iwapt gi-record sa Manila niadtong 2010. Ang iyang music video nag-promote sa Hadano City nga nagpakita sa inila nga underground spring water sa maong dapit. Gi-showcase sab ang ilang nature-rich parks ug observatories, sikat nila nga delicacies, ug ang dapit sa Hadano diin gipahigayon ang 2020 Tokyo Olympics.',
 'question': 'Kinsa

In [34]:
def finetune_xlmr():
    batch_timestamp = timestamp(1)
    # Suggest values for hyperparameters
    # learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
    # batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    # num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    # weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)

    learning_rate = 2e-5
    batch_size = 8
    num_train_epochs = 3
    weight_decay = 0.01

    # Define training arguments with suggested values
    training_args = TrainingArguments(
        output_dir=get_output_directory(batch_timestamp),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=get_logs_directory(batch_timestamp),
        logging_steps=10,
        save_total_limit=3,
        # bf16=True  # Best for A100
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

     # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)

    model.save_pretrained(get_model_directory(batch_timestamp))
    tokenizer.save_pretrained(get_tokenizer_directory(batch_timestamp))

    return eval_results

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(finetune_xlmr, n_trials=1)

# Get the best trial
best_trial = study.best_trial
# Print best trial number and its hyperparameters
print(f"Best Trial: {best_trial.number}")
print("Best Hyperparameters:", best_trial.params)
print(f"Best F1 Score: {best_trial.value:.4f}")

[I 2025-03-10 13:44:01,276] A new study created in memory with name: no-name-525be363-eb88-442a-80fd-1fb72ab389ac


2025-03-10_13-44-0


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-3, 0.1)


Epoch,Training Loss,Validation Loss


In [35]:
finetune_xlmr()

Epoch,Training Loss,Validation Loss
1,3.6006,4.228612
2,3.6346,3.889273
3,3.4471,3.95013


{'eval_loss': 3.889272928237915, 'eval_runtime': 74.7329, 'eval_samples_per_second': 36.958, 'eval_steps_per_second': 4.63, 'epoch': 3.0}


{'eval_loss': 3.889272928237915,
 'eval_runtime': 74.7329,
 'eval_samples_per_second': 36.958,
 'eval_steps_per_second': 4.63,
 'epoch': 3.0}

# **Evaluating the model**

## Evaluating

## Inference

In [40]:
# try:
#   model
#   tokenizer
# except:
#   print("none")

model_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/model"
tokenizer_path = "/content/drive/Shareddrives/cebqa_roberta/xlmr/2025-02-25_01/tokenizer"

model_best = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer_best = AutoTokenizer.from_pretrained(tokenizer_path)


qa_pipeline = pipeline("question-answering", model=model_best, tokenizer=tokenizer_best)


Device set to use cuda:0


In [42]:
test_dataset = Dataset.from_dict({
    "question": [sample["question"] for sample in dataset["test"]],
    "context": [sample["context"]["text"] for sample in dataset["test"]]
})
model_outputs = qa_pipeline(test_dataset)

results_list = []
for sample, model_output in zip(dataset["test"], model_outputs):
  # print(sample)
    expected_answer = sample["answer"]["text"] if sample["answer"]["text"] else "N/A"  # Handle empty answers
    actual_answer = model_output["answer"]

    results_list.append({
        "Question": sample["question"],
        "Expected Answer": expected_answer,
        "Actual Answer": actual_answer
    })

for result in results_list:
    expected = result["Expected Answer"]
    actual = result["Actual Answer"]

    result["F1 Score"] = compute_f1(actual, expected)
    result["Exact Match"] = compute_exact_match(actual, expected)
    result["Sentence Match"] = compute_sentence_match(actual, expected)


# Convert to DataFrame for better visualization
df = pd.DataFrame(results_list)

# Display as a table
display(df)


Unnamed: 0,Question,Expected Answer,Actual Answer,F1 Score,Exact Match,Sentence Match
0,Kinsa ang ig-agaw nga giatake sa sawa?,si Imee Niu,Imee Niu.,0.800000,0,1
1,Unsay gibuhat ni Brent Idica pagkahuman sa ins...,dali nga nakasibat,nisumbong sa kapulisan,0.000000,0,0
2,Unsa nga mga distrito ang giila sa dakbayan?,South Road Properties (SRP) New Business and H...,South Road Properties (SRP),0.347826,0,1
3,Unsa ang giingong hinungdan sa sunog?,nagsugod sa balay sa usa ka Juling Pamugas nga...,Juling Pamugas,0.333333,0,1
4,Kanus-a mobalik sa normal nga operasyon ang mg...,kon molurang ang dautang panahon,panahon.,0.333333,0,1
...,...,...,...,...,...,...
5521,Unsa ang gibuhat sa laing taga motorbanca sa k...,ang mga sakay gidali sa pagtabang sa laing tag...,ningsalmot ra usab sa Seaborne Procession.,0.125000,0,0
5522,Unsa ang giingon ni Pagulayan nga motivo sa pa...,ginadiling drugas,anggulo sa ginadiling drugas,0.666667,0,1
5523,Asa gikan ang duha ka out-of-town contingents?,Canlaon City (sa Negros Oriental) ug lungsod s...,Canlaon City (sa Negros Oriental),0.555556,0,1
5524,Kanus-a nagsugod ug natapos ang street dancing...,"udto sa Domingo, Agusto 25, 2024 hangtod nga n...","alas 2 sa kaadlawon sa Lunes, Agusto 26, 2024,",0.533333,0,1


In [43]:
avg_f1 = df["F1 Score"].mean()
avg_em = df["Exact Match"].mean()
avg_sm = df["Sentence Match"].mean()

print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Exact Match: {avg_em:.4f}")
print(f"Average Sentence Match: {avg_sm:.4f}")

Average F1 Score: 0.5589
Average Exact Match: 0.3444
Average Sentence Match: 0.6927
