# Evaluate TEST


In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import numpy as np
import os
import evaluate
from transformers import pipeline
import collections

2025-06-01 19:19:26.227454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748805566.251071     569 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748805566.258410     569 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
from huggingface_hub import login

login(os.getenv("HF_TOKEN"))  # Dán token bạn vừa copy vào đây


In [4]:
from datasets import load_dataset

dataset = load_dataset("taidng/UIT-ViQuAD2.0")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 28454
    })
    validation: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 3814
    })
    test: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 7301
    })
})

In [5]:
# Tổng số phần tử trong tập train gốc
total_len = len(dataset["train"])
test_size = total_len // 10  # Lấy 1/10

# Chia tập train: 9/10 đầu giữ làm train, 1/10 cuối làm test
train_dataset = dataset["train"].select(range(0, total_len - test_size))
test_dataset = dataset["train"].select(range(total_len - test_size, total_len))
val_dataset = dataset["validation"]

# Kiểm tra kết quả
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")


Train size: 25609
Validation size: 3814
Test size: 2845


In [6]:
def prepare_features(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized["offset_mapping"]
    example_ids = []
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        sample_idx = sample_mapping[i]
        answer = examples["answers"][sample_idx]
        is_impossible = examples["is_impossible"][sample_idx]

        example_ids.append(examples["id"][sample_idx])

        if is_impossible or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            tokenized["offset_mapping"][i] = [(0, 0)] * len(offsets)  # Đánh dấu là không hợp lệ
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = tokenized.sequence_ids(i)

            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - list(reversed(sequence_ids)).index(1)

            if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
                tokenized["offset_mapping"][i] = [(0, 0)] * len(offsets)
            else:
                idx = context_start
                while idx <= context_end and offsets[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offsets[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    tokenized["example_id"] = example_ids
    return tokenized


In [8]:
# Load metric
squad_metric = evaluate.load("squad_v2")

In [9]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k["id"]: i for i, k in enumerate(examples)}
    features_per_example = collections.defaultdict(list)

    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    for example in examples:
        example_id = example["id"]
        context = example["context"]
        feature_indices = features_per_example[example_id]

        best_score = -float("inf")
        best_answer = ""

        for i in feature_indices:
            start_logits = all_start_logits[i]
            end_logits = all_end_logits[i]
            offset_mapping = features[i]["offset_mapping"]

            for start_index in np.argsort(start_logits)[-n_best_size:]:
                for end_index in np.argsort(end_logits)[-n_best_size:]:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                        or end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    score = start_logits[start_index] + end_logits[end_index]

                    if score > best_score:
                        best_score = score
                        best_answer = context[start_char:end_char]

        if example.get("is_impossible", False) or best_answer.strip() == "":
            predictions[example_id] = ""
        else:
            predictions[example_id] = best_answer.strip()

    return predictions

In [10]:
def compute_metrics(p):
    predictions = postprocess_qa_predictions(
        test_dataset,
        tokenized_test,
        p.predictions
    )

    formatted_predictions = [
        {
            "id": k,
            "prediction_text": v,
            "no_answer_probability": 0.0
        }
        for k, v in predictions.items()
    ]

    references = [
        {
            "id": ex["id"],
            "answers": ex["answers"]
            if ex["answers"]["text"]
            else {"text": [""], "answer_start": [0]}
        }
        for ex in test_dataset
    ]

    return squad_metric.compute(predictions=formatted_predictions, references=references)


In [11]:
model_ids = {
    "mBERT_QA": "DatTran0509/Finetune_mBERT_QA",
    "XLMR_RoBerta_Base": "DatTran0509/Finetune_XLM_R_base_QA_NEW",  # ví dụ: "vinai/phobert-base-qa"
    "XLMR_RoBerta_Large": "DatTran0509/Finetune_XLM_R_large_QA_New"   # ví dụ: "csarron/roberta-base-squad-v1"
}

In [13]:
# Lưu kết quả
results = {}

# Đánh giá từng model
for name, model_id in model_ids.items():
    print(f"🔍 Evaluating model: {name}")

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_id)

    # Tokenize với tokenizer riêng
    tokenized_test = test_dataset.map(prepare_features, batched=True, remove_columns=test_dataset.column_names)

    training_args = TrainingArguments(
        output_dir=f"./results/{name.replace('/', '_')}",
        eval_strategy="epoch",
        per_device_eval_batch_size=32,
        logging_dir=f"./logs/{name.replace('/', '_')}",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
    )

    eval_result = trainer.evaluate()
    results[name] = {
        "EM": eval_result.get("eval_exact", 0),
        "F1": eval_result.get("eval_f1", 0)
    }

    print(f"✅ {name} — EM: {results[name]['EM']:.2f}% | F1: {results[name]['F1']:.2f}%")
    print("-" * 50)

🔍 Evaluating model: mBERT_QA




✅ mBERT_QA — EM: 56.03% | F1: 69.58%
--------------------------------------------------
🔍 Evaluating model: XLMR_RoBerta_Base




✅ XLMR_RoBerta_Base — EM: 57.96% | F1: 71.69%
--------------------------------------------------
🔍 Evaluating model: XLMR_RoBerta_Large




✅ XLMR_RoBerta_Large — EM: 63.73% | F1: 79.86%
--------------------------------------------------
