In [1]:
import os
import json
from pathlib import Path
from chunkformer_vpb.training.tokenizer import normalize_vi

def generate_test_meta(base_dir: str) -> list:
    """Build ``standard_test/test_meta.json`` for the test set under ``base_dir``.

    Layout read from ``<base_dir>/standard_test``:

    * ``transcripts/<utt_id>.txt``      - reference transcript
    * ``transcript_base/<utt_id>.txt``  - prediction of the base model
    * ``wavs/<utt_id>.wav``             - audio for the utterance

    One entry is produced per reference transcript, in sorted file-name
    order. An utterance is skipped (with a printed warning) when its audio
    file is missing or when its reference transcript is empty. A missing
    base prediction is not fatal: the entry is kept with ``base_text`` set
    to the normalized empty string.

    Args:
        base_dir: Dataset root. ``audio_path`` values in the output are
            written relative to this directory.

    Returns:
        The list of entry dicts (keys ``utt_id``, ``audio_path``, ``text``,
        ``base_text``) that was written to ``test_meta.json``.
    """
    base = Path(base_dir)
    standard_dir = base / "standard_test"
    transcript_dir = standard_dir / "transcripts"
    transcript_base = standard_dir / "transcript_base"  # base-model predictions
    wavs_dir = standard_dir / "wavs"
    output_path = standard_dir / "test_meta.json"

    entries = []

    for txt_file in sorted(transcript_dir.glob("*.txt")):
        utt_id = txt_file.stem
        wav_file = wavs_dir / f"{utt_id}.wav"

        if not wav_file.exists():
            print(f"⚠️  Missing audio for {utt_id}")
            continue

        # Check emptiness AFTER normalization so a reference that normalizes
        # to nothing is not written as an empty ground truth.
        # NOTE(review): assumes normalize_vi may strip content (e.g.
        # punctuation); confirm against the tokenizer module.
        text = normalize_vi(txt_file.read_text(encoding="utf-8").strip())
        if not text:
            print(f"⚠️  Empty transcript for {utt_id}")
            continue

        # Read the base model's prediction; fall back to "" when it is absent.
        base_pred_file = transcript_base / f"{utt_id}.txt"
        if base_pred_file.exists():
            base_text = base_pred_file.read_text(encoding="utf-8").strip()
        else:
            print(f"⚠️  Missing base prediction for {utt_id}")
            base_text = ""

        entries.append({
            "utt_id": utt_id,
            # Path relative to base_dir, always with forward slashes so the
            # JSON is identical no matter which OS generated it.
            "audio_path": wav_file.relative_to(base).as_posix(),
            "text": text,
            "base_text": normalize_vi(base_text),
        })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(entries, f, ensure_ascii=False, indent=2)

    print(f"✅ Created {len(entries)} entries")
    print(f"📄 Saved to: {output_path}")
    return entries

# Usage: build the test metadata for the dataset directory three levels up.
if __name__ == "__main__":
    generate_test_meta(base_dir="../../../vpb_dataset")


✅ Created 29 entries
📄 Saved to: ../../../vpb_dataset/standard_test/test_meta.json


In [1]:
from chunkformer_vpb.training.train import evaluate_from_meta

# Evaluate using the metadata file written by generate_test_meta (the path
# matches the "Saved to" location printed by that cell).
# NOTE(review): evaluate_from_meta is a project function not shown here; the
# captured output suggests it reports sample-average and global WER - confirm.
evaluate_from_meta("../../../vpb_dataset/standard_test/test_meta.json")

  from .autonotebook import tqdm as notebook_tqdm


📊 Tổng số mẫu: 29
🎯 WER trung bình (sample avg): 24.87%
🌐 WER toàn cục   (global):     16.13%
