In [1]:
import os
import json
from pathlib import Path
from chunkformer_vpb.training.tokenizer import normalize_vi

def get_all_txt_files(dirs):
    """Collect the ``*.txt`` files sitting directly inside each existing directory.

    Args:
        dirs: Iterable of ``pathlib.Path`` objects to scan (non-recursive).
            Entries whose ``exists()`` check fails are skipped silently.

    Returns:
        A list of paths, grouped in the order of ``dirs`` and sorted within
        each directory.
    """
    return [
        txt_file
        for directory in dirs
        if directory.exists()
        for txt_file in sorted(directory.glob("*.txt"))
    ]

def get_stems(files):
    """Return the set of distinct stems (file name without its final suffix) of ``files``."""
    return {path.stem for path in files}

def generate_next_day_test_meta(base_dir: str) -> None:
    """Write ``standard_test/next_day_test_meta.json`` for newly labelled transcripts.

    A transcript is "new-only" when its stem appears in one of the newer label
    folders but in none of the older ones. For each such stem that has a
    matching ``.wav`` file and a non-empty corrected transcript, one entry is
    emitted with the keys ``utt_id``, ``audio_path`` (relative to the resolved
    ``base_dir``), ``text`` and ``base_text``. Both text fields are passed
    through ``normalize_vi``.

    Progress, skipped items and a final summary are printed to stdout.

    Args:
        base_dir: Dataset root; ``~`` is expanded and relative paths are
            resolved against the current working directory.
    """
    base = Path(base_dir).expanduser().resolve()
    # vpb_dataset/archive/tts_dataset_best_call_agent_audio/tts_dataset_best_call_agent_audio/{transcripts,wavs}
    archive_dir = base / "archive/tts_dataset_best_call_agent_audio/tts_dataset_best_call_agent_audio"
    transcript_base_dir = archive_dir / "transcripts"
    wavs_dir = archive_dir / "wavs"
    output_path = base / "standard_test/next_day_test_meta.json"

    label_base = base / "label"

    # === Old and new label folders ===
    old_dirs = [
        label_base / "transcript_corrected_hieudm13_cutoff_20250721/transcript_corrected",
        label_base / "transcripts_corrected_quangdm4_cutoff_21_7/transcripts_corrected",
    ]
    new_dirs = [
        label_base / "transcript_corrected_hieudm13_cutoff_20250724/transcript_corrected",
        label_base / "transcripts_corrected_quangdm4_cutoff_24_07_5317/transcripts_corrected",
    ]

    old_files = get_all_txt_files(old_dirs)
    new_files = get_all_txt_files(new_dirs)

    diff_stems = sorted(get_stems(new_files) - get_stems(old_files))
    print(f"🔍 Found {len(diff_stems)} new-only transcripts")

    # === Build mapping stem → path for new transcripts ===
    # If a stem exists in several new folders, the last folder listed wins.
    stem_to_path = {f.stem: f for f in new_files}

    entries = []
    for stem in diff_stems:
        txt_path = stem_to_path[stem]
        wav_path = wavs_dir / f"{stem}.wav"
        print(f"📄 Processing: {stem}")
        if not wav_path.exists():
            print(f"⚠️  Missing audio: {stem}")
            continue

        text = txt_path.read_text(encoding="utf-8").strip()
        if not text:
            print(f"⚠️  Empty transcript: {stem}")
            continue

        # === Audio path relative to base_dir ===
        relative_audio_path = wav_path.relative_to(base)

        # === Base prediction (if available); empty string otherwise ===
        base_pred_path = transcript_base_dir / f"{stem}.txt"
        if base_pred_path.exists():
            base_text = base_pred_path.read_text(encoding="utf-8").strip()
        else:
            base_text = ""

        entries.append({
            "utt_id": stem,
            "audio_path": str(relative_audio_path),
            "text": normalize_vi(text),
            "base_text": normalize_vi(base_text),
        })

    # === Save output JSON ===
    # Create the output folder first so a fresh dataset checkout without
    # ``standard_test/`` does not fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(entries, f, ensure_ascii=False, indent=2)

    print(f"✅ Created {len(entries)} entries")
    print(f"📄 Saved to: {output_path}")

# 👉 Usage: when run as a script, build the metadata for the dataset located
# at ../../../vpb_dataset relative to the current working directory.
if __name__ == "__main__":
    generate_next_day_test_meta(base_dir="../../../vpb_dataset")


🔍 Found 1799 new-only transcripts
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000027990___right___000029162
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000029014___right___000029770
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000029462___right___000030378
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000029878___right___000031050
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000031190___left___000032426
⚠️  Empty transcript: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000031190___left___000032426
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000056118___right___000057706
📄 Processing: E_huongds_D_2025-04-09_H_092332_720_CLID_0942487879___000122742___right___000124330
📄 Processing: E_huongds_D_2025-04-09_H_095609_593_CLID_0919818467___000000342___left___000001130
📄 Processing: E_huongds_D_2025-04-09_H_095609_593_CLID_0919818467___000000726__