In [1]:
import torch
from chunkformer_vpb.training.finetune_utils import (
    get_default_args,
    prepare_input_file,
    load_model_only,
    compute_chunkformer_loss,
)
from chunkformer_vpb.training.tokenizer import normalize_vi, GreedyTokenizer
from torch.nn.utils.rnn import pad_sequence

import os, math, torch
from chunkformer_vpb.training.finetune_config import FinetuneConfig
from chunkformer_vpb.training.data_loader     import get_dataloaders
from chunkformer_vpb.training.optimizer       import build_model_and_optimizer
from chunkformer_vpb.training.finetune_utils  import compute_loss_batch_v1, compute_loss_batch_v2


CFG_PATH   = "../../config/finetune_config.yaml"

def debug_text_pipeline(label_text: str, tokenizer: GreedyTokenizer):
    print(">>> Text pipeline debug <<<")
    print("  Original text :", label_text)
    norm = normalize_vi(label_text)
    print("  Normalized   :", norm)
    ids = tokenizer.tokenize(norm)
    print("  Token IDs    :", ids)
    try:
        dec = tokenizer.decode_ids(ids)
    except AttributeError:
        # nếu chưa có decode_ids
        dec = "".join(tokenizer.vocab[id] for id in ids)
    print("  Decoded      :", dec)
    print()

def main():
    # 1) chuẩn bị args + device
    args = get_default_args()
    args.model_checkpoint = "../../../chunkformer-large-vie"
    args.audio_path       = "./cache_train/raw/utt_000664.wav"
    args.label_text       = "một giọng nói du dương không thể lẫn với ai khác cất lên"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 2) load model + tokenizer
    model, _ = load_model_only(args.model_checkpoint, device)
    model.ctc_weight = 0.3
    tokenizer = GreedyTokenizer(vocab_path=f"{args.model_checkpoint}/vocab.txt")

    # 3) debug text pipeline (single)
    debug_text_pipeline(args.label_text, tokenizer)

    # 4) prepare single‐sample features + loss
    xs = prepare_input_file(args.audio_path, device)  # [1, T_raw, 80]

    # 5) now wrap into a batch of size 1 and run batch version
    #    (giả sử bạn có hàm compute_loss_batch_v1 cho batch)
    from chunkformer_vpb.training.finetune_utils import compute_loss_batch_v1
    feats = xs
    feat_lens = torch.tensor([xs.shape[1]], device=device)
    # build toks/tok_lens exactly như in DataLoader
    norm = normalize_vi(args.label_text)
    ids = tokenizer.tokenize(norm)
    toks = torch.LongTensor([ids]).to(device)
    tok_lens = torch.tensor([len(ids)], device=device)

    feats     = pad_sequence(feats, batch_first=True)                   # [B, T_max, D]
    # feat_lens = torch.LongTensor(feat_lens)                             # [B]
    toks      = pad_sequence(toks, batch_first=True, padding_value=0)    # [B, L_max]
    # tok_lens  = torch.LongTensor(tok_lens)                               # [B]


    from chunkformer_vpb.training.finetune_config import FinetuneConfig
    from chunkformer_vpb.training.data_loader     import get_dataloaders

    CFG_PATH = "../../config/finetune_config.yaml"
    cfg = FinetuneConfig.from_yaml(CFG_PATH)

    # để nhanh: batch nhỏ & shuffle false
    cfg.training.batch_size = 1
    cfg.training.shuffle    = False

    train_loader, _ = get_dataloaders(cfg)

    manual_debug_done = False

    for step, (loader_feats, loader_feat_lens, loader_toks, loader_tok_lens) in enumerate(train_loader, 1):
        print("\n=== DataLoader batch debug ===")
        print(f"Step: {step}")
        print("loader_feats.shape    :", loader_feats.shape)
        print("loader_feat_lens      :", loader_feat_lens)
        print("loader_toks.shape     :", loader_toks.shape)
        print("loader_tok_lens       :", loader_tok_lens)
        
        # Nếu bạn đã có feats, feat_lens, toks, tok_lens tự chuẩn bị:
        # In ra shape, giá trị, hoặc compare từng phần tử
        
        print("\n--- Compare with manual data ---")
        print("Manual feats.shape    :", feats.shape)
        print("Manual feat_lens      :", feat_lens)
        print("Manual toks.shape     :", toks.shape)
        print("Manual tok_lens       :", tok_lens)

        # So sánh từng phần tử (giả sử đều batch_size=1)
        diff_feats = (loader_feats[0] - feats[0]).abs().max().item()
        diff_toks  = (loader_toks[0] - toks[0]).abs().max().item()
        print(f"Max abs diff feats: {diff_feats}")
        print(f"Max abs diff toks : {diff_toks}")

        # Nếu muốn xem trực tiếp giá trị khác biệt (ví dụ print những index khác biệt)
        # Nếu nhỏ hơn threshold, có thể coi là trùng
        print("Loader feat_lens:", loader_feat_lens[0].item(), "Manual feat_lens:", feat_lens[0].item())
        print("Loader tok_lens :", loader_tok_lens[0].item(), "Manual tok_lens :", tok_lens[0].item())
        
        # Chỉ chạy 1 batch để debug
        break




    cfg = FinetuneConfig.from_yaml(CFG_PATH)

    # để nhanh: batch nhỏ & shuffle false
    cfg.training.batch_size = 1
    cfg.training.shuffle    = False

    print(">>> Batch‐of‐1 loss <<<")
    loss_b, loss_b_ctc, loss_b_att = compute_loss_batch_v1(
        model, feats, feat_lens, toks, tok_lens, cfg, device
    )
    print(f"Batch1: loss={loss_b:.3f},  ctc={loss_b_ctc:.3f}, att={loss_b_att:.3f}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
[2025-07-16 19:33:24] INFO: Checkpoint: loading from checkpoint ../../../chunkformer-large-vie/pytorch_model.bin for GPU



🧾 Loaded checkpoint from: ../../../chunkformer-large-vie/pytorch_model.bin
📦 Checkpoint keys: ['encoder.global_cmvn.mean', 'encoder.global_cmvn.istd', 'encoder.embed.out.weight', 'encoder.embed.out.bias', 'encoder.embed.conv.0.weight'] ... (total 813)
🔍 AED decoder head included in checkpoint? ✅ YES
📊 Model total params: 113,852,240, trainable: 113,852,240
>>> Text pipeline debug <<<
  Original text : một giọng nói du dương không thể lẫn với ai khác cất lên
  Normalized   : một giọng nói du dương không thể lẫn với ai khác cất lên
  Token IDs    : [4104, 2648, 4564, 2300, 2356, 3305, 5697, 3688, 6411, 1333, 3277, 2146, 3597]
  Decoded      : một giọng nói du dương không thể lẫn với ai khác cất lên

[collate] sample 0: utt_id=utt_000694, audio=cache_train/raw/utt_000694.wav

=== DataLoader batch debug ===
Step: 1
loader_feats.shape    : torch.Size([1, 626, 80])
loader_feat_lens      : tensor([626])
loader_toks.shape     : torch.Size([1, 15])
loader_tok_lens       : tensor([15])

--- Com

RuntimeError: The size of tensor a (626) must match the size of tensor b (423) at non-singleton dimension 0