# Preparing the dataset


In [None]:
TRAIN_MANIFEST = "datasets/vivos/train_manifest.json"
TEST_MANIFEST = "datasets/vivos/test_manifest.json"

## Preparing the tokenizer


In [None]:
import os

VOCAB_SIZE = 128  # can be any value above 29
TOKENIZER_TYPE = "spe"  # can be wpe or spe
SPE_TYPE = "bpe"  # can be bpe or unigram

# ------------------------------------------------------------------- #
!rm -r tokenizers/

if not os.path.exists("tokenizers"):
  os.makedirs("tokenizers")

!python scripts/process_asr_text_tokenizer.py \
   --manifest=$TRAIN_MANIFEST \
   --data_root="tokenizers" \
   --tokenizer=$TOKENIZER_TYPE \
   --spe_type=$SPE_TYPE \
   --no_lower_case \
   --log \
   --vocab_size=$VOCAB_SIZE

In [None]:
# Tokenizer path
if TOKENIZER_TYPE == 'spe':
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_spe_{SPE_TYPE}_v{VOCAB_SIZE}")
  TOKENIZER_TYPE_CFG = "bpe"
else:
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_wpe_v{VOCAB_SIZE}")
  TOKENIZER_TYPE_CFG = "wpe"

# Load model config

In [None]:
from omegaconf import OmegaConf, open_dict

config = OmegaConf.load("configs/contextnet_rnnt.yaml")
# config = OmegaConf.load("configs/fast-conformer_transducer_bpe.yaml")
config.model.train_ds.manifest_filepath = TRAIN_MANIFEST
config.model.validation_ds.manifest_filepath = TEST_MANIFEST
config.model.test_ds.manifest_filepath = TEST_MANIFEST

config.model.tokenizer.dir = TOKENIZER
config.model.tokenizer.type = TOKENIZER_TYPE_CFG

# Finally, let's remove logging of samples and the warmup since the dataset is small (similar to CTC models)
config.model.log_prediction = False
config.model.optim.sched.warmup_steps = None

config.model.spec_augment.freq_masks = 0
config.model.spec_augment.time_masks = 0

config.model.encoder.jasper = config.model.encoder.jasper[:5]
config.model.encoder.jasper[-1].filters = '${model.model_defaults.enc_hidden}'
# config.model.encoder.n_layers = 6
# config.model.encoder.d_model = 176
# config.model.encoder.n_heads = 1
# config.model.train_ds.max_duration = 5
# config.model.encoder.conv_kernel_size = 17

# Two lines to enable the fused batch step
config.model.joint.fuse_loss_wer = True
config.model.joint.fused_batch_size = 16  # this can be any value (preferably less than model.*_ds.batch_size)

# We will also reduce the hidden dimension of the joint and the prediction networks to preserve some memory
config.model.model_defaults.pred_hidden = 64
config.model.model_defaults.joint_hidden = 64
config.model.model_defaults.filters = 128

## Initialize a Transducer ASR Model


In [None]:
import torch
from lightning.pytorch import Trainer

if torch.cuda.is_available():
  accelerator = 'gpu'
else:
  accelerator = 'gpu'

EPOCHS = 10

# Initialize a Trainer for the Transducer model
trainer = Trainer(devices=1, accelerator=accelerator, max_epochs=EPOCHS,
                  enable_checkpointing=False, logger=False,
                  log_every_n_steps=10, check_val_every_n_epoch=5)

In [None]:
import nemo.collections.asr as nemo_asr
model = nemo_asr.models.EncDecRNNTBPEModel(cfg=config.model, trainer=trainer)
model.summarize()

# Training

In [None]:
# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us
from nemo.utils import exp_manager

# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
os.environ.pop('NEMO_EXPM_VERSION', None)

exp_config = exp_manager.ExpManagerConfig(
    exp_dir=f'experiments/',
    name=f"Transducer-Model",
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        always_save_nemo=True,
        save_best_model=True,
    ),
)

exp_config = OmegaConf.structured(exp_config)

logdir = exp_manager.exp_manager(trainer, exp_config)

In [None]:
try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir /content/experiments/Transducer-Model/
else:
  print("To use TensorBoard, please use this notebook in a Google Colab environment.")

In [None]:
# Release resources prior to training
import gc
gc.collect()

if accelerator == 'gpu':
  torch.cuda.empty_cache()

In [None]:
trainer.fit(model)

In [3]:
import nemo.collections.asr as nemo_asr
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("experiments/vpb_asr_fastconformer_transducer_bpe/2025-07-19_15-59-19/checkpoints/vpb_asr_fastconformer_transducer_bpe.nemo")

[NeMo I 2025-07-19 21:20:28 mixins:181] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-07-19 21:20:28 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: datasets/vivos/train_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 17.125
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-07-19 21:20:28 modelPT:187] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: datasets/vivos/test_manifest.json
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    use_start_end_t

[NeMo I 2025-07-19 21:20:28 features:305] PADDING: 0
[NeMo I 2025-07-19 21:20:28 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-07-19 21:20:28 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-07-19 21:20:28 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-07-19 21:20:29 save_restore_connector:275] Model EncDecRNNTBPEModel was successfully restored from /home/ubuntu/nvidia_nemo/tutorials/asr/experiments/vpb_asr_fastconformer_transducer_bpe/2025-07-19_15-59-19/checkpoints/vpb_asr_fastconformer_transducer_bpe.nemo.


In [6]:
output = model.transcribe('/home/ubuntu/.cache/kagglehub/datasets/kynthesis/vivos-vietnamese-speech-corpus-for-asr/versions/1/vivos/test/waves/VIVOSDEV01/VIVOSDEV01_R002.wav')
print("Transcribe text: ", output)

utput = model.transcribe('datasets/vivos/test/augumented_8k_waves/VIVOSDEV01/VIVOSDEV01_R002.wav')
print("Transcribe text: ", output)


Transcribing: 100%|██████████| 1/1 [00:00<00:00, 22.65it/s]


Transcribe text:  [Hypothesis(score=483.499755859375, y_sequence=tensor([ 11,   3,  18,   4,  16,  40,  10,   1, 117,  31,  10,  23,  16, 109,
          2,   1, 122, 114,   3,  35,   1, 111,   3,   1,   8,  44,  16,  46,
         24,   1, 116, 118,  10,  23], device='cuda:0'), text='tiếng cọc kịch cận lại của ối những cớp sách', dec_out=None, dec_state=None, timestamp=[], alignments=None, frame_confidence=None, token_confidence=None, word_confidence=None, length=0, y=None, lm_state=None, lm_scores=None, ngram_lm_state=None, tokens=None, last_token=None, token_duration=None, last_frame=None)]


Transcribing: 100%|██████████| 1/1 [00:00<00:00, 30.18it/s]

Transcribe text:  [Hypothesis(score=483.499755859375, y_sequence=tensor([ 11,   3,  18,   4,  16,  40,  10,   1, 117,  31,  10,  23,  16, 109,
          2,   1, 122, 114,   3,  35,   1, 111,   3,   1,   8,  44,  16,  46,
         24,   1, 116, 118,  10,  23], device='cuda:0'), text='tiếng cọc kịch cận lại của ối những cớp sách', dec_out=None, dec_state=None, timestamp=[], alignments=None, frame_confidence=None, token_confidence=None, word_confidence=None, length=0, y=None, lm_state=None, lm_scores=None, ngram_lm_state=None, tokens=None, last_token=None, token_duration=None, last_frame=None)]



