# setup

In [2]:
# !pip install -r ../requirements.txt

In [1]:
!mkdir -p scripts
!mkdir -p configs

In [2]:
import os

if not os.path.exists("./scripts/tokenizers/process_asr_text_tokenizer.py"):
  !wget -P scripts/tokenizers/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py

if not os.path.exists("./configs/config_bpe.yaml"):
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/citrinet/config_bpe.yaml

# prepare dataset

In [3]:
data_dir = "/home/jovyan/.cache/nemo-simple-example/"
!mkdir -p $data_dir

In [4]:
import sys
import glob
import os
import subprocess
import tarfile
import wget

def custom_progress_bar(current, total, width=50):
    progress = int(width * current / total)
    sys.stdout.write('\r[{}{}] {:.1f}%'.format(
        '#' * progress, '.' * (width - progress), 100 * current / total))
    sys.stdout.flush()

print("******")
if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):
    print("Download the dataset. This will take a few moments...")
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir, bar=custom_progress_bar)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = data_dir + '/an4_sphere.tar.gz'

if not os.path.exists(data_dir + '/an4/'):
    # Untar and convert .sph to .wav (using sox)
    tar = tarfile.open(an4_path)
    tar.extractall(path=data_dir)

    print("Converting .sph to .wav...")
    sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)
    for sph_path in sph_list:
        wav_path = sph_path[:-4] + '.wav'
        cmd = ["sox", sph_path, wav_path]
        subprocess.run(cmd)
print("Finished conversion.\n******")

******
Tarfile already exists.
Finished conversion.
******


In [5]:
# --- Building Manifest Files --- #
import json
import librosa

# Function to build a manifest
def build_manifest(transcripts_path, manifest_path, wav_path):
    with open(transcripts_path, 'r') as fin:
        with open(manifest_path, 'w') as fout:
            for line in fin:
                # Lines look like this:
                # <s> transcript </s> (fileID)
                transcript = line[: line.find('(')-1].lower()
                transcript = transcript.replace('<s>', '').replace('</s>', '')
                transcript = transcript.strip()

                file_id = line[line.find('(')+1 : -2]  # e.g. "cen4-fash-b"
                audio_path = os.path.join(
                    data_dir, wav_path,
                    file_id[file_id.find('-')+1 : file_id.rfind('-')],
                    file_id + '.wav')

                duration = librosa.core.get_duration(filename=audio_path)

                # Write the metadata to the manifest
                metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": transcript
                }
                json.dump(metadata, fout)
                fout.write('\n')
                
print("******")
print("Building Manifests...")
train_transcripts = data_dir + '/an4/etc/an4_train.transcription'
train_manifest = data_dir + '/an4/train_manifest.json'
if not os.path.isfile(train_manifest):
    build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')
    print("Training manifest created.")

test_transcripts = data_dir + '/an4/etc/an4_test.transcription'
test_manifest = data_dir + '/an4/test_manifest.json'
if not os.path.isfile(test_manifest):
    build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')
    print("Test manifest created.")
print("***Done***")

******
Building Manifests...
***Done***


In [7]:
!head -n 5 {data_dir}/an4/train_manifest.json

{"audio_filepath": "/home/jovyan/.cache/nemo-simple-example/an4/wav/an4_clstk/fash/an251-fash-b.wav", "duration": 1.0, "text": "yes"}
{"audio_filepath": "/home/jovyan/.cache/nemo-simple-example/an4/wav/an4_clstk/fash/an253-fash-b.wav", "duration": 0.7, "text": "go"}
{"audio_filepath": "/home/jovyan/.cache/nemo-simple-example/an4/wav/an4_clstk/fash/an254-fash-b.wav", "duration": 0.9, "text": "yes"}
{"audio_filepath": "/home/jovyan/.cache/nemo-simple-example/an4/wav/an4_clstk/fash/an255-fash-b.wav", "duration": 2.6, "text": "u m n y h six"}
{"audio_filepath": "/home/jovyan/.cache/nemo-simple-example/an4/wav/an4_clstk/fash/cen1-fash-b.wav", "duration": 3.5, "text": "h i n i c h"}


# build tokenizer

In [12]:
!python ./scripts/tokenizers/process_asr_text_tokenizer.py \
  --manifest="{data_dir}/an4/train_manifest.json" \
  --data_root="{data_dir}/tokenizers/an4/" \
  --vocab_size=32 \
  --tokenizer="spe" \
  --no_lower_case \
  --spe_type="unigram" \
  --log

INFO:root:Corpus already exists at path : /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/text_corpus/document.txt
[NeMo I 2025-07-19 08:45:15 nemo_logging:393] Processing /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/text_corpus/document.txt and store at /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/tokenizer_spe_unigram_v32
sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/home/jovyan/.cache/nemo-simple-example//tokenizers/an4/text_corpus/document.txt --model_prefix=/home/jovyan/.cache/nemo-simple-example//tokenizers/an4/tokenizer_spe_unigram_v32/tokenizer --vocab_size=32 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=unigram --character_coverage=1.0 --bos_id=-1 --eos_id=-1 --remove_extra_whitespaces=false
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/text_corpus/document.txt
  input_format: 
  model_prefix: /home/jovyan/.ca

In [13]:
!head -n 10 {data_dir}/tokenizers/an4/tokenizer_spe_unigram_v32/vocab.txt

▁
##e
##t
##r
##o
##a
##h
one
two
##u


# model training

In [8]:
import nemo
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf, open_dict

In [9]:
params = OmegaConf.load("./configs/config_bpe.yaml")
print(OmegaConf.to_yaml(params))

name: ContextNet5x1
sample_rate: 16000
repeat: 1
dropout: 0.0
separable: true
model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shard_strategy: scatter
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8
    pin_memory: true
  tokenizer:
    dir: ???
    type: ???
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: per_feature
    window_size: 0.02
    sample_rate: 16000
    window_stride: 0.01
    window: hann
    features: 64
    n_fft: 512
    frame_splicing: 1
    dither: 1.0e-05
  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rec

In [45]:
data_dir = "/home/jovyan/.cache/nemo-simple-example/"
train_manifest = data_dir + '/an4/train_manifest.json'
test_manifest = data_dir + '/an4/test_manifest.json'

params.model.train_ds.manifest_filepath = train_manifest
params.model.validation_ds.manifest_filepath = test_manifest

params.model.spec_augment.rect_masks = 0

In [46]:
params.model.tokenizer.dir = data_dir + "/tokenizers/an4/tokenizer_spe_unigram_v32/"
params.model.tokenizer.type = "bpe"

In [16]:
import lightning.pytorch as pl
trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [17]:
params.model.train_ds.batch_size = 8

In [18]:
first_asr_model = nemo_asr.models.EncDecCTCModelBPE(cfg=params.model, trainer=trainer)

[NeMo I 2025-07-19 09:07:18 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] 
    Replacing placeholder number of classes (-1) with actual number of classes - 32
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] Dataset loaded with 948 files totalling 0.71 hours
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] 0 files were filtered totalling 0.00 hours
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] Dataset loaded with 130 files totalling 0.10 hours
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] 0 files were filtered totalling 0.00 hours
[NeMo I 2025-07-19 09:07:18 nemo_logging:393] PADDING: 16


In [19]:
trainer.fit(first_asr_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-07-19 09:07:29 nemo_logging:393] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.999]
        capturable: False
        decoupled_weight_decay: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.1
        maximize: False
        weight_decay: 0.0001
    )
[NeMo I 2025-07-19 09:07:29 nemo_logging:393] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f68b218e1e0>" 
    will be used during training (effective maximum steps = 1190) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: 0.05
    min_lr: 1.0e-06
    last_epoch: -1
    max_steps: 1190
    )



  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConvASREncoder                    | 1.2 M  | train
2 | decoder           | ConvASRDecoder                    | 33.8 K | train
3 | loss              | CTCLoss                           | 0      | train
4 | spec_augmentation | SpectrogramAugmentation           | 0      | train
5 | wer               | WER                               | 0      | train
--------------------------------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.852     Total estimated model params size (MB)
134       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [21]:
first_asr_model.save_to(f"{data_dir}/first_model.nemo")

In [22]:
print(params.model.optim)

{'name': 'adam', 'lr': 0.1, 'betas': [0.9, 0.999], 'weight_decay': 0.0001, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': None, 'warmup_ratio': 0.05, 'min_lr': 1e-06, 'last_epoch': -1}}


# inference

In [33]:
data_dir = "/home/jovyan/.cache/nemo-simple-example/"

In [24]:
first_asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(f"{data_dir}/first_model.nemo")

[NeMo I 2025-07-19 09:11:00 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 32 tokens


[NeMo W 2025-07-19 09:11:00 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /home/jovyan/.cache/nemo-simple-example//an4/train_manifest.json
    sample_rate: 16000
    batch_size: 8
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shard_strategy: scatter
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-07-19 09:11:00 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /home/jovyan/.cache/nemo-simple-example//an4/te

[NeMo I 2025-07-19 09:11:00 nemo_logging:393] PADDING: 16
[NeMo I 2025-07-19 09:11:00 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /home/jovyan/.cache/nemo-simple-example/first_model.nemo.


In [27]:
print(first_asr_model.transcribe(audio=[data_dir + '/an4/wav/an4_clstk/mgah/cen2-mgah-b.wav',
                                                    data_dir + '/an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav',
                                                    data_dir + '/an4/wav/an4_clstk/fmjd/cen8-fmjd-b.wav',
                                                    data_dir + '/an4/wav/an4_clstk/fkai/cen8-fkai-b.wav'],
                                 batch_size=4))


Transcribing: 100%|██████████| 1/1 [00:00<00:00, 30.11it/s]

[Hypothesis(score=tensor(-1.2118), y_sequence=tensor([32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32,  1,  1, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32,  1, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,  1,  1, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32,  1, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32,  1, 32, 32, 32, 32, 32, 32, 32, 32, 32




In [29]:
# Bigger batch-size = bigger throughput
params['model']['validation_ds']['batch_size'] = 16

# Setup the test data loader and make sure the model is on GPU
first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])
first_asr_model.cuda()
first_asr_model.eval()

# We remove some preprocessing artifacts which benefit training
first_asr_model.preprocessor.featurizer.pad_to = 0
first_asr_model.preprocessor.featurizer.dither = 0.0

# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.
# WER is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
for test_batch in first_asr_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]        
        log_probs, encoded_len, greedy_predictions = first_asr_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # Notice the model has a helper object to compute WER
        first_asr_model.wer.update(greedy_predictions, None, targets, targets_lengths)
        _, wer_num, wer_denom = first_asr_model.wer.compute()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

# We need to sum all numerators and denominators first. Then divide.
print(f"WER = {sum(wer_nums)/sum(wer_denoms)}")

[NeMo I 2025-07-19 09:16:15 nemo_logging:393] Dataset loaded with 130 files totalling 0.10 hours
[NeMo I 2025-07-19 09:16:15 nemo_logging:393] 0 files were filtered totalling 0.00 hours


[NeMo W 2025-07-19 09:16:16 nemo_logging:405] Passing in decoder_lengths=None for CTC decoding is likely to be an error, since it is unlikely that each element of your batch has exactly the same length. decoder_lengths will default to decoder_output.shape[0].


WER = 0.9547218628719275


# model finetuning

In [3]:
data_dir = "/home/jovyan/.cache/nemo-simple-example/"
train_manifest = data_dir + '/an4/train_manifest.json'
test_manifest = data_dir + '/an4/test_manifest.json'

In [6]:
import copy
from omegaconf import OmegaConf, open_dict

params = OmegaConf.load("./configs/config_bpe.yaml")
params.model.train_ds.manifest_filepath = train_manifest
params.model.validation_ds.manifest_filepath = test_manifest
params.model.spec_augment.rect_masks = 0

new_opt = copy.deepcopy(params.model.optim)
new_opt.lr = 0.1

In [7]:
!python ./scripts/tokenizers/process_asr_text_tokenizer.py \
  --manifest="{data_dir}/an4/train_manifest.json" \
  --data_root="{data_dir}/tokenizers/an4/" \
  --vocab_size=64 \
  --tokenizer="wpe" \
  --no_lower_case \
  --log

INFO:root:Corpus already exists at path : /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/text_corpus/document.txt
[2K[00:00:00] Tokenize words                 ██████████████████ 99       /       99[00:00:00] Tokenize words                 ██████████████████ 0        /        0
[2K[00:00:00] Count pairs                    ██████████████████ 99       /       99
[2K[00:00:00] Compute merges                 ██████████████████ 11       /       11
Serialized tokenizer at location : /home/jovyan/.cache/nemo-simple-example//tokenizers/an4/tokenizer_wpe_v64
INFO:root:Done!


In [14]:
import lightning.pytorch as pl
import nemo.collections.asr as nemo_asr

restored_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(f"{data_dir}/first_model.nemo")

[NeMo I 2025-07-19 09:31:28 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 32 tokens


[NeMo W 2025-07-19 09:31:28 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /home/jovyan/.cache/nemo-simple-example//an4/train_manifest.json
    sample_rate: 16000
    batch_size: 8
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shard_strategy: scatter
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-07-19 09:31:28 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /home/jovyan/.cache/nemo-simple-example//an4/te

[NeMo I 2025-07-19 09:31:28 nemo_logging:393] PADDING: 16
[NeMo I 2025-07-19 09:31:28 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /home/jovyan/.cache/nemo-simple-example/first_model.nemo.


In [15]:
print(restored_model.decoder.vocabulary)

restored_model.change_vocabulary(
    new_tokenizer_dir=data_dir + "/tokenizers/an4/tokenizer_wpe_v64/",
    new_tokenizer_type="wpe"
)

[NeMo W 2025-07-19 09:31:31 nemo_logging:405] You tried to register an artifact under config key=tokenizer.vocab_path but an artifact for it has already been registered.


['<unk>', '▁', 'e', 't', 'r', 'o', 'a', 'h', '▁one', '▁two', 'u', 's', 'y', '▁six', '▁five', 'i', 'b', 'l', 'p', 'd', 'g', 'f', 'm', 'c', 'v', 'x', 'j', 'k', 'z', 'w', 'n', 'q']
[NeMo I 2025-07-19 09:31:31 nemo_logging:393] Tokenizer AutoTokenizer initialized with 64 tokens
[NeMo I 2025-07-19 09:31:31 nemo_logging:393] 
    Replacing old number of classes (32) with new number of classes - 64
[NeMo I 2025-07-19 09:31:31 nemo_logging:393] Changed tokenizer to ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '##i', '##n', '##e', '##t', '##y', '##w', '##o', '##x', '##h', '##r', '##u', '##s', '##a', '##d', '##v', '##g', '##b', '##c', '##p', '##m', '##l', '##f', '##ne', '##ve', 'tw', 'one', 'two', '##ty', 'fi', '##ou', 'si', 'six', 'five'] vocabulary.


In [16]:
params['model']['train_ds']["batch_size"] = 8

In [17]:
restored_model.setup_optimization(optim_config=new_opt)
restored_model.setup_training_data(train_data_config=params['model']['train_ds'])
restored_model.setup_validation_data(val_data_config=params['model']['validation_ds'])

# # Freeze the encoder layers (should not be done for finetuning, only done for demo)
restored_model.encoder.freeze()

[NeMo W 2025-07-19 09:31:33 nemo_logging:405] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2025-07-19 09:31:33 nemo_logging:393] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.999]
        capturable: False
        decoupled_weight_decay: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.1
        maximize: False
        weight_decay: 0.0001
    )


[NeMo W 2025-07-19 09:31:33 nemo_logging:405] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


[NeMo I 2025-07-19 09:31:33 nemo_logging:393] Dataset loaded with 948 files totalling 0.71 hours
[NeMo I 2025-07-19 09:31:33 nemo_logging:393] 0 files were filtered totalling 0.00 hours
[NeMo I 2025-07-19 09:31:33 nemo_logging:393] Dataset loaded with 130 files totalling 0.10 hours
[NeMo I 2025-07-19 09:31:33 nemo_logging:393] 0 files were filtered totalling 0.00 hours


In [18]:
trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=20)
trainer.fit(restored_model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-07-19 09:31:34 nemo_logging:393] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.999]
        capturable: False
        decoupled_weight_decay: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.1
        maximize: False
        weight_decay: 0.0001
    )
[NeMo I 2025-07-19 09:31:34 nemo_logging:393] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f84344c66c0>" 
    will be used during training (effective maximum steps = 2380) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: 0.05
    min_lr: 1.0e-06
    last_epoch: -1
    max_steps: 2380
    )



  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConvASREncoder                    | 1.2 M  | eval 
2 | spec_augmentation | SpectrogramAugmentation           | 0      | train
3 | wer               | WER                               | 0      | train
4 | decoder           | ConvASRDecoder                    | 66.6 K | train
5 | loss              | CTCLoss                           | 0      | train
--------------------------------------------------------------------------------
66.6 K    Trainable params
1.2 M     Non-trainable params
1.2 M     Total params
4.983     Total estimated model params size (MB)
8         Modules in train mode
126       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.
