Install wandb to  support logging all the metrices while training.

In [1]:
pip install wandb

Note: you may need to restart the kernel to use updated packages.


Create a file to store wandb api in a file named 'api.txt'

In [None]:
# Define the filename and the text to be written
filename = 'api.txt'
text_to_write = "502da9cf7b700db4af3b8db0979a771fff1634b8"
# Open the file in write mode
with open(filename, 'w') as file:
    # Write the text to the file
    file.write(text_to_write)

print(f"'{filename}' has been created and written with the specified text.")


Login to your wandb account.

In [None]:
import wandb

wandb.login("your_api_key_here")


Read the API key from 'api.txt'

In [4]:
import os
with open("api.txt") as f:
    os.environ["WANDB_API_KEY"] = f.read().strip()

Clone the repository to your local machine

In [5]:
!git clone https://github.com/NVIDIA/NeMo.git 

fatal: destination path 'NeMo' already exists and is not an empty directory.


Install other necessary libraries

In [6]:
pip install pybind11

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install wheel setuptools pip --upgrade

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install fasttext

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install nemo_toolkit['asr']==2.1.0



In [10]:
import nemo

print(nemo.__version__)


2.1.0


Some files from NeMo are rewritten here because NeMo wasn't able to read them from the local NeMo folder.

In [11]:
file_path = "NeMo/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py"

# The new content to write to the file
new_content = '''
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
# Preparing the Tokenizer for the dataset
Use the `process_asr_text_tokenizer.py` script under <NEMO_ROOT>/scripts/tokenizers/ in order to prepare the tokenizer.

```sh
python <NEMO_ROOT>/scripts/tokenizers/process_asr_text_tokenizer.py \
        --manifest=<path to train manifest files, seperated by commas>
        OR
        --data_file=<path to text data, seperated by commas> \
        --data_root="<output directory>" \
        --vocab_size=<number of tokens in vocabulary> \
        --tokenizer=<"spe" or "wpe"> \
        --no_lower_case \
        --spe_type=<"unigram", "bpe", "char" or "word"> \
        --spe_character_coverage=1.0 \
        --log
```

# Training the model
```sh
python speech_to_text_ctc_bpe.py \
    # (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
    model.train_ds.manifest_filepath=<path to train manifest> \
    model.validation_ds.manifest_filepath=<path to val/test manifest> \
    model.tokenizer.dir=<path to directory of tokenizer (not full path to the vocab file!)> \
    model.tokenizer.type=<either bpe or wpe> \
    trainer.devices=-1 \
    trainer.accelerator="gpu" \
    trainer.strategy="ddu"
    trainer.max_epochs=100 \
    model.optim.name="adamw" \
    model.optim.lr=0.001 \
    model.optim.betas=[0.9,0.999] \
    model.optim.weight_decay=0.0001 \
    model.optim.sched.warmup_steps=2000
    exp_manager.create_wandb_logger=True \
    exp_manager.wandb_logger_kwargs.name="<Name of experiment>" \
    exp_manager.wandb_logger_kwargs.project="<Name of project>"
```

# Fine-tune a model

For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations

# Pretrained Models

For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html

"""

import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

import sys
import importlib.util

# Path to trainer_utils.py
file_path = "NeMo/nemo/utils/trainer_utils.py"

# Load the module from the file path
spec = importlib.util.spec_from_file_location("trainer_utils", file_path)
trainer_utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(trainer_utils)

# Now you can use resolve_trainer_cfg
resolve_trainer_cfg = trainer_utils.resolve_trainer_cfg

@hydra_runner(config_path="../conf/citrinet/", config_name="config_bpe")
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer))
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)


if __name__ == '__main__':
    main()

'''

# Open the file in write mode and overwrite the content
with open(file_path, "w") as file:
    file.write(new_content)
    
print(f"File {file_path} has been updated.")

File NeMo/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py has been updated.


Set the config.yaml file as it eases the process of trying different values for different parameters.

In [13]:
import yaml

config = """
name: "Conformer-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  tokenizer:
    dir: ???
    type: bpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 10
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1
    n_layers: 18
    d_model: 256
    subsampling: striding
    subsampling_factor: 4
    subsampling_conv_channels: -1
    causal_downsampling: false
    ff_expansion_factor: 4
    self_attention_model: rel_pos
    n_heads: 4
    att_context_size: [-1, -1]
    att_context_style: regular
    xscaling: true
    untie_biases: true
    pos_emb_max_len: 5000
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm'
    conv_context_size: null
    dropout: 0.1
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0
    dropout_att: 0.1
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  interctc:
    loss_weights: []
    apply_at_layers: []

  optim:
    name: adamw
    lr: 0.4
    betas: [0.9, 0.98]
    weight_decay: 1e-3
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6

trainer:
  devices: -1
  num_nodes: 1
  max_epochs: 50
  max_steps: -1
  val_check_interval: 1.0
  accelerator: auto
  
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32
  log_every_n_steps: 10
  enable_progress_bar: True
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: true
  enable_checkpointing: False
  logger: false
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True
  
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
  create_wandb_logger: True
  wandb_logger_kwargs:
    name: "lr-0.4"
    project: "Conformer0.4_FULL"
 
"""

with open('config.yaml', 'w') as file:
    file.write(config)

print("config.yaml file has been created.")


config.yaml file has been created.


In [16]:
code = '''
import argparse
import json
import logging
import os
from typing import List, Optional

import tokenizers

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model
from nemo.utils.data_utils import DataStoreObject

parser = argparse.ArgumentParser(description='Create tokenizer')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--manifest", default=None, type=str, help='Comma separated list of manifest files')
group.add_argument("--data_file", default=None, help='data file from which to create tokenizer model')
parser.add_argument("--data_root", required=True, default=None, type=str, help='Output directory')
parser.add_argument("--vocab_size", default=1024, type=int, help='Vocabulary size')
parser.add_argument("--tokenizer", default="wpe", choices=["spe", "wpe"], help='Type of tokenization to perform')
parser.add_argument(
    "--spe_type",
    default="bpe",
    choices=['bpe', 'unigram', 'char', 'word'],
    help='Type of the SentencePiece model. Can be `bpe`, `unigram`, `char` or `word`.'
    'Used only if --tokenizer == `spe`',
)
parser.add_argument(
    '--spe_character_coverage',
    type=float,
    default=1.0,
    help="Character coverage percentage for SentencePiece tokenization. For languages "
    "with large vocabulary, should be close to 0.9995, otherwise kept as 1.0",
)
parser.add_argument('--spe_bos', action='store_true', help='Add <s> token to SentencePiece Tokenizer.')
parser.add_argument('--spe_eos', action='store_true', help='Add </s> token to SentencePiece Tokenizer.')
parser.add_argument('--spe_pad', action='store_true', help='Add <pad> token to SentencePiece Tokenizer.')
parser.add_argument(
    '--spe_user_defined_symbols', default=None, type=str, nargs='+', help='User defined symbols for SentencePiece'
)
parser.add_argument(
    '--spe_control_symbols', default=None, type=str, nargs='+', help='Control symbols for SentencePiece'
)
parser.add_argument('--spe_split_digits', action='store_true', help='Split digits into separate tokens.')
parser.add_argument(
    '--spe_remove_extra_whitespaces',
    action='store_true',
    help='Remove leading, trailing, and duplicate internal whitespace.',
)

parser.add_argument(
    '--spe_sample_size',
    type=int,
    default=-1,
    help="Samples the dataset by `sample_size` if positive integer, otherwise uses whole dataset",
)
parser.add_argument('--spe_train_extremely_large_corpus', action='store_true', help='')
parser.add_argument(
    '--spe_max_sentencepiece_length',
    type=int,
    default=-1,
    help='Limit the maximum number of tokens in each SentencePiece subword. '
    'Must be a positive integer > 0. By default places no limit on subword length.',
)
parser.add_argument(
    '--spe_no_split_by_unicode_script',
    dest='spe_split_by_unicode_script',
    action='store_false',
    help="Don't use Unicode script to split sentence pieces.",
)
parser.add_argument(
    '--spe_byte_fallback',
    dest='spe_byte_fallback',
    action='store_true',
    help="If <unk>, fallback to a byte sequence of the characters.",
)
parser.add_argument('--no_lower_case', dest='lower_case', action='store_false')
parser.add_argument("--log", action='store_true')
parser.set_defaults(log=False, lower_case=True, spe_train_extremely_large_corpus=False)
args = parser.parse_args()


def __build_document_from_manifests(
    data_root: str,
    manifests: str,
):
    if ',' in manifests:
        manifests = manifests.split(',')
    else:
        manifests = [manifests]

    document_dir = os.path.join(data_root, 'text_corpus')
    if not os.path.exists(document_dir):
        os.makedirs(document_dir)

    document_path = os.path.join(document_dir, 'document.txt')

    if os.path.exists(document_path):
        logging.info('Corpus already exists at path : %s', document_path)
        return document_path

    num_lines = 0
    with open(document_path, 'w') as out_writer:
        for manifest in manifests:
            with open(DataStoreObject(manifest).get(), 'r') as in_reader:
                for line in in_reader:
                    item = json.loads(line)
                    text = item['text']

                    out_writer.write(text + '\\n')
                    out_writer.flush()

                    num_lines += 1

            logging.info(f"Finished extracting manifest : {manifest}")

        logging.info("Finished extracting all manifests ! Number of sentences : {}".format(num_lines))
    return document_path


def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    spe_split_by_unicode_script: bool,
    spe_bos: bool,
    spe_eos: bool,
    spe_pad: bool,
    spe_control_symbols: Optional[List[str]],
    spe_user_defined_symbols: Optional[List[str]],
    spe_byte_fallback: bool,
    spe_split_digits: bool,
    spe_remove_extra_whitespaces: bool,
    lower_case: bool,
):
    """
    Converts flac to wav and build manifests's json
    Args:
        text_path: source with text lines
        dst_folder: where wav files will be stored
        vocab_size: vocabular size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        spe_sample_size: int, default of -1. If positive integer is used, samples the dataset
            by given sample size.
        spe_train_extremely_large_corpus: bool. If dataset is too large, and user has sufficient RAM,
            this flag can be set to try to trained the tokenizer. Will silently fail if it runs out of RAM.
        spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
        spe_bos: Bool flag, whether to add <s> to SentencePiece tokenizer vocabulary.
        spe_eos: Bool flag, whether to add </s> to SentencePiece tokenizer vocabulary.
        spe_pad: Bool flag, whether to add <pad> to SentencePiece tokenizer vocabulary.
        spe_control_symbols: control symbols to add to tokenizer, as defined by sentencepiece.
            These tokens get removed at decode time and are not encoded from the text - can only be added to the input programatically.
        spe_user_defined_symbols: user symbols to add to tokenizer, as defined by sentencepiece.
            These tokens remain in the decoded text and are encoded automatically when present in the input text.
        spe_byte_fallback: If <unk>, fallback to a byte sequence of the character.
        spe_split_digits: If true, digits are split into individual tokens.
        spe_remove_extra_whitespaces: If true, removes leading, trailing, and duplicate internal whitespace.
        lower_case: whether to tokenize with lower case character set only (for english)
    Returns:
    """
    if tokenizer_type == 'spe':

        # Prepare directory of tokenizer
        if spe_max_sentencepiece_length > 0:
            tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}_max_{}').format(
                tokenizer_type, spe_type, vocab_size, spe_max_sentencepiece_length
            )
        else:
            tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(
                tokenizer_type, spe_type, vocab_size
            )

        if spe_pad:
            tokenizer_dir = f'{tokenizer_dir}_pad'
        if spe_bos:
            tokenizer_dir = f'{tokenizer_dir}_bos'
        if spe_eos:
            tokenizer_dir = f'{tokenizer_dir}_eos'

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file !")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        # Build tokenizer
        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
            split_by_unicode_script=spe_split_by_unicode_script,
            bos=spe_bos,
            eos=spe_eos,
            pad=spe_pad,
            control_symbols=spe_control_symbols,
            user_defined_symbols=spe_user_defined_symbols,
            byte_fallback=spe_byte_fallback,
            split_digits=spe_split_digits,
            
        )

    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir


def main():
    data_root = args.data_root
    manifests = args.manifest
    data_file = args.data_file
    vocab_size = args.vocab_size
    tokenizer = args.tokenizer
    spe_type = args.spe_type
    spe_character_coverage = args.spe_character_coverage
    spe_sample_size = args.spe_sample_size
    spe_train_extremely_large_corpus = args.spe_train_extremely_large_corpus
    spe_max_sentencepiece_length = args.spe_max_sentencepiece_length
    spe_split_by_unicode_script = args.spe_split_by_unicode_script
    spe_bos, spe_eos, spe_pad = args.spe_bos, args.spe_eos, args.spe_pad
    spe_control_symbols = args.spe_control_symbols
    spe_user_defined_symbols = args.spe_user_defined_symbols
    spe_byte_fallback = args.spe_byte_fallback
    spe_split_digits = args.spe_split_digits
    spe_remove_extra_whitespaces = args.spe_remove_extra_whitespaces
    lower_case = args.lower_case

    # Get the data
    if not data_file:
        document_path = __build_document_from_manifests(
            data_root=data_root,
            manifests=manifests
        )
    else:
        document_path = data_file

    # Process and create the tokenizer
    tokenizer_dir = __process_data(
        text_path=document_path,
        dst_folder=data_root,
        vocab_size=vocab_size,
        tokenizer_type=tokenizer,
        spe_type=spe_type,
        spe_character_coverage=spe_character_coverage,
        spe_train_extremely_large_corpus=spe_train_extremely_large_corpus,
        spe_sample_size=spe_sample_size,
        spe_max_sentencepiece_length=spe_max_sentencepiece_length,
        spe_split_by_unicode_script=spe_split_by_unicode_script,
        spe_bos=spe_bos,
        spe_eos=spe_eos,
        spe_pad=spe_pad,
        spe_control_symbols=spe_control_symbols,
        spe_user_defined_symbols=spe_user_defined_symbols,
        spe_byte_fallback=spe_byte_fallback,
        spe_split_digits=spe_split_digits,
        spe_remove_extra_whitespaces=spe_remove_extra_whitespaces,
        lower_case=lower_case,
    )
    logging.info('Created tokenizer model in directory : %s', tokenizer_dir)


if __name__ == "__main__":
    main()
'''

with open('NeMo/scripts/tokenizers/process_asr_text_tokenizer.py', 'w') as f:
    f.write(code)


In [18]:

file_path = "NeMo/nemo/collections/common/tokenizers/canary_tokenizer.py"

# The new content to write to the file
new_content = '''
# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from functools import cached_property
from pathlib import Path
from typing import Dict, List

from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model

from nemo.utils import logging

__all__ = ['CanaryTokenizer']

# Default tokens for compatibility with Canary.
CANARY_BOS = "<|startoftranscript|>"
CANARY_EOS = "<|endoftext|>"
CANARY_PAD = "<pad>"
CANARY_NOSPEECH = "<|nospeech|>"
CANARY_PNC = "<|pnc|>"
CANARY_NOPNC = "<|nopnc|>"
CANARY2_BOCTX = "<|startofcontext|>"
DEFAULT_TOKENS = [CANARY_NOSPEECH, CANARY_PAD, CANARY_EOS, CANARY_BOS, CANARY_PNC, CANARY_NOPNC]

CANARY_SPECIAL_TOKENIZER = "spl_tokens"


class CanaryTokenizer(AggregateTokenizer):
    """
    Thin wrapper around AggregateTokenizer to provide quick access to special tokens
    """

    def __init__(self, tokenizers: Dict):
        super().__init__(tokenizers)

        # for easy access of special tokens
        self.special_tokens = {}
        for special in tokenizers[CANARY_SPECIAL_TOKENIZER].vocab:
            # Search for special prompting tokens
            if (special.startswith("<|") and special.endswith("|>")) or special == CANARY_PAD:
                self.special_tokens[special] = self.token_to_id(special, lang_id=CANARY_SPECIAL_TOKENIZER)

    @cached_property
    def eos_id(self) -> int:
        return self.special_tokens[CANARY_EOS]

    @cached_property
    def bos_id(self) -> int:
        return self.special_tokens[CANARY_BOS]

    @cached_property
    def nospeech_id(self) -> int:
        return self.special_tokens[CANARY_NOSPEECH]

    @cached_property
    def pad_id(self) -> int:
        return self.special_tokens[CANARY_PAD]

    def _text_with_timestamps_to_ids(self, text_without_timestamps, time_text, lang_id) -> list[int]:
        trans_words = text_without_timestamps.split()

        # Get timestamp ids
        time_ids = self._tokenize_special_prompt(time_text)

        # Tokenize text word by wordd
        word_ids = []
        result_ids = []
        time_index = 0

        timestamp_every_n_words = 1  # Add timestmap for every N words
        word_index = 0
        # Both start and end time
        for word in trans_words:
            # Insert the first time_id once
            if word_index == 0 and time_index < len(time_ids):
                result_ids.append(time_ids[time_index])
                time_index += 1
            # Tokenize the word
            word_ids += super().text_to_ids(word, lang_id)
            result_ids += super().text_to_ids(word, lang_id)
            word_index += 1
            # Insert time ids every N words after the first one
            if word_index % timestamp_every_n_words == 0 and word_index != 0 and time_index < len(time_ids):
                result_ids.append(time_ids[time_index])
                time_index += 1
                if time_index < len(time_ids):
                    result_ids.append(time_ids[time_index])
                    time_index += 1
            else:
                time_index += 2
        # Ensure the last time_id is appended at the end
        if time_index < len(time_ids):
            result_ids.append(time_ids[-1])
        # Make sure the last time_id is appended only once
        if time_index < len(time_ids) and result_ids[-1] != (time_ids[-1]):
            result_ids.append(time_ids[-1])
        return result_ids

    def _text_to_ids_maybe_with_timestamps(self, text_no_eos, lang_id) -> list[int]:
        time_pattern = re.compile(r"<\|\d+\|>")
        time_text = "".join(time_pattern.findall(text_no_eos))
        has_timestamp = bool(time_text)
        if not has_timestamp:
            return super().text_to_ids(text_no_eos, lang_id)
        else:
            text_without_timestamps = time_pattern.sub("", text_no_eos).strip()
            return self._text_with_timestamps_to_ids(text_without_timestamps, time_text, lang_id)

    def text_to_ids(self, text, lang_id) -> list[int]:
        if lang_id == CANARY_SPECIAL_TOKENIZER:
            return self._tokenize_special_prompt(text)
        lang_id = _map_canary1_to_canary2_lang(lang_id, self.langs)
        if text.endswith(CANARY_EOS):
            return self._text_to_ids_maybe_with_timestamps(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id]
        return self._text_to_ids_maybe_with_timestamps(text, lang_id)

    def _tokenize_special_prompt(self, text: str) -> list[int]:
        """
        Tokenize the input special prompt of Canary family of models.

        Required because otherwise self.text_to_ids() returns a different result than what Canary had been trained with.
        """
        ans = []

        if text.startswith(CANARY2_BOCTX):
            # Canary 2 prompt format. It starts with decoder context, which should be tokenized using
            # a different tokenizer than spl_tokens. We don't really know what it is, so we'll use the
            # following HACK solution: look up 5th token which is target_lang and tokenize this part
            # using its tokenizer. We skip this when decoder context is empty.
            ans.append(self.special_tokens[CANARY2_BOCTX])
            text = text[len(CANARY2_BOCTX) :]
            ctx_end_idx = text.find(CANARY_BOS)
            if decoder_ctx := text[:ctx_end_idx]:
                target_lang = text.split("<|")[4].replace("|>", "")  # sorry
                ans.extend(self.text_to_ids(decoder_ctx, target_lang))
                text = text[ctx_end_idx:]

        num_special_tokens = text.count(">")
        for _ in range(num_special_tokens):
            token = text[: text.find(">") + 1]
            ans.append(self.special_tokens[token])
            text = text[len(token) :]
        assert len(text) == 0, text
        return ans

    def spl_token_to_id(self, token):
        if token_id := self.special_tokens.get(f"<|{token}|>", None):
            return token_id
        raise KeyError(f"Token {token} not found in tokenizer.")
    from typing import Union
    @staticmethod
    def build_special_tokenizer(
        tokens: List[str], model_dir: Union[str, Path], force_rebuild: bool = False
    ) -> SentencePieceTokenizer:
        if force_rebuild:
            logging.info("Building special tokenizer")
            # Checks for artifacts of previous build.
            for file in ["tokenizer.model", "tokenizer.vocab", "vocab.txt", "train_text.txt"]:
                if os.path.exists(file):
                    os.remove(file)
        spl_tok_re = re.compile(r"<\|.+\|>")
        tokens = DEFAULT_TOKENS + [f"<|{t}|>" if spl_tok_re.match(t) is None else t for t in tokens]
        tokens = list(dict.fromkeys(tokens))  # remove duplicates while preserving order
        output_dir = Path(model_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        text_path = output_dir / "train_text.txt"
        train_text = "\n".join(tokens)
        text_path.write_text(train_text)
        model_path = output_dir / "tokenizer.model"
        create_spt_model(
            str(text_path),
            vocab_size=len(tokens) + 2,
            sample_size=-1,
            do_lower_case=False,
            output_dir=str(output_dir),
            user_defined_symbols=tokens,
        )
        spl_tokenizer = SentencePieceTokenizer(str(model_path))
        return spl_tokenizer


def _map_canary1_to_canary2_lang(lang: str, available_langs: list[str]) -> str:
    if len(lang) != 2 or lang in available_langs:
        return lang

    if (
        mapped := {"en": "en-US", "es": "es-ES", "fr": "fr-FR", "de": "de-DE"}.get(lang)
    ) is not None and mapped in available_langs:
        return mapped

    raise RuntimeError(f"Unsupported language: '{lang}' for CanaryTokenizer with languages: {available_langs}")
'''

# Open the file in write mode and overwrite the content
with open(file_path, "w") as file:
    file.write(new_content)
    
print(f"File {file_path} has been updated.")

File NeMo/nemo/collections/common/tokenizers/canary_tokenizer.py has been updated.


  new_content = '''


Tokenize the data using process_asr_text_tokenizer.py from NeMo

In [19]:
!python NeMo/scripts/tokenizers/process_asr_text_tokenizer.py \
    --manifest "dataset/manifest_labels_train.json" \
    --data_root "tokens" \
    --vocab_size 128 \
    --tokenizer "spe" \
    --spe_type "unigram" \
    --spe_max_sentencepiece_length 200 \
    --spe_character_coverage 1.0 \
    --log


[NeMo I 2025-01-10 10:06:32 nemo_logging:393] Processing tokens/text_corpus/document.txt and store at tokens/tokenizer_spe_unigram_v128_max_200
sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tokens/text_corpus/document.txt --model_prefix=tokens/tokenizer_spe_unigram_v128_max_200/tokenizer --vocab_size=128 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=unigram --character_coverage=1.0 --bos_id=-1 --eos_id=-1 --normalization_rule_name=nmt_nfkc_cf --max_sentencepiece_length=200 --remove_extra_whitespaces=false
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokens/text_corpus/document.txt
  input_format: 
  model_prefix: tokens/tokenizer_spe_unigram_v128_max_200/tokenizer
  model_type: UNIGRAM
  vocab_size: 128
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_thread

In [22]:
pip install numpy==1.26.0


Note: you may need to restart the kernel to use updated packages.


Train the model

In [None]:
!python NeMo/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
--config-path="/home/jenny/conformer/cofig" --config-name="config" \
model.train_ds.manifest_filepath="dataset/manifest_labels_train.json" \
model.validation_ds.manifest_filepath="dataset/manifest_labels_val.json" \
model.test_ds.manifest_filepath="dataset/manifest_remaining_test.json" \
model.tokenizer.dir="tokens/tokenizer_spe_unigram_v128_max_200"


    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    
[NeMo I 2025-01-10 10:06:44 nemo_logging:393] Hydra config: name: Conformer-CTC-BPE
    model:
      sample_rate: 16000
      log_prediction: true
      ctc_reduction: mean_batch
      skip_nan_grad: false
      train_ds:
        manifest_filepath: dataset/manifest_labels_train.json
        sample_rate: ${model.sample_rate}
        batch_size: 16
        shuffle: true
        num_workers: 8
        pin_memory: true
        max_duration: 16.7
        min_duration: 0.1
        is_tarred: false
        tarred_audio_filepaths: null
        shuffle_n: 2048
        bucketing_strategy: synced_randomized
        bucketing_batch_size: null
      validation_ds:
        manifest_filepath: dataset/manifest_labels_val.json
        sample_rate: ${model.sample_rate}
        batch_size: 16
        shuffle: false
        use_start_end_token: false
        num_workers: 8
    