In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'r1.17.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

Mount google drive (optional)

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Install huggingface datasets package to load ML Commons People's Speech data

In [None]:
pip install datasets

Load in some relevant packages

In [None]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection - this collections contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr
import os
from datasets import load_dataset
import librosa
import IPython.display as ipd
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

Load in People's Speech data, specifically the test split which is the smallest for convenient downloads with colab

In [None]:
# Load the 'test' configuration of MLCommons/peoples_speech via Hugging Face datasets.
dataset = load_dataset('MLCommons/peoples_speech', name='test')


Optionally print the dataset, an example audio file path, and an example transcription

In [None]:
# Summary of the loaded dataset object.
print(dataset, '\n\n')

# Pull the first row of the 'test' split once and read both fields from it.
first_example = dataset['test'][0]
ex_filepath = first_example['audio']['path']
ex_text = first_example['text']

print(ex_filepath, '\n\n')
print(ex_text)

Optional - load in one audio example and listen

In [None]:
import librosa
import IPython.display as ipd

# Load and listen to the audio file
# librosa.load returns the decoded samples together with the sample rate it used.
audio, sample_rate = librosa.load(ex_filepath)

# NOTE(review): the player is given the file path rather than the `audio` array;
# `rate` is presumably only honoured for raw sample data — confirm against the
# IPython.display.Audio documentation.
ipd.Audio(ex_filepath, rate=sample_rate)

Optional - plot waveform of audio example showing signal amplitude vs time

In [None]:
%matplotlib inline
import librosa.display
import matplotlib.pyplot as plt

# Plot our example audio file's waveform
# The figure size is set through rcParams, so it also applies to later figures.
plt.rcParams['figure.figsize'] = (15,7)
plt.title('Waveform of Audio Example')
plt.ylabel('Amplitude')

# Assigning to `_` keeps the returned plot object from being echoed as cell output.
_ = librosa.display.waveshow(audio)

Optional - example of a "standard" spectrogram... without Mel frequency transformation

In [None]:
import numpy as np

# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)
# np.abs keeps only the magnitude of the complex STFT output.
spec = np.abs(librosa.stft(audio))
# Convert to decibels relative to the largest magnitude in the spectrogram.
spec_db = librosa.amplitude_to_db(spec, ref=np.max)  # Decibels

# Use log scale to view frequencies
librosa.display.specshow(spec_db, y_axis='log', x_axis='time')
plt.colorbar()
# Trailing ';' suppresses the text repr of the returned title object.
plt.title('Audio Spectrogram');

Optional - example of the same audio signal transformed to a Mel spectrogram. Mel spectrogram transforms the frequencies in a non-linear fashion to create a new signal which better represents how humans perceive sounds.

In [None]:
# Plot the mel spectrogram of our sample
# Uses the samples and sample rate returned by librosa.load in the listening cell.
mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
# Convert to decibels relative to the largest value in the mel spectrogram.
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

librosa.display.specshow(
    mel_spec_db, x_axis='time', y_axis='mel')
plt.colorbar()
# Trailing ';' suppresses the text repr of the returned title object.
plt.title('Mel Spectrogram');

**Take a subset of the test split to keep training times down.**

The (commented-out) cell directly below was used for most training runs: 2,400 data points for training and 600 for validation.

In [None]:
# Disabled: original 80/20 split over the first 3000 rows (2400 train / 600 validation).
# Kept for reference only; the next code cell defines the split that is actually used.
# train_frac = 0.8 # fraction of data to use for training
# val_frac = 1 - train_frac # validation split
# n_subset = 3000 # rows of data to use for training
# subset_data = dataset['test'].select(range(n_subset))

# nrows = subset_data.num_rows
# ntrain = int(round(nrows*train_frac,0))
# nval = nrows - ntrain
# print(f'n training samples: {ntrain}')
# print(f'n validation samples: {nval}')

# train_dataset = subset_data.select(range(0, ntrain))
# val_dataset = subset_data.select(range(ntrain, nrows))

# # print(val_dataset)

The code block below keeps the same validation data as above (rows 2400:3000) while increasing the amount of training data. It was used to enlarge the training set later in experimentation.

In [None]:
# Validation keeps the fixed window of rows [2400, 3000); training uses the rows
# on both sides of that window.
val_row_start = 2400
val_row_end = 3000
nval = val_row_end - val_row_start

ntrain = 6 * val_row_start  # total number of training rows

# First training segment: every row before the validation window.
train_row1 = 0
train_row_end1 = val_row_start

# Second training segment: starts right after the window and supplies the
# remaining (ntrain - val_row_start) rows.
train_row2 = val_row_end
train_row_end2 = val_row_end + (ntrain - val_row_start)

# Only the first ntrain + nval rows of the test split are needed.
n_subset = ntrain + nval
subset_data = dataset['test'].select(range(n_subset))
nrows = subset_data.num_rows

print(f'n training samples: {ntrain}')
print(f'n validation samples: {nval}')

train_rows1 = range(train_row1, train_row_end1)
train_rows2 = range(train_row2, train_row_end2)
train_rows = [*train_rows1, *train_rows2]
val_rows = range(val_row_start, val_row_end)

val_dataset = subset_data.select(val_rows)
train_dataset = subset_data.select(train_rows)

**Function for manifest creation**

Define function to create a manifest.json file for the NeMo models. NeMo models use this to retrieve data for training, validation, etc. The file consists of audio file paths, audio duration, and ground-truth transcriptions

In [None]:
import json
import os


def build_manifest(dataset, manifest_path='./../data/', split_name='test', duration_unit='ms', overwrite=False):
    """Write a JSON-lines manifest for ``dataset`` and return the manifest path.

    One JSON object is written per example, with the keys ``audio_filepath``,
    ``duration`` and ``text``.

    Args:
        dataset: Iterable of examples exposing ``num_rows``. Each example must
            provide ``example['audio']['path']``, ``example['text']`` and
            ``example['duration_ms']``.
        manifest_path: Directory the manifest is written to; created if missing.
        split_name: Prefix of the file name, ``<split_name>_manifest.json``.
        duration_unit: Unit of ``example['duration_ms']``. ``'ms'`` divides the
            value by 1000 before writing it; ``'s'`` writes it unchanged.
        overwrite: When False (default) an existing manifest file is left
            untouched and its path is returned. Pass True to regenerate it,
            e.g. after changing the train/validation split.

    Returns:
        Path of the manifest file.

    Raises:
        AssertionError: If ``duration_unit`` is neither ``'ms'`` nor ``'s'``.
    """
    # Validate once, up front, instead of once per row inside the write loop.
    assert duration_unit in ['ms', 's'], 'duration_unit should be either "ms" for milliseconds or "s" for seconds'
    div_by = 1000 if duration_unit == 'ms' else 1

    out_path = os.path.join(manifest_path, split_name+'_manifest.json')
    if os.path.exists(out_path) and not overwrite:
        return out_path

    # open(..., 'w') does not create missing parent directories.
    if manifest_path:
        os.makedirs(manifest_path, exist_ok=True)

    n = dataset.num_rows
    # Report progress roughly five times; max(1, ...) avoids a modulo by zero
    # when the dataset has fewer than five rows.
    progress_step = max(1, n // 5)

    with open(out_path, 'w') as f:
        for i, example in enumerate(dataset):
            if i % progress_step == 0:
                print(f'processing sample {i} of {n} ({round(i/n*100,2)}%)')

            metadata = {
                'audio_filepath': example['audio']['path'],
                'duration': example['duration_ms']/div_by,
                'text': example['text']
            }
            json.dump(metadata, f)
            f.write('\n')
    return out_path

get training and validation data and manifests

In [None]:
# Write both manifests into the current working directory.
# NOTE: build_manifest does not rewrite a manifest file that already exists, so delete
# ./train_manifest.json and ./val_manifest.json after changing the data split.
new_path = '.'
train_manifest = build_manifest(train_dataset, manifest_path=new_path, split_name='train')
val_manifest = build_manifest(val_dataset, manifest_path=new_path, split_name='val')

## Optionally prepare a tokenizer

Note - default tokenizer for pre-trained models was used in fine-tuning

In [None]:
# Download the tokenizer-building script from the NeMo repository (at BRANCH, set in
# the setup cell) unless a local copy already exists under scripts/.
if not os.path.exists("scripts/process_asr_text_tokenizer.py"):
  !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

In [None]:
VOCAB_SIZE = 100  # can be any value above 29
TOKENIZER_TYPE = "spe"  # can be wpe or spe
SPE_TYPE = "unigram"  # can be bpe or unigram

# ------------------------------------------------------------------- #
!rm -r tokenizers/

if not os.path.exists("tokenizers"):
  os.makedirs("tokenizers")

!python scripts/process_asr_text_tokenizer.py \
   --manifest=$train_manifest \
   --data_root="tokenizers" \
   --tokenizer=$TOKENIZER_TYPE \
   --spe_type=$SPE_TYPE \
   --no_lower_case \
   --log \
   --vocab_size=$VOCAB_SIZE

In [None]:
# Tokenizer path
# Pick the tokenizer directory under tokenizers/ and the matching config type string.
if TOKENIZER_TYPE != 'spe':
  TOKENIZER_TYPE_CFG = "wpe"
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_wpe_v{VOCAB_SIZE}")
else:
  # Any 'spe' tokenizer is labelled "bpe" for the config, whatever SPE_TYPE is.
  TOKENIZER_TYPE_CFG = "bpe"
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_spe_{SPE_TYPE}_v{VOCAB_SIZE}")

**Load in some more relevant packages and load in pre-trained model**

In [None]:
from omegaconf import DictConfig, OmegaConf, open_dict
from nemo.utils import logging, exp_manager
from nemo.collections.asr.models import EncDecRNNTBPEModel

# Load the pre-trained checkpoint named "stt_en_conformer_transducer_small".
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_small")

**Modify the pre-trained model configuration/hyperparameters**

In [None]:
import copy
# Work on an independent deep copy: edits to `cfg` do not modify asr_model.cfg.
cfg = copy.deepcopy(asr_model.cfg)

In [None]:
#@title Freeze Encoder { display-mode: "form" }
# The #@title / #@param annotations are Colab form markers; the value is a plain Python literal.
freeze_encoder = True #@param ["False", "True"] {type:"raw"}
# Coerce to a plain bool for the check in the next cell.
freeze_encoder = bool(freeze_encoder)

In [None]:
# Freeze or unfreeze the encoder according to the flag set in the previous cell,
# and log which of the two was applied.
if freeze_encoder:
  asr_model.encoder.freeze()
  logging.info("Model encoder has been frozen")
else:
  asr_model.encoder.unfreeze()
  logging.info("Model encoder has been un-frozen")

Fused batching is unique to transducers and can help reduce memory consumption.

In [None]:
# Record the fused loss/WER settings on the model config.
# NOTE(review): this only edits the config of an already-constructed model — confirm
# whether the live joint module picks these values up without being re-configured.
asr_model.cfg.joint.experimental_fuse_loss_wer = True
asr_model.cfg.joint.fused_batch_size = 8

Modify training, validation, and test dataset configurations

In [None]:
# The edits below target `cfg` (the deep copy of the model config), so `cfg` is the
# object that has to be opened for modification — not asr_model.cfg.
with open_dict(cfg):
  # Train dataset
  cfg.train_ds.manifest_filepath = train_manifest
  cfg.train_ds.is_tarred = False
  cfg.train_ds.tarred_audio_filepaths = None
  cfg.train_ds.batch_size = 32
  cfg.train_ds.num_workers = 2
  cfg.train_ds.pin_memory = True
  cfg.train_ds.trim_silence = True

  # validation
  cfg.validation_ds.manifest_filepath = val_manifest
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 2
  cfg.validation_ds.pin_memory = True
  # cfg.validation_ds.trim_silence = True

  # test
  # No test manifest is configured for this notebook.
  cfg.test_ds.manifest_filepath = None

In [None]:
# setup data loaders with new configs
# Each call receives the corresponding section of the edited copy `cfg`.
asr_model.setup_training_data(cfg.train_ds)
asr_model.setup_validation_data(cfg.validation_ds)
# cfg.test_ds.manifest_filepath was set to None in the previous cell.
asr_model.setup_test_data(cfg.test_ds)

Modify the optimization and decoding configurations

In [None]:
# Optimizer and scheduler settings.
with open_dict(asr_model.cfg.optim):
  asr_model.cfg.optim.lr = 0.001
  asr_model.cfg.optim.betas = [0.95, 0.5]  # from paper
  asr_model.cfg.optim.weight_decay = 0.001  # Original weight decay
  asr_model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
  asr_model.cfg.optim.sched.warmup_ratio = 0.05  # 5 % warmup
  asr_model.cfg.optim.sched.min_lr = 1e-5

#asr_model.cfg.decoding.greedy.max_symbols = 5
#asr_model.cfg.decoder.prednet.dropout = 0.1
#asr_model.cfg.joint.jointnet.dropout = 0.1

# Decoding settings live outside cfg.optim, so open the decoding node itself.
with open_dict(asr_model.cfg.decoding):
  asr_model.cfg.decoding.strategy = "beam"
  asr_model.cfg.decoding.beam.beam_size = 5

# Editing the config alone does not rebuild the decoder of an already-loaded model;
# change_decoding_strategy applies the beam-search settings to the live model.
# Beam search makes validation and transcription slower than greedy decoding.
asr_model.change_decoding_strategy(asr_model.cfg.decoding)

Setup a pytorch lightning trainer

In [None]:
import pytorch_lightning as pl
import torch

# Train on the GPU when one is visible, otherwise fall back to the CPU.
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

epochs = 20

# Checkpointing and logging are switched off on the Trainer itself.
trainer = pl.Trainer(
    accelerator=accelerator,
    devices=1,
    max_epochs=epochs,
    precision=32,
    accumulate_grad_batches=1,
    check_val_every_n_epoch=1,
    num_sanity_val_steps=0,
    log_every_n_steps=10,
    enable_checkpointing=False,
    logger=False,
)

# Attach the trainer to the model.
asr_model.set_trainer(trainer)

# update internal config
asr_model.cfg = asr_model._cfg

Optional to check that the model configuration was updated properly

In [None]:
# asr_model.cfg

Set up a TensorBoard logger to review learning curves, etc., after training is complete

In [None]:
# Experiment manager settings: TensorBoard logging plus a checkpoint callback.
exp_name = "ASR-conformer-transd"
config = exp_manager.ExpManagerConfig(
    exp_dir='./nemo_experiments',
    name=exp_name,
    create_tensorboard_logger=True,
    create_checkpoint_callback=True,
    # The checkpoint callback monitors "val_wer" in "min" mode.
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        save_top_k=1,
        always_save_nemo=True,
        save_best_model=True,
    ),
)


# Convert the dataclass config to an OmegaConf object before handing it to exp_manager.
config = OmegaConf.structured(config)

# Attach the experiment manager to the trainer; the returned value is kept as `logdir`.
logdir = exp_manager.exp_manager(trainer, config)

start training

In [None]:
# Start training!!!
# Runs for at most `epochs` epochs (max_epochs on the trainer), validating after every epoch.
trainer.fit(asr_model)

Review training results in tensorboard

In [None]:
# Detect whether the notebook is running inside Google Colab.
try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
# NOTE: the log directory is a hard-coded absolute path; it matches the relative
# exp_dir './nemo_experiments' only when the working directory is /content.
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir /content/nemo_experiments/ASR-conformer-transd/ --port=6013
else:
  print("To use tensorboard, please use this notebook in a Google Colab environment.")

Optional - check the processes currently running on a given port. Kill a process using `!kill PID`.

In [None]:
# List processes using port 6013 (the port passed to %tensorboard above).
!lsof -i:6013

# Uncomment and replace the PID with one reported by lsof to stop that process.
# !kill 39564

# List again to confirm the port state after an optional kill.
!lsof -i:6013

function to evaluate word error rate on model

In [None]:
from nemo.collections.asr.metrics.wer import word_error_rate

def predict_sentences(asr_model, test_set=None, n_samples=600):
  """Transcribe the first rows of ``test_set`` with ``asr_model`` and return the WER.

  Args:
    asr_model: Model object exposing ``transcribe(paths)``.
    test_set: Indexable dataset whose rows provide ``row['text']`` and
      ``row['audio']['path']``. Defaults to the notebook's ``val_dataset``,
      looked up at call time so that a re-created split is picked up.
    n_samples: Maximum number of rows to evaluate (default 600). Clamped to the
      size of ``test_set`` so smaller sets do not raise IndexError.

  Returns:
    The value returned by ``word_error_rate(hypotheses, references)``.
  """
  if test_set is None:
    test_set = val_dataset

  n_eval = min(n_samples, len(test_set))

  raw_transcripts = []
  paths = []
  for i in range(n_eval):
    # Index each row once and read both fields from it.
    example = test_set[i]
    raw_transcripts.append(example['text'])
    paths.append(example['audio']['path'])

  pred_transcripts = asr_model.transcribe(paths)
  # transcribe() may return a tuple; the first element holds the transcripts used here.
  if isinstance(pred_transcripts, tuple):
    pred_transcripts = pred_transcripts[0]
  wer = word_error_rate(pred_transcripts, raw_transcripts)

  return wer

predict_sentences(asr_model, val_dataset)

**Below is to predict WER for pre-trained only model**

In [None]:
from nemo.collections.asr.metrics.wer import word_error_rate

# NOTE: this definition replaces any earlier `predict_sentences` in the kernel; this
# variant takes a pre-trained model *name* instead of a model object.
def predict_sentences(model_name, test_set=None, n_samples=600):
  """Load the pre-trained model ``model_name`` and return its WER on ``test_set``.

  Args:
    model_name: Name passed to ``nemo_asr.models.ASRModel.from_pretrained``.
    test_set: Indexable dataset whose rows provide ``row['text']`` and
      ``row['audio']['path']``. Defaults to the notebook's ``val_dataset``,
      looked up at call time so that a re-created split is picked up.
    n_samples: Maximum number of rows to evaluate (default 600). Clamped to the
      size of ``test_set`` so smaller sets do not raise IndexError.

  Returns:
    The value returned by ``word_error_rate(hypotheses, references)``.
  """
  if test_set is None:
    test_set = val_dataset

  asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)

  n_eval = min(n_samples, len(test_set))

  raw_transcripts = []
  paths = []
  for i in range(n_eval):
    # Index each row once and read both fields from it.
    example = test_set[i]
    raw_transcripts.append(example['text'])
    paths.append(example['audio']['path'])

  pred_transcripts = asr_model.transcribe(paths)
  # transcribe() may return a tuple; the first element holds the transcripts used here.
  if isinstance(pred_transcripts, tuple):
    pred_transcripts = pred_transcripts[0]
  wer = word_error_rate(pred_transcripts, raw_transcripts)

  return wer

predict_sentences('stt_en_conformer_transducer_xxlarge', val_dataset)

# **END!!!!**