# Import the required modules:
[Nvidia - NeMo docs](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/intro.html)  
https://github.com/NVIDIA/NeMo

In [3]:
import torch
import nemo.collections.asr as nemo_asr

from nemo.collections.asr.metrics.wer import WER
from nemo.collections.asr.modules.audio_preprocessing import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.features import WaveformFeaturizer
#from nemo.collections.asr.modules import CTCDecoder

import soundfile as sf

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Load the pre-trained model
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

# Load the audio file
#file_path = "path/to/your/audio.wav"
file_path = r"C:\scratch\nemotest\hello_world.wav"

#Read the audio file
samples, sample_rate = sf.read(file_path)

[NeMo I 2023-05-09 17:55:05 cloud:58] Found existing object C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo.
[NeMo I 2023-05-09 17:55:05 cloud:64] Re-using file from: C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo
[NeMo I 2023-05-09 17:55:05 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-05-09 17:55:06 features:287] PADDING: 16
[NeMo I 2023-05-09 17:55:07 save_restore_connector:247] Model EncDecCTCModel was successfully restored from C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo.


In [6]:
samples.size

188075

In [7]:
sample_rate

16000

In [8]:
samples_length = torch.tensor(samples.shape[0]).unsqueeze(0)
samples_length

tensor([188075])

In [9]:
samples_tensor = torch.from_numpy(samples)
samples_tensor

tensor([ 0.0000e+00, -3.0518e-05, -3.0518e-05,  ...,  0.0000e+00,
         0.0000e+00,  0.0000e+00], dtype=torch.float64)

In [10]:
samples_2d = samples_tensor.unsqueeze(0)
samples_2d

tensor([[ 0.0000e+00, -3.0518e-05, -3.0518e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], dtype=torch.float64)

In [11]:
samples_2d.size

<function Tensor.size>

In [12]:
# Instantiate the preprocessor
preprocessor = AudioToMelSpectrogramPreprocessor(
    sample_rate=sample_rate,
    window_size=0.02,
    window_stride=0.01,
    n_fft=512,
    #num_mels=64,
    preemph=0.97,
    dither=1e-5,
)

[NeMo I 2023-05-09 17:55:14 features:287] PADDING: 16


In [13]:
preprocessor

AudioToMelSpectrogramPreprocessor(
  (featurizer): FilterbankFeatures()
)

In [14]:
# Convert the waveform to log-mel spectrograms
mel_spectrogram = preprocessor(input_signal=samples_2d, length=samples_length)

In [15]:
mel_spectrogram

(tensor([[[-2.4551, -2.4571, -0.2572,  ...,  0.0000,  0.0000,  0.0000],
          [-2.0071, -2.0077, -0.8349,  ...,  0.0000,  0.0000,  0.0000],
          [-1.8188, -1.8189, -1.2041,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-2.1905, -2.3248,  0.9957,  ...,  0.0000,  0.0000,  0.0000],
          [-1.8807, -2.0942,  0.7208,  ...,  0.0000,  0.0000,  0.0000],
          [-1.5767, -1.6665,  0.0897,  ...,  0.0000,  0.0000,  0.0000]]],
        dtype=torch.float64),
 tensor([1176]))

In [16]:
# Prepare the input tensor
#input_tensor = mel_spectrogram[0]squeeze(0).unsqueeze(0).to(device)
input_tensor = mel_spectrogram[0].to(torch.float32).to(device)
input_tensor

tensor([[[-2.4551, -2.4571, -0.2572,  ...,  0.0000,  0.0000,  0.0000],
         [-2.0071, -2.0077, -0.8349,  ...,  0.0000,  0.0000,  0.0000],
         [-1.8188, -1.8189, -1.2041,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-2.1905, -2.3248,  0.9957,  ...,  0.0000,  0.0000,  0.0000],
         [-1.8807, -2.0942,  0.7208,  ...,  0.0000,  0.0000,  0.0000],
         [-1.5767, -1.6665,  0.0897,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0')

In [17]:
mel_spectrogram[1]

tensor([1176])

In [18]:
int(mel_spectrogram[1])

1176

In [19]:
# Perform ASR
log_probs, sequence_length, greedy_token_predictions = model.forward(
    processed_signal=input_tensor, 
    processed_signal_length=mel_spectrogram[1]
)

In [20]:
log_probs

tensor([[[-5.9711e+00, -7.9439e+00, -7.7328e+00,  ..., -9.4916e+00,
          -8.6795e+00, -1.0398e-02],
         [-6.3059e+00, -8.7399e+00, -8.3855e+00,  ..., -1.0523e+01,
          -8.5935e+00, -5.4330e-03],
         [-7.0969e+00, -9.0191e+00, -8.6317e+00,  ..., -1.0379e+01,
          -9.6912e+00, -3.8902e-03],
         ...,
         [-2.3329e+00, -3.2861e+00, -3.4251e+00,  ..., -4.2241e+00,
          -3.4887e+00, -2.7281e+00],
         [-2.3329e+00, -3.2861e+00, -3.4251e+00,  ..., -4.2241e+00,
          -3.4887e+00, -2.7281e+00],
         [-2.3329e+00, -3.2861e+00, -3.4251e+00,  ..., -4.2241e+00,
          -3.4887e+00, -2.7281e+00]]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)

In [21]:
sequence_length

tensor([588])

In [22]:
greedy_token_predictions

tensor([[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
         28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
         28, 28, 28, 28, 28, 28, 28, 28, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0, 28,  0, 28,  0, 28,  0, 28,  0,  0,  0,  0,  0,
          0,  0,  0, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,  5,
         18,  5,  5,  5, 28, 28, 18, 28, 28, 28,  0, 28, 28, 28, 28, 28, 28, 28,
          0, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
         28, 28, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 28,  0, 28, 28,
         28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,  0, 28,  0, 28,
          0, 28,  0,  0,  0, 28, 28, 28,  0,  0,  0, 28, 28, 28, 28, 28, 28, 28,
          0, 28, 28, 28, 28, 28, 28, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,

In [23]:
log_probs_soft_max = torch.nn.functional.softmax(log_probs, dim=-1)
log_probs_soft_max

tensor([[[2.5514e-03, 3.5483e-04, 4.3821e-04,  ..., 7.5487e-05,
          1.7004e-04, 9.8966e-01],
         [1.8256e-03, 1.6006e-04, 2.2814e-04,  ..., 2.6899e-05,
          1.8531e-04, 9.9458e-01],
         [8.2764e-04, 1.2108e-04, 1.7836e-04,  ..., 3.1081e-05,
          6.1823e-05, 9.9612e-01],
         ...,
         [9.7019e-02, 3.7400e-02, 3.2545e-02,  ..., 1.4639e-02,
          3.0540e-02, 6.5344e-02],
         [9.7019e-02, 3.7400e-02, 3.2545e-02,  ..., 1.4639e-02,
          3.0540e-02, 6.5344e-02],
         [9.7019e-02, 3.7400e-02, 3.2545e-02,  ..., 1.4639e-02,
          3.0540e-02, 6.5344e-02]]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [24]:
# Define the CTC decoding object
decoder = CTCDecoding(blank_index=model.decoder.blank_index)

NameError: name 'CTCDecoding' is not defined

In [None]:
output = model.decoder(log_probs=log_probs_soft_max).argmax(dim=-1)

In [None]:
output = model._wer.ctc_decoder_predictions_tensor(log_probs_soft_max)

In [None]:
transcription = ''.join(model._wer.get_tokenizer().ids_to_text(output[0].cpu().detach().numpy().tolist()))
print("Transcription:", transcription)

## Example from nvidia library

In [26]:
# Here is an example of all CTC-based models:
#nemo_asr.models.EncDecCTCModel.list_available_models()
# More ASR Models are available - see: nemo_asr.models.ASRModel.list_available_models()

In [27]:
# Speech Recognition model - Citrinet initially trained on Multilingual LibriSpeech English corpus, and fine-tuned on the open source Aishell-2
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En").cuda()

[NeMo I 2023-05-09 17:56:05 cloud:58] Found existing object C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo.
[NeMo I 2023-05-09 17:56:05 cloud:64] Re-using file from: C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo
[NeMo I 2023-05-09 17:56:05 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-05-09 17:56:05 features:287] PADDING: 16
[NeMo I 2023-05-09 17:56:06 save_restore_connector:247] Model EncDecCTCModel was successfully restored from C:\Users\blaze\.cache\torch\NeMo\NeMo_1.17.0\QuartzNet15x5Base-En\2b066be39e9294d7100fb176ec817722\QuartzNet15x5Base-En.nemo.


In [28]:
# IMPORTANT: The audio must be mono with 16Khz sampling rate
audio_sample = r'C:\scratch\nemotest\hello_world.wav'

In [29]:
transcribed_text = asr_model.transcribe([audio_sample])
#print(transcribed_text)

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
transcribed_text

['this is a recording we are testing a python library called nemo hello world']