<a href="https://colab.research.google.com/github/insane74/TTS/blob/main/examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Live Colab Example


## Dependencies and Imports

In [1]:
#@title Install dependencies

!pip install -q omegaconf torchaudio pydub

import os
from os.path import exists

if not exists('silero-models'):
  !git clone -q --depth 1 https://github.com/snakers4/silero-models

%cd silero-models

# silero imports
import torch
import random
from glob import glob
from omegaconf import OmegaConf
from src.silero.utils import (init_jit_model,
                       split_into_batches,
                       read_audio,
                       read_batch,
                       prepare_model_input)
from colab_utils import (record_audio,
                         audio_bytes_to_np,
                         upload_audio)

# Device handed to `prepare_model_input` and `init_jit_model` in this section.
device = torch.device('cpu')   # you can use any pytorch device
# Model catalogue read from models.yml in the working directory; the
# Transcribe section looks models up under `models.stt_models.<lang>.latest`.
models = OmegaConf.load('models.yml')

# imports for uploading/recording
import numpy as np
import ipywidgets as widgets
from scipy.io import wavfile
from IPython.display import Audio, display, clear_output
from torchaudio.functional import vad


def wav_to_text(f='test.wav'):
  """Transcribe one audio file with the currently loaded model.

  Uses the notebook-level `model`, `decoder` and `device`, so a model must
  have been loaded before this is called.

  Args:
    f: path of the audio file to transcribe.

  Returns:
    The result of `decoder` applied to the first item of the model output.
  """
  model_input = prepare_model_input(read_batch([f]), device=device)
  first_result = model(model_input)[0]
  return decoder(first_result.cpu())

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
/content/silero-models


## Transcribe

In [None]:
#@markdown { run: "auto" }

language = "English" #@param ["English", "German", "Spanish"]

print(language)
# Translate the form label into its key under `models.stt_models`; any label
# that is not listed here falls back to the English model.
model, decoder = init_jit_model(
    models.stt_models[{'German': 'de', 'Spanish': 'es'}.get(language, 'en')].latest.jit,
    device=device)

In [None]:
#@markdown { run: "auto" }

# When set to "Yes", `_recognize` passes the audio through `_apply_vad`
# before it is written to disk and transcribed.
use_VAD = "No" #@param ["Yes", "No"]

In [None]:
#@markdown Either record audio from microphone or upload audio from file (.mp3 or .wav) { run: "auto" }

# "Record" shows a button wired to `_record_audio`; any other value runs
# `_upload_audio` immediately when the cell executes.
record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]
# Value handed to `record_audio(...)` — presumably the capture length in seconds.
record_seconds =   4#@param {type:"number", min:1, max:10, step:1}
# Sample rate used for VAD, for playback and for the WAV files written below.
sample_rate = 16000

def _apply_vad(audio, boot_time=0, trigger_level=9, **kwargs):
  """Trim non-speech from both ends of `audio` using torchaudio's `vad`.

  `vad` only trims from the front of a signal, so the audio is reversed in
  time to trim its tail, then reversed back to trim its head.

  Args:
    audio: waveform tensor whose last dimension is time.
    boot_time: forwarded to `vad`.
    trigger_level: forwarded to `vad`.
    **kwargs: further keyword arguments forwarded to `vad`. `sample_rate` is
      always taken from the notebook-level `sample_rate`.

  Returns:
    The trimmed waveform in its original time orientation.
  """
  print('\nVAD applied\n')
  # Build the keyword set explicitly instead of scraping locals(), which
  # would silently forward any local variable added to this function.
  vad_kwargs = {'boot_time': boot_time, 'trigger_level': trigger_level, **kwargs}
  vad_kwargs['sample_rate'] = sample_rate
  # Flip along the time axis (last dim) rather than dim 0, so that
  # (channels, time) input is reversed in time and not in channel order.
  # For 1-D audio the two are identical.
  tail_trimmed = vad(torch.flip(audio, [-1]), **vad_kwargs)
  return vad(torch.flip(tail_trimmed, [-1]), **vad_kwargs)

def _recognize(audio):
  """Play `audio`, optionally trim it with VAD, and print its transcription.

  The (possibly trimmed) audio is written to 'test.wav' as int16 samples,
  which is the default file `wav_to_text` reads.

  Args:
    audio: waveform tensor; assumed to hold floats in [-1, 1] — TODO confirm.
  """
  display(Audio(audio, rate=sample_rate, autoplay=True))
  speech = _apply_vad(audio) if use_VAD == "Yes" else audio
  # Scale by 32767 and cast so the file is stored as 16-bit integers.
  pcm16 = (32767*speech).numpy().astype(np.int16)
  wavfile.write('test.wav', sample_rate, pcm16)
  text = wav_to_text()
  print('\n\nTRANSCRIPTION:\n')
  print(text)

def _record_audio(b):
  """Click handler: capture audio, save it to 'recorded.wav' and transcribe it.

  Args:
    b: argument supplied by the button's click event (unused).
  """
  clear_output()
  recording = record_audio(record_seconds)
  # Keep the raw capture on disk as 16-bit integers before recognition.
  as_int16 = (32767*recording).numpy().astype(np.int16)
  wavfile.write('recorded.wav', sample_rate, as_int16)
  _recognize(recording)

def _upload_audio(b):
  """Obtain audio via `upload_audio()`, transcribe it and hand it back.

  Args:
    b: unused; present so the signature matches the record handler.

  Returns:
    The audio object returned by `upload_audio()`.
  """
  clear_output()
  uploaded = upload_audio()
  _recognize(uploaded)
  return uploaded

# Upload mode runs straight away and keeps the audio in `audio`;
# record mode only shows a button and waits for a click.
if record_or_upload != "Record":
  audio = _upload_audio("")
else:
  button = widgets.Button(description="Record Speech")
  button.on_click(_record_audio)
  display(button)

In [None]:
#@markdown Check audio after applying VAD { run: "auto" }

# In record mode, reload the capture that `_record_audio` wrote to
# 'recorded.wav'; in upload mode `audio` is the value the upload returned.
if record_or_upload == "Record":
  audio = read_audio('recorded.wav', sample_rate)
# Play the audio after VAD trimming.
display(Audio(_apply_vad(audio), rate=sample_rate, autoplay=True))

# PyTorch Example


In [3]:
#@title Install Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio omegaconf

import os
from os.path import exists

if not exists('silero-models'):
  !git clone -q --depth 1 https://github.com/snakers4/silero-models

%cd silero-models

import torch
import random
from glob import glob
from omegaconf import OmegaConf
from src.silero.utils import (init_jit_model,
                       split_into_batches,
                       read_batch,
                       prepare_model_input)
from IPython.display import display, Audio

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m112.6/117.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
/content/silero-models


## Minimal example

In [4]:
import torch
import zipfile
import torchaudio
from glob import glob

device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       jit_model='jit_xlarge',
                                       language='en', # also available 'de', 'es'
                                       device=device)
(read_batch, split_into_batches,
 read_audio, prepare_model_input) = utils  # see function signature for details

# download a single file, any format compatible with TorchAudio
torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
                               dst ='speech_orig.wav', progress=True)
test_files = glob('speech_orig.wav')
batches = split_into_batches(test_files, batch_size=10)
# Named `model_input` rather than `input` so this cell does not rebind the
# builtin input() at notebook scope.
model_input = prepare_model_input(read_batch(batches[0]),
                                  device=device)

# Run the model on the first batch and print one decoded line per example.
output = model(model_input)
for example in output:
    print(decoder(example.cpu()))

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master
100%|██████████| 528M/528M [00:30<00:00, 17.9MB/s]
100%|██████████| 0.99M/0.99M [00:00<00:00, 15.9MB/s]


the birch canoe slid on the smooth planks blew the sheet to the dark blue background it's easy to tell the depth of a well four hours of steady work faced us


## More examples

In [5]:
models = OmegaConf.load('models.yml')  # all available models are listed in the yml file
# Print, in order: the keys under stt_models, the keys under `en`, the keys
# of the `latest` English entry, and the value of its `jit` field.
print(list(models.stt_models.keys()),
      list(models.stt_models.en.keys()),
      list(models.stt_models.en.latest.keys()),
      models.stt_models.en.latest.jit)
device = torch.device('cpu')   # you can use any pytorch device
# Initialise the model and its decoder from the latest English `jit` entry.
model, decoder = init_jit_model(models.stt_models.en.latest.jit, device=device)

['en', 'de', 'es', 'ua'] ['latest', 'v6', 'v5', 'v4_0', 'v3', 'v2', 'v1'] ['meta', 'labels', 'jit', 'onnx', 'jit_q', 'jit_xlarge', 'onnx_xlarge'] https://models.silero.ai/models/en/en_v6.jit


100%|██████████| 112M/112M [00:07<00:00, 16.1MB/s]


In [6]:
# Same two statements that end the previous cell; running this cell rebinds
# `device`, `model` and `decoder` to freshly created objects.
device = torch.device('cpu')   # you can use any pytorch device
model, decoder = init_jit_model(models.stt_models.en.latest.jit, device=device)

In [7]:
# Every .wav file in the current working directory is picked up.
test_files = glob('*.wav')  # replace with your data
# Group the file paths with batch_size=10; the cells below pick one group.
batches = split_into_batches(test_files, batch_size=10)

In [8]:
# transcribe a set of files
# One batch is picked at random. The tensor is called `model_input` rather
# than `input` so the builtin input() is not rebound at notebook scope.
model_input = prepare_model_input(read_batch(random.sample(batches, k=1)[0]),
                                  device=device)
output = model(model_input)
for example in output:
    print(decoder(example.cpu()))

the boch canoe slit on the smooth planks blew the sheet to the dark blue background it's easy to tell a depth of a well four hours of steady work faced us


In [9]:
# listen to one file
batch = read_batch(random.sample(batches, k=1)[0])
# Called `model_input` rather than `input` so the builtin input() is not
# rebound at notebook scope.
model_input = prepare_model_input(batch,
                                  device=device)
output = model(model_input)

# Only the first example of the batch is decoded and played back.
for i, example in enumerate(output):
    print(decoder(example.cpu()))
    display(Audio(batch[i], rate=16000))  # audio was resampled to 16kHz
    break

the boch canoe slit on the smooth planks blew the sheet to the dark blue background it's easy to tell a depth of a well four hours of steady work faced us


In [10]:
# align example
batch = read_batch(random.sample(batches, k=1)[0])
# Called `model_input` rather than `input` so the builtin input() is not
# rebound at notebook scope.
model_input = prepare_model_input(batch,
                                  device=device)

# Second dimension of the input divided by the 16 kHz rate used below;
# passed to the decoder for word alignment.
wav_len = model_input.shape[1] / 16000

output = model(model_input)

# Only the first example is aligned and played back; the last element of
# the decoder result is what gets printed.
for i, example in enumerate(output):
    print(decoder(example.cpu(), wav_len, word_align=True)[-1])
    display(Audio(batch[i], rate=16000))  # audio was resampled to 16kHz
    break

[{'word': 'the', 'start_ts': 0.0, 'end_ts': 0.16}, {'word': 'boch', 'start_ts': 0.16, 'end_ts': 0.52}, {'word': 'canoe', 'start_ts': 0.52, 'end_ts': 0.95}, {'word': 'slit', 'start_ts': 0.95, 'end_ts': 1.35}, {'word': 'on', 'start_ts': 1.35, 'end_ts': 1.51}, {'word': 'the', 'start_ts': 1.51, 'end_ts': 1.67}, {'word': 'smooth', 'start_ts': 1.67, 'end_ts': 2.06}, {'word': 'planks', 'start_ts': 2.06, 'end_ts': 2.5}, {'word': 'blew', 'start_ts': 2.66, 'end_ts': 3.06}, {'word': 'the', 'start_ts': 3.06, 'end_ts': 3.26}, {'word': 'sheet', 'start_ts': 3.26, 'end_ts': 3.53}, {'word': 'to', 'start_ts': 3.53, 'end_ts': 3.77}, {'word': 'the', 'start_ts': 3.77, 'end_ts': 3.97}, {'word': 'dark', 'start_ts': 3.97, 'end_ts': 4.33}, {'word': 'blue', 'start_ts': 4.33, 'end_ts': 4.65}, {'word': 'background', 'start_ts': 4.65, 'end_ts': 5.36}, {'word': "it's", 'start_ts': 5.6, 'end_ts': 5.96}, {'word': 'easy', 'start_ts': 5.96, 'end_ts': 6.27}, {'word': 'to', 'start_ts': 6.27, 'end_ts': 6.43}, {'word': 'te

# ONNX Example

In [11]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
# onnx is used in the next cell to load and check the model file,
# onnxruntime to run inference on it.
!pip install -q torchaudio omegaconf onnx onnxruntime

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Minimal example

In [12]:
import onnx
import torch
import onnxruntime
from omegaconf import OmegaConf

language = 'en' # also available 'de', 'es'

# load provided utils
_, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_stt', language=language)
(read_batch, split_into_batches,
 read_audio, prepare_model_input) = utils

# see available models
torch.hub.download_url_to_file('https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml', 'models.yml')
models = OmegaConf.load('models.yml')
available_languages = list(models.stt_models.keys())
assert language in available_languages

# load the actual ONNX model
# Look the model up by `language` instead of a hard-coded `en`, so the ONNX
# weights always match the decoder loaded above when `language` is changed.
torch.hub.download_url_to_file(models.stt_models[language].latest.onnx, 'model.onnx', progress=True)
onnx_model = onnx.load('model.onnx')
onnx.checker.check_model(onnx_model)
ort_session = onnxruntime.InferenceSession('model.onnx')

# download a single file, any format compatible with TorchAudio
torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav', dst ='speech_orig.wav', progress=True)
test_files = ['speech_orig.wav']
batches = split_into_batches(test_files, batch_size=10)
# Called `model_input` rather than `input` so the builtin input() is not
# rebound at notebook scope.
model_input = prepare_model_input(read_batch(batches[0]))

# actual onnx inference and decoding
# The session is fed a NumPy array; its first output is wrapped back into a
# tensor and the first item of that batch is decoded.
onnx_input = model_input.detach().cpu().numpy()
ort_inputs = {'input': onnx_input}
ort_outs = ort_session.run(None, ort_inputs)
decoded = decoder(torch.Tensor(ort_outs[0])[0])
print(decoded)

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master
100%|██████████| 30.4k/30.4k [00:00<00:00, 41.2MB/s]
100%|██████████| 112M/112M [00:07<00:00, 15.8MB/s]
100%|██████████| 0.99M/0.99M [00:00<00:00, 17.0MB/s]


the boch canoeslid on the smooth planks blew the sheet to the dark blue background it's easy to tell a deps of a well four hours of steady work faced us
