## Install Dependencies

In [40]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

100%|██████████| 1.83M/1.83M [00:02<00:00, 643kB/s] 


In [38]:
USE_PIP = True # download model using pip package or torch.hub
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    !pip install -q onnxruntime
if USE_PIP:
  !pip install -q silero-vad
  from silero_vad import (load_silero_vad,
                          read_audio,
                          get_speech_timestamps,
                          save_audio,
                          VADIterator,
                          collect_chunks)
  model = load_silero_vad(onnx=USE_ONNX)
else:
  model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=True,
                                onnx=USE_ONNX)

  (get_speech_timestamps,
  save_audio,
  read_audio,
  VADIterator,
  collect_chunks) = utils

## Speech timestamps from full audio

In [49]:
# NOTE(review): '../output_audio_mono.wav' is not created or downloaded by any cell
# shown here (only 'en_example.wav' is downloaded), so the file must already exist
# one directory above the working directory for this cell to run.
wav = read_audio('../output_audio_mono.wav', sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
# NOTE(review): the printed 'start'/'end' values look like sample offsets at
# SAMPLING_RATE rather than seconds — confirm against the silero-vad docs.
pprint(speech_timestamps)

[{'end': 53728, 'start': 10784},
 {'end': 77280, 'start': 65056},
 {'end': 125408, 'start': 80928},
 {'end': 181216, 'start': 136736},
 {'end': 203232, 'start': 184864},
 {'end': 228320, 'start': 210976},
 {'end': 250336, 'start': 232480},
 {'end': 273376, 'start': 258080},
 {'end': 284640, 'start': 275488},
 {'end': 312800, 'start': 287776},
 {'end': 323552, 'start': 316448},
 {'end': 343520, 'start': 329248}]


In [46]:
# Join every detected speech chunk into a single waveform and write it to disk.
save_audio(
    'only_speech.wav',
    collect_chunks(speech_timestamps, wav),
    sampling_rate=SAMPLING_RATE,
)
# Last expression of the cell, so the notebook displays the Audio object
# for the file that was just saved.
Audio('only_speech.wav')

## Entire audio inference

In [32]:
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# Run the model over the whole recording in a single call.
# NOTE(review): an earlier version of this comment said the audio is split into
# "31.25 ms long pieces". 31.25 is the number of 512-sample windows per second at
# 16 kHz (16000 / 512); one such window lasts 512 / 16000 s = 32 ms. The streaming
# cells in this notebook use 512-sample windows at 16 kHz; presumably audio_forward
# does the same, which would give ceil(input_length / 512) predictions — confirm
# against the silero-vad source.
predicts = model.audio_forward(wav, sr=SAMPLING_RATE)

## Stream imitation example

In [33]:
## using VADIterator class

vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
# NOTE(review): this file is not produced by any cell shown in this notebook;
# it must already exist one directory above the working directory.
wav = read_audio('../output_audio_mono.wav', sampling_rate=SAMPLING_RATE)

# Number of samples fed to the iterator per step.
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256

# Walk the audio in consecutive full windows; the range stops at the last start
# index that still leaves a complete window, so a trailing partial chunk is
# never passed to the iterator.
for i in range(0, len(wav) - window_size_samples + 1, window_size_samples):
    chunk = wav[i: i + window_size_samples]
    speech_dict = vad_iterator(chunk, return_seconds=True)
    # Only non-empty results are printed (e.g. {'start': ...} / {'end': ...}).
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio

{'start': 0.7} {'end': 3.4} {'start': 4.1} {'end': 4.8} {'start': 5.1} {'end': 7.8} {'start': 8.5} {'end': 11.3} {'start': 11.6} {'end': 12.7} {'start': 13.2} {'end': 14.3} {'start': 14.5} {'end': 15.6} {'start': 16.1} {'end': 17.1} {'start': 17.2} {'end': 17.8} {'start': 18.0} {'end': 19.6} {'start': 19.8} {'end': 20.2} {'start': 20.6} {'end': 21.5} 

In [34]:
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# Number of samples passed to the model per call.
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256

# The model keeps state between calls. Reset it on the model itself (this cell
# does not use a VADIterator) so the probabilities below do not depend on
# whatever audio an earlier cell processed.
model.reset_states()

speech_probs = []
# Iterate over consecutive full windows only; the range stops at the last start
# index that still leaves a complete window.
for i in range(0, len(wav) - window_size_samples + 1, window_size_samples):
    chunk = wav[i: i + window_size_samples]
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
model.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts

[0.46507999300956726, 0.738355815410614, 0.8762859106063843, 0.9573898911476135, 0.9656301736831665, 0.9954002499580383, 0.9969189167022705, 0.9968834519386292, 0.9967656135559082, 0.9967684745788574]
