## Import Dependencies and Load the Model

In [68]:
import torch
from silero_vad import load_silero_vad, read_audio, VADIterator
from tabulate import tabulate

# Configuration
SAMPLING_RATE = 16000  # samples per second; passed to every audio/VAD call below
USE_ONNX = False       # forwarded to torch.hub.load as `onnx`
# Forcing a reload re-downloads the repository on every run, which makes
# Restart & Run All slow and impossible offline. Set to True only when the
# cached copy under the torch hub cache needs refreshing.
FORCE_RELOAD = False

torch.set_num_threads(1)

# Model setup: fetch (or reuse the cached) Silero VAD model and its helpers.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=FORCE_RELOAD,
    onnx=USE_ONNX
)

# Unpack utilities from tuple.
# NOTE: this rebinds `read_audio` and `VADIterator`, which are also imported
# from `silero_vad` at the top of this cell; the hub versions win from here on.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

print("Silero VAD model loaded successfully!")


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /Users/surya/.cache/torch/hub/master.zip


Silero VAD model loaded successfully!


In [69]:
def samples_to_seconds(samples, sampling_rate=None):
    """Convert a sample count to seconds, formatted to millisecond precision.

    Args:
        samples: Number of audio samples (an offset or a length).
        sampling_rate: Samples per second. When omitted, the notebook-wide
            ``SAMPLING_RATE`` constant is used, so existing one-argument
            calls behave exactly as before.

    Returns:
        str: The duration in seconds with three decimals, e.g. ``"3.426"``.
    """
    if sampling_rate is None:
        # Resolved at call time so the function follows later changes to the
        # config constant instead of freezing its value at definition time.
        sampling_rate = SAMPLING_RATE
    seconds = samples / sampling_rate
    return f"{seconds:.3f}"

## Speech timestamps from full audio

In [70]:
# Load the recording at the configured sampling rate.
wav = read_audio(
    'output_mono.wav',
    sampling_rate=SAMPLING_RATE,
)

# Ask the VAD model for the speech timestamps of the whole file; each entry
# is indexed by 'start' and 'end' further down.
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    sampling_rate=SAMPLING_RATE,
)
# Debug aid (needs `from pprint import pprint`): pprint(speech_timestamps)

In [71]:
# Build one table row per detected speech segment.
table_data = []
for timestamp in speech_timestamps:
    start_samples = timestamp['start']
    end_samples = timestamp['end']
    table_data.append([
        samples_to_seconds(start_samples),
        samples_to_seconds(end_samples),
        # Derive the duration from the raw sample counts. Subtracting the two
        # already-rounded strings could be off by 0.001 s from the true value.
        samples_to_seconds(end_samples - start_samples),
    ])

# Print table
print(tabulate(table_data, headers=["Start Time (s)", "End Time (s)", "Duration (s)"], tablefmt="fancy_grid"))

╒══════════════════╤════════════════╤════════════════╕
│   Start Time (s) │   End Time (s) │   Duration (s) │
╞══════════════════╪════════════════╪════════════════╡
│            3.426 │          6.078 │          2.652 │
├──────────────────┼────────────────┼────────────────┤
│            6.21  │         21.118 │         14.908 │
├──────────────────┼────────────────┼────────────────┤
│           21.25  │         24.83  │          3.58  │
├──────────────────┼────────────────┼────────────────┤
│           25.026 │         40.286 │         15.26  │
├──────────────────┼────────────────┼────────────────┤
│           40.642 │         44.478 │          3.836 │
├──────────────────┼────────────────┼────────────────┤
│           45.282 │         47.166 │          1.884 │
├──────────────────┼────────────────┼────────────────┤
│           47.618 │         56.382 │          8.764 │
├──────────────────┼────────────────┼────────────────┤
│           57.794 │         60.35  │          2.556 │
├─────────

### Optional

## Audio only speech

In [72]:
# `Audio` is not imported anywhere in the visible notebook, so this cell would
# raise NameError on a fresh kernel; bring it into scope explicitly.
from IPython.display import Audio

# merge all speech chunks to one audio
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)

# Last expression of the cell: renders an inline player for the saved file.
Audio('only_speech.wav')

## Entire audio inference

In [73]:
# NOTE: this rebinds `wav`, which earlier held the audio read from
# 'output_mono.wav'.
wav = read_audio(
    'en_example.wav',
    sampling_rate=SAMPLING_RATE,
)

# Run the model over the complete file in one call.
# Original note: the audio is split into 31.25 ms long pieces, so the output
# length equals ceil(input_length * 31.25 / SAMPLING_RATE).
# NOTE(review): that chunk length and formula cannot be confirmed from this
# notebook; verify against the silero-vad documentation.
predicts = model.audio_forward(
    wav,
    sr=SAMPLING_RATE,
)

## Stream imitation example

In [74]:
## using VADIterator class

# vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
# wav = read_audio(f'../output_audio_mono.wav', sampling_rate=SAMPLING_RATE)

# window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
# for i in range(0, len(wav), window_size_samples):
#     chunk = wav[i: i+ window_size_samples]
#     if len(chunk) < window_size_samples:
#       break
#     speech_dict = vad_iterator(chunk, return_seconds=True)
#     if speech_dict:
#         print(speech_dict, end=' ')
# vad_iterator.reset_states() # reset model states after each audio

In [75]:
# ## just probabilities

# wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# speech_probs = []
# window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
# for i in range(0, len(wav), window_size_samples):
#     chunk = wav[i: i+ window_size_samples]
#     if len(chunk) < window_size_samples:
#       break
#     speech_prob = model(chunk, SAMPLING_RATE).item()
#     speech_probs.append(speech_prob)
# model.reset_states() # reset model states after each audio

# print(speech_probs[:10]) # first 10 chunks predicts