<a href="https://colab.research.google.com/github/hgse-schneider/mmla-gse-colab-notebooks/blob/main/Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# list the contents of the current working directory
! ls

sample_data


# Sources


*   GitHub repo: https://github.com/snakers4/silero-vad
*   Examples: https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#examples

## Install Speech Detector

In [None]:
# this assumes that you have a proper version of PyTorch already installed
# %pip (instead of `! pip`) installs into the environment of the running kernel
%pip install -q torchaudio soundfile

In [None]:
# setup
import torch

# limit torch to a single thread
torch.set_num_threads(1)

# load the Silero VAD model together with its helper functions from torch.hub
# (force_reload=True asks the hub for a fresh download of the repository)
model, utils = torch.hub.load(
  repo_or_dir='snakers4/silero-vad',
  model='silero_vad',
  force_reload=True,
  onnx=False,
)

# unpack the helpers by position
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip


# Process data

In [None]:
# parameters
# sample rate handed to read_audio, get_speech_timestamps and save_audio in the cells below
SAMPLING_RATE = 16_000

In [None]:
def process_audio_file(filename):
  """Return the speech segments detected in a single audio file.

  The file is read with read_audio at SAMPLING_RATE and passed, together
  with the notebook-level `model`, to get_speech_timestamps. The timestamps
  are requested in seconds (return_seconds=True).

  Args:
    filename: path of the audio file to analyse.

  Returns:
    Whatever get_speech_timestamps returns for this file.
  """
  audio = read_audio(filename, sampling_rate=SAMPLING_RATE)
  return get_speech_timestamps(
    audio,
    model,
    sampling_rate=SAMPLING_RATE,
    return_seconds=True,
  )

In [None]:
# fetch the sample recording and store it locally as en_example.wav
torch.hub.download_url_to_file(
  url='https://models.silero.ai/vad_models/en.wav',
  dst='en_example.wav',
)

  0%|          | 0.00/1.83M [00:00<?, ?B/s]

In [None]:
# run the detector on the sample file downloaded above and print the result;
# `filename` and `speech_timestamps` become notebook-level variables
filename = 'en_example.wav'
speech_timestamps = process_audio_file(filename)
print(speech_timestamps)

[{'start': 0, 'end': 32736}, {'start': 43008, 'end': 74208}, {'start': 79872, 'end': 109536}, {'start': 148992, 'end': 212448}, {'start': 216576, 'end': 253920}, {'start': 259584, 'end': 286176}, {'start': 293376, 'end': 312288}, {'start': 325632, 'end': 602592}, {'start': 606720, 'end': 622560}, {'start': 638976, 'end': 693216}, {'start': 697344, 'end': 714720}, {'start': 720384, 'end': 750048}, {'start': 758784, 'end': 763872}, {'start': 781824, 'end': 799200}, {'start': 817152, 'end': 866784}, {'start': 872448, 'end': 954336}, {'start': 958464, 'end': 960000}]


# Check Results

### Before

In [None]:
# show an inline audio player for the original, unprocessed recording
from IPython.display import Audio 
Audio('en_example.wav')

### After

In [None]:
# read the audio file
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)

# get speech timestamps from full audio file
# NOTE: return_seconds is not passed here (unlike in process_audio_file), and
# this rebinds the notebook-level `speech_timestamps` set by the earlier cell
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)

# merge all speech chunks to one audio and write it to only_speech.wav
# (presumably collect_chunks indexes `wav` by sample position, which would be
# why the timestamps above are not requested in seconds -- TODO confirm)
save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)

# show an inline audio player for the speech-only file
Audio('only_speech.wav')

# Batch Processing

## From Google Drive

In [None]:
# Mount the drive
# (relies on the google.colab package; the mount point used is /content/drive)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# iterate through the audio file of a given folder
import os

def extract_speech(folder, overwrite=False):
  """Write a CSV of detected speech segments next to every .wav file in a folder.

  Each file whose name ends with '.wav' is passed to process_audio_file, and
  the segments it returns are written to a sibling file with the same name
  and a '.csv' extension, one "start,end" line per segment (the values are
  whatever process_audio_file returns, which requests seconds).

  Args:
    folder: directory to scan (not recursive).
    overwrite: when False (default), a .wav file whose .csv already exists
      is skipped; when True, the .csv is regenerated.

  Returns:
    None. Results are written to disk and progress is printed.
  """

  # go through each wave file of the folder
  for file in os.listdir(folder):

    # skip if not wav
    if not file.endswith('.wav'): continue
    wave_file = os.path.join(folder, file)

    # swap only the extension: str.replace would also rewrite a '.wav' that
    # appears elsewhere in the path (e.g. in a directory name)
    output = os.path.splitext(wave_file)[0] + '.csv'

    # skip if the csv file already exists
    if os.path.isfile(output) and not overwrite: continue

    # print which file is being processed
    print('Processing:', wave_file, end='')

    # run the detector on THIS file instead of reusing whatever the
    # notebook-level `speech_timestamps` variable happens to contain
    speech_timestamps = process_audio_file(wave_file)

    # one "start,end" line per detected segment
    start_ends = "".join(
      f"{segment['start']},{segment['end']}\n" for segment in speech_timestamps
    )

    # save the results
    with open(output, "w") as f: f.write(start_ends)
    print(" -> Done!")