In [1]:
import librosa
import numpy as np
import IPython.display as ipd
import glob
from pydub import AudioSegment



In [2]:
PATH_TO_AUDIO = '/raid/kaisar_dauletbek/datasets/Voice_Conversion/voice_male/'
audios_list = glob.glob(PATH_TO_AUDIO + '*.wav')
long_audios = [audio for audio in audios_list if librosa.get_duration(path=audio) > 30]

In [3]:
def get_pause_times(audio_file):
    y, sr = librosa.load(audio_file)

    # Calculate the short-time energy
    frame_length = int(0.02 * sr)  # 20 ms frames
    hop_length = int(0.01 * sr)  # 10 ms hop
    energy = np.array([np.sum(y[i:i + frame_length] ** 2) for i in range(0, len(y), hop_length)])

    # Set a threshold for pause detection
    threshold = np.percentile(energy, 0.1)

    # Detect pauses
    pauses = np.where(energy < threshold)[0]

    # Convert frame indices to time
    pause_times = librosa.frames_to_time(pauses, sr=sr, hop_length=hop_length)

    return y, sr, pause_times


In [4]:
y, sr, pause_times = get_pause_times(long_audios[0])

In [6]:
pause_times

array([0.00000000e+00, 9.97732426e-03, 1.99546485e-02, 2.99319728e-02,
       3.99092971e-02, 4.98866213e-02, 5.98639456e-02, 6.98412698e-02,
       7.98185941e-02, 8.97959184e-02, 9.97732426e-02, 6.47528345e+00,
       6.48526077e+00, 6.50521542e+00, 6.56507937e+00, 6.81451247e+00,
       1.74433560e+02, 2.16328345e+02, 2.16338322e+02, 2.16348299e+02,
       2.37101134e+02, 2.54651247e+02, 2.60597732e+02, 2.63710658e+02,
       2.78307483e+02, 4.12721995e+02, 4.12731973e+02, 4.13949206e+02,
       4.13959184e+02, 4.19935601e+02, 4.31609070e+02, 4.34282993e+02,
       4.34292971e+02, 5.58191383e+02, 6.93394104e+02, 6.93434014e+02,
       6.93503855e+02, 6.99649887e+02, 6.99659864e+02, 6.99679819e+02,
       6.99689796e+02, 7.00098866e+02, 7.00108844e+02, 7.00118821e+02,
       7.00138776e+02, 7.00148753e+02, 7.03241723e+02, 7.03251701e+02,
       7.03261678e+02, 7.08938776e+02, 7.08978685e+02, 7.08988662e+02,
       7.23405896e+02, 7.29701587e+02, 7.41764172e+02, 7.47341497e+02,
      

In [5]:
# Convert the audio data to int16 format
y_int16 = (y * 32768).astype('int16')

# Create an AudioSegment object
audio_segment = AudioSegment(y_int16.tobytes(), frame_rate=sr, sample_width=y_int16.dtype.itemsize, channels=1)

# Split the audio based on pause times
split_audio = []
start_time = 0

for pause_time in pause_times:
    end_time = int(pause_time * 1000)  # Convert to milliseconds
    split_audio.append(audio_segment[start_time:end_time])
    start_time = end_time

# Add the last segment
split_audio.append(audio_segment[start_time:])

# Save the split audio segments
for i, segment in enumerate(split_audio):
    segment.export(f'split_audio_{i}.wav', format='wav')

In [7]:
librosa.get_duration(path=long_audios[0])

1355.8595918367346

In [120]:
ipd.Audio(long_audios[0])

In [97]:
pause_times = {}
for audio_file in long_audios:
    pause_times[audio_file] = get_pause_times(audio_file)

In [98]:
pause_times

{'/raid/kaisar_dauletbek/datasets/PRAAT/m_009_2/m_009_2_416.wav': array([0.00000000e+00, 9.97732426e-03, 1.99546485e-02, 2.99319728e-02,
        3.99092971e-02, 4.98866213e-02, 5.98639456e-02, 6.98412698e-02,
        7.98185941e-02, 8.97959184e-02, 9.97732426e-02, 2.49433107e+00,
        2.50430839e+00, 2.51428571e+00, 2.52426304e+00, 2.53424036e+00,
        2.54421769e+00, 2.55419501e+00, 2.57414966e+00, 2.58412698e+00,
        2.59410431e+00, 2.60408163e+00, 2.61405896e+00, 2.62403628e+00,
        2.65396825e+00, 2.66394558e+00, 2.67392290e+00, 2.68390023e+00,
        2.69387755e+00, 2.70385488e+00, 2.71383220e+00, 2.72380952e+00,
        2.73378685e+00, 2.74376417e+00, 2.75374150e+00, 2.94331066e+00,
        2.95328798e+00, 2.96326531e+00, 2.97324263e+00, 5.07845805e+00,
        5.11836735e+00, 5.18820862e+00, 5.19818594e+00, 5.21814059e+00,
        5.23809524e+00, 5.24807256e+00, 5.25804989e+00, 5.26802721e+00,
        5.27800454e+00, 5.28798186e+00, 5.29795918e+00, 5.30793651e+00,

In [4]:
import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
# torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_timestamps,
 _, read_audio,
 *_) = utils

sampling_rate = 16000 # also accepts 8000
wav = read_audio('en_example.wav', sampling_rate=sampling_rate)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
pprint(speech_timestamps)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/kaisar_dauletbek/.cache/torch/hub/master.zip


RuntimeError: Failed to load audio from en_example.wav