In [1]:
import os
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from utils.utils import create_folder_if_not_exists


def format_timestamps_in_hh_mm_ss(line: dict[str, float]):
    """Render a timestamp segment as ``"HH:MM:SS - HH:MM:SS"``.

    Args:
        line: Mapping with ``"start"`` and ``"end"`` entries, both expressed
            in seconds.

    Returns:
        The start and end instants formatted as zero-padded
        hours:minutes:seconds, joined by ``" - "``. Fractional seconds are
        truncated, not rounded.
    """

    def _as_clock(seconds):
        # Split the second count into hour / minute / second components.
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        remaining = int(seconds % 60)
        return f"{hours:02d}:{minutes:02d}:{remaining:02d}"

    begin_text = _as_clock(line["start"])
    finish_text = _as_clock(line["end"])
    return " - ".join((begin_text, finish_text))


# Input: isolated vocals track to run voice-activity detection on.
vocals_only_path = "./vocals_only/test_1_vocals.wav"

# Output: folder and file that will receive the detected speech timestamps.
timestamps_folder = "./silero_test/"
create_folder_if_not_exists(timestamps_folder)
timestamps_full_path = os.path.join(timestamps_folder, "silero.txt")

# Silero VAD models are documented to support 8000 Hz and 16000 Hz audio only.
# The audio must be resampled to the same rate that get_speech_timestamps is
# told about; otherwise the sample-to-seconds conversion is scaled incorrectly
# (previously the audio was loaded at 41000 Hz while the detector assumed its
# 16000 Hz default).
sampling_rate = 16000

model = load_silero_vad()
wav = read_audio(vocals_only_path, sampling_rate=sampling_rate)
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    sampling_rate=sampling_rate,
    return_seconds=True,  # Return speech timestamps in seconds (default is samples)
    progress_tracking_callback=lambda x: print(f"Progress: {x:.1f}%", end="\r"),
    threshold=0.175,
    # visualize_probs=False,
    # min_silence_duration_ms=750,
    # speech_pad_ms=110,
)


# Define function to merge consecutive timestamps
def merge_contiguous_timestamps(timestamps, max_gap_seconds=2.0):
    """Merge consecutive segments that are separated by a short gap.

    Two neighbouring segments are fused when the next segment's start,
    truncated to whole seconds, is at most ``max_gap_seconds`` after the
    current merged end, also truncated to whole seconds. The truncation is
    deliberate legacy behaviour: gaps are compared at one-second granularity.

    Args:
        timestamps: Sequence of dicts with ``"start"`` and ``"end"`` values in
            seconds, ordered by start time.
        max_gap_seconds: Largest gap (in seconds) that still counts as
            contiguous. Defaults to ``2.0``.

    Returns:
        A new list of ``{"start": ..., "end": ...}`` dicts. The input is not
        modified. An empty input yields an empty list.
    """
    # Guard against empty input instead of failing on timestamps[0].
    if not timestamps:
        return []

    merged_timestamps = []
    current_start = timestamps[0]["start"]
    current_end = timestamps[0]["end"]

    for timestamp in timestamps[1:]:
        if int(timestamp["start"]) <= int(current_end) + max_gap_seconds:
            # Extend the running segment; max() keeps the end from moving
            # backwards if a segment lies entirely inside the previous one.
            current_end = max(current_end, timestamp["end"])
        else:
            merged_timestamps.append({"start": current_start, "end": current_end})
            current_start = timestamp["start"]
            current_end = timestamp["end"]

    # Flush the last running segment.
    merged_timestamps.append({"start": current_start, "end": current_end})

    return merged_timestamps


# Merge nearby speech segments and keep the result: the merge function returns
# a new list, so its return value must be captured for the merge to have any
# effect on what is written below. Skip the call when nothing was detected.
merged_timestamps = (
    merge_contiguous_timestamps(speech_timestamps) if speech_timestamps else []
)


# Write one "HH:MM:SS - HH:MM:SS" line per merged segment.
with open(timestamps_full_path, "w", encoding="utf-8") as f:
    f.writelines(
        f"{format_timestamps_in_hh_mm_ss(line)}\n" for line in merged_timestamps
    )

Progress: 100.0%