In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt

# Load the audio file at its native sampling rate (sr=None disables resampling)
audio_path = "./vocals_only.wav"
y, sr = librosa.load(audio_path, sr=None)

# Parameters for voice activity detection
frame_length = 2048  # samples per analysis frame
hop_length = 512  # samples between the starts of consecutive frames
energy_threshold = 0.01  # Adjust this threshold based on your audio

# Calculate short-time energy: the sum of squared samples in each frame.
# Frames near the end of the signal are shorter than frame_length because the
# slice is truncated at len(y). np.sum reduces each frame in native code
# (the builtin sum() would walk the samples one by one in Python), and the
# float64 accumulator avoids precision loss on long frames.
energy = np.array(
    [
        np.sum(y[i : i + frame_length] ** 2, dtype=np.float64)
        for i in range(0, len(y), hop_length)
    ],
    dtype=np.float64,
)

# Normalize energy so the loudest frame is 1.0. Skip the division when there
# are no frames (empty signal) or the signal is completely silent; otherwise
# np.max would raise on the empty array or 0/0 would fill energy with NaN.
peak_energy = energy.max() if energy.size else 0.0
if peak_energy > 0:
    energy = energy / peak_energy

# Detect voice activity: a frame is "active" when its normalized energy
# exceeds the threshold
voice_activity = energy > energy_threshold

# Find start and end times of voice activity by locating the frames where the
# activity mask flips. Padding the mask with False on both sides guarantees
# that every active run has both a rising and a falling edge, even when the
# audio begins or ends with voice activity.
activity_mask = np.asarray(voice_activity, dtype=bool)
frame_count = len(activity_mask)
padded_mask = np.concatenate(([False], activity_mask, [False]))

# Edge k marks a change between frame k-1 and frame k; edges alternate
# rising (segment onset) and falling (first inactive frame after a segment).
edges = np.flatnonzero(padded_mask[1:] != padded_mask[:-1]).tolist()
onsets = edges[0::2]
offsets = edges[1::2]

timestamps = [
    {
        "start": onset * hop_length / sr,
        # A falling edge past the last frame means the audio ended while
        # still active, so the segment is closed at the end of the signal.
        "end": len(y) / sr if offset == frame_count else offset * hop_length / sr,
    }
    for onset, offset in zip(onsets, offsets)
]

# Print the timestamps
print(timestamps)

# Plot the energy and detected voice activity using an explicit Axes object
fig, ax = plt.subplots(figsize=(14, 5))

# Normalized energy curve, indexed by frame number
ax.plot(energy, label="Energy")

# Horizontal line marking the activity threshold
ax.axhline(y=energy_threshold, color="r", linestyle="--", label="Threshold")

# Shade active frames over the full height of the axes: with the x-axis
# transform, x is in data coordinates (frames) and y in axes coordinates
# (0 = bottom, 1 = top).
frame_indices = range(len(energy))
ax.fill_between(
    frame_indices,
    0,
    1,
    where=voice_activity,
    color="green",
    alpha=0.3,
    transform=ax.get_xaxis_transform(),
    label="Voice Activity",
)

ax.set_xlabel("Frames")
ax.set_ylabel("Normalized Energy")
ax.set_title("Voice Activity Detection")
ax.legend()
plt.show()

: 