In [None]:
# Traditional energy threshold approach

import librosa
import numpy as np
import matplotlib.pyplot as plt

# Recording analysed by this cell
audio_file_path = "E:\\5305 assignment\\audio\\AlGore_2009.wav"

# Analysis window (25 ms) and hop (10 ms), expressed in samples at 22050 Hz
frame_length = int(22050 * 0.025)
hop_length = int(22050 * 0.010)

# Preprocess the audio file
def preprocess_audio(audio_path, target_sr=22050):
    """Load an audio file and slice it into overlapping analysis frames.

    The module-level ``frame_length`` and ``hop_length`` are sample counts
    computed for 22050 Hz audio, so the file is resampled to ``target_sr``
    while loading. Loading at the file's native rate would make the frames
    longer or shorter than the intended 25 ms / 10 ms whenever the recording
    is not stored at 22050 Hz.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.
    target_sr : int or None, optional
        Sample rate to load the audio at. Must match the rate that
        ``frame_length`` / ``hop_length`` were computed for (22050 Hz).
        ``None`` keeps the file's native rate.

    Returns
    -------
    numpy.ndarray
        Framed audio, transposed so that each row is one frame.
    """
    # Resample on load so sample-count constants and audio share one rate.
    audio_data, _ = librosa.load(audio_path, sr=target_sr)
    # librosa.util.frame puts frames along the last axis; transpose so that
    # iterating over the result yields one frame at a time.
    frames = librosa.util.frame(audio_data, frame_length=frame_length, hop_length=hop_length).T
    return frames

# Calculate the energy of an audio frame
def frame_energy(frame):
    """Return the short-term energy (mean squared amplitude) of one frame.

    Parameters
    ----------
    frame : array-like
        Samples of a single frame.

    Returns
    -------
    float
        Sum of the squared samples divided by ``len(frame)``; ``0.0`` when
        the frame is empty.
    """
    samples = np.asarray(frame)
    if samples.size == 0:
        # An empty frame carries no energy; avoids a 0/0 -> nan result.
        return 0.0
    if samples.dtype.kind in "iub":
        # Integer PCM (e.g. int16) wraps around when squared in its own
        # dtype, yielding negative "energy"; square in float64 instead.
        samples = samples.astype(np.float64)
    return np.sum(samples ** 2) / len(samples)

# Voice Activity Detection decision based on energy threshold
def is_speech(frame, threshold):
    """Classify a frame as speech when its energy is strictly above ``threshold``."""
    energy = frame_energy(frame)
    return energy > threshold

# Calculate the energy threshold using the statistics of frame energies
def calculate_energy_threshold(frames, factor=0.01):
    """Derive a speech/non-speech energy threshold from frame statistics.

    The threshold is the mean frame energy plus ``factor`` times the
    standard deviation of the frame energies.

    Parameters
    ----------
    frames : iterable
        Frames to measure; each one is passed to ``frame_energy``.
    factor : float, optional
        Weight applied to the standard deviation (default ``0.01``).

    Returns
    -------
    float
        The energy threshold.
    """
    per_frame = np.asarray([frame_energy(single) for single in frames])
    return per_frame.mean() + factor * per_frame.std()

# Split the recording into frames
frames = preprocess_audio(audio_file_path)
print(f"Processed {audio_file_path}")

# Threshold derived from the mean/std of the frame energies, then one
# speech/non-speech decision per frame
energy_threshold = calculate_energy_threshold(frames)
speech_flags = [is_speech(frame, energy_threshold) for frame in frames]
num_total_frames = len(frames)
num_speech_frames = sum(speech_flags)

print(f"VAD applied on {audio_file_path}")
print(f"Detected {num_speech_frames} speech frames out of {num_total_frames}")

# Energy curve against time, with the decision threshold overlaid
energies = list(map(frame_energy, frames))
# Frame index -> seconds (hop length in samples over the assumed 22050 Hz rate)
time = np.arange(len(energies)) * (hop_length / 22050)

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(time, energies, label='Frame Energy')
ax.axhline(y=energy_threshold, color='r', linestyle='--', label='Energy Threshold')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Energy')
ax.set_title('Frame Energy over Time')
ax.legend()
plt.show()

In [None]:
# Adaptive threshold approach

# Recording analysed by this cell
audio_file_path = "E:\\5305 assignment\\audio\\AlGore_2009.wav"

# 25 ms window and 10 ms hop, converted to sample counts at 22050 Hz
frame_length, hop_length = (int(seconds * 22050) for seconds in (0.025, 0.010))

def preprocess_audio(audio_path, target_sr=22050):
    """Load an audio file and slice it into overlapping analysis frames.

    The module-level ``frame_length`` and ``hop_length`` are sample counts
    computed for 22050 Hz audio, so the file is resampled to ``target_sr``
    while loading. Loading at the file's native rate would make the frames
    longer or shorter than the intended 25 ms / 10 ms whenever the recording
    is not stored at 22050 Hz.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.
    target_sr : int or None, optional
        Sample rate to load the audio at. Must match the rate that
        ``frame_length`` / ``hop_length`` were computed for (22050 Hz).
        ``None`` keeps the file's native rate.

    Returns
    -------
    frames : numpy.ndarray
        Framed audio, transposed so that each row is one frame.
    sample_rate : int
        Sample rate reported by ``librosa.load`` for the returned audio.
    """
    # Resample on load so sample-count constants and audio share one rate.
    audio_data, sample_rate = librosa.load(audio_path, sr=target_sr)
    # librosa.util.frame puts frames along the last axis; transpose so that
    # iterating over the result yields one frame at a time.
    frames = librosa.util.frame(audio_data, frame_length=frame_length, hop_length=hop_length).T
    return frames, sample_rate

# Calculate the energy of an audio frame
def frame_energy(frame):
    """Return the short-term energy (mean squared amplitude) of one frame.

    Parameters
    ----------
    frame : array-like
        Samples of a single frame.

    Returns
    -------
    float
        Sum of the squared samples divided by ``len(frame)``; ``0.0`` when
        the frame is empty.
    """
    samples = np.asarray(frame)
    if samples.size == 0:
        # No samples means no energy; the plain formula would give 0/0 = nan.
        return 0.0
    if samples.dtype.kind in "iub":
        # Squaring integer samples (e.g. int16 PCM) overflows in their own
        # dtype and can produce negative values; promote to float64 first.
        samples = samples.astype(np.float64)
    return np.sum(samples ** 2) / len(samples)

# Calculate the energy threshold using the statistics of frame energies
def calculate_energy_threshold(frames, noise_percentile=1, speech_percentile=96):
    """Derive an energy threshold from low and high energy percentiles.

    The noise level is taken as the ``noise_percentile``-th percentile of the
    frame energies and the speech level as the ``speech_percentile``-th
    percentile. The threshold is their geometric mean, i.e. the square root
    of their product. Note that a noise level of exactly 0 therefore yields
    a threshold of 0.

    Parameters
    ----------
    frames : iterable
        Frames to measure; each one is passed to ``frame_energy``.
    noise_percentile : float, optional
        Percentile used as the noise energy estimate (default ``1``).
    speech_percentile : float, optional
        Percentile used as the speech energy estimate (default ``96``).

    Returns
    -------
    threshold : float
        Geometric mean of the two percentile energies.
    frame_energies : numpy.ndarray
        Energy of every frame, in input order.
    """
    frame_energies = np.array(list(map(frame_energy, frames)))

    # Low percentile ~ background noise, high percentile ~ speech level
    low_level = np.percentile(frame_energies, noise_percentile)
    high_level = np.percentile(frame_energies, speech_percentile)

    return np.sqrt(low_level * high_level), frame_energies

# Determine if a frame contains speech
def is_speech(frame, threshold):
    """Return whether the frame's energy is strictly greater than ``threshold``."""
    measured = frame_energy(frame)
    return measured > threshold

# Frame the recording
frames, sample_rate = preprocess_audio(audio_file_path)
print(f"Processed {audio_file_path}")

# Percentile-based threshold, then one speech/non-speech decision per frame
energy_threshold, frame_energies = calculate_energy_threshold(frames)
speech_flags = [is_speech(frame, energy_threshold) for frame in frames]
num_speech_frames = sum(speech_flags)
num_total_frames = len(frames)

print(f"VAD applied on {audio_file_path}")
print(f"Detected {num_speech_frames} speech frames out of {num_total_frames}, which is {num_speech_frames/num_total_frames:.2%} of the total frames.")

# Plot the energy threshold graph
plt.figure(figsize=(14, 5))
plt.plot(frame_energies, label='Frame energy')
plt.axhline(energy_threshold, color='red', linestyle='--', label='Energy threshold')
plt.xlabel('Frame index')
plt.ylabel('Energy')
plt.legend()
plt.title('Frame Energies with Energy Threshold')
plt.show()

# Plot the speech activity graph
plt.figure(figsize=(14, 5))
# Cast the boolean flags to 0/1 so the y-axis matches the legend text.
plt.plot(np.asarray(speech_flags, dtype=int), label='Speech activity (1=Speech, 0=Non-speech)')
plt.xlabel('Frame index')
plt.ylabel('Speech Flag')
# The label passed to plot() is only rendered once a legend is drawn.
plt.legend()
plt.title('Speech Activity Detection')
plt.show()