# Audio Preprocessing

This notebook pre-processes the human health audio data from the VocalSound dataset, producing clean clips that the HeAR model can embed and that we can then cluster and visualize. The code to download the VocalSound data can be found in the `Old Notebooks/VocalSound.ipynb` notebook; here, we assume that the data has already been downloaded locally.

**Note**: The code for generating spectrograms was inspired by https://github.com/kylemcdonald/AudioNotebooks/blob/master/Generating%20Spectrograms.ipynb. All credit to Kyle McDonald.

In [14]:
# @title Import Statements

import librosa
import numpy as np
import soundfile as sf
import os
import random
from glob import glob
from IPython.display import Audio, display, Markdown
import librosa.display
import matplotlib.pyplot as plt

### Clean up VocalSound data

The VocalSound dataset is messy: many of its clips are not interpretable, either by humans or by the HeAR embedding model. Here we iterate through all clips, trim any silence, and remove those that are too short or too quiet. Additionally, we cap the length of each audio clip at 1 second so that clips can be played one after another in our visualization. Finally, we save the processed audio to `audio/audio_processed`.

In [2]:
def trim_with_librosa(input_filepath, output_filepath, max_duration=1.0, top_db_threshold=40, min_duration=0.2, rms_threshold=-50):
    """Trim, filter, and peak-normalize one audio clip, then write it to disk.

    The clip is loaded as 16 kHz mono. Nothing is written (and a warning is
    printed) when any of the following holds:

    * the raw file is shorter than ``min_duration``;
    * the overall RMS level of the file is below ``rms_threshold``;
    * the active sound left after silence trimming is shorter than
      ``min_duration`` (or empty).

    Otherwise leading/trailing silence is removed, a clip longer than
    ``max_duration`` is cut down to its highest-energy window of that length,
    and the result is scaled so its peak amplitude is 0.95 before saving.

    Args:
        input_filepath: Path of the audio file to read.
        output_filepath: Path the processed clip is written to.
        max_duration: Maximum length of the saved clip, in seconds.
        top_db_threshold: ``top_db`` value passed to ``librosa.effects.trim``.
        min_duration: Minimum length, in seconds, required both of the raw
            file and of the silence-trimmed segment.
        rms_threshold: Minimum overall RMS level (dB, rounded to a whole
            number before comparison) for the clip to be kept.

    Returns:
        None. The outcome is reported via ``print``.
    """
    # Load the audio file, uses mono (1 channel) and a fixed sample rate (16kHz)
    audio, sr = librosa.load(input_filepath, sr=16000, mono=True)

    original_duration = len(audio) / sr

    # Ignores clips that are too short
    if original_duration < min_duration:
        print(f"WARNING: Audio file {input_filepath} too short ({original_duration:.2f}s). Skipping...")
        return

    # Silence Filtering
    mean_squared_amplitude = np.mean(audio**2)

    if mean_squared_amplitude == 0:
        # Entirely silent clip: log10(0) is undefined, so force a value
        # that is guaranteed to fall below the threshold.
        rms_loudness = rms_threshold - 10
    else:
        rms_loudness = round(20 * np.log10(np.sqrt(mean_squared_amplitude)))

    if rms_loudness < rms_threshold:
        print(f" WARNING: Audio file {input_filepath} was too quiet ({rms_loudness}). Skipping...")
        return

    # Silence trimming
    # This removes leading and trailing silence that is quieter than 'top_db_threshold'
    core_segment, _ = librosa.effects.trim(audio, top_db=top_db_threshold)
    core_duration = len(core_segment) / sr

    # The minimum-duration requirement applies to the *active* sound as well:
    # a long file that is mostly silence can trim down to a tiny blip.
    if len(core_segment) == 0 or core_duration < min_duration:
        print(f"WARNING: Audio file {input_filepath} has too little active sound ({core_duration:.2f}s) after trimming. Skipping...")
        return

    # Dynamic Trimming Logic (Highest energy segment if over max_duration)
    if core_duration > max_duration:
        max_samples = int(max_duration * sr)
        hop_length = int(0.05 * sr)  # candidate windows start every 50ms

        # Slide a max_duration window over the core and keep the start with
        # the largest summed energy. Every start at which the window fully
        # fits is considered; ties keep the earliest start.
        max_energy_sum = -1
        best_start_sample_in_core = 0
        last_valid_start = len(core_segment) - max_samples

        for current_start_sample in range(0, last_valid_start + 1, hop_length):
            segment = core_segment[current_start_sample : current_start_sample + max_samples]
            current_energy_sum = np.sum(segment**2)

            if current_energy_sum > max_energy_sum:
                max_energy_sum = current_energy_sum
                best_start_sample_in_core = current_start_sample

        final_trimmed_segment = core_segment[best_start_sample_in_core : best_start_sample_in_core + max_samples]
        result_msg = f"Trimmed Core ({core_duration:.3f}s) -> MAX DURATION ({max_duration}s)."

    else:
        # If short enough, save the entire core segment
        final_trimmed_segment = core_segment
        result_msg = f"Kept natural duration: {core_duration:.3f}s (less than {max_duration}s max)."

    # Save the final audio segment with normalization
    max_abs = np.max(np.abs(final_trimmed_segment))
    if max_abs > 0:
        # Normalize to just below 1.0 to prevent clipping
        final_trimmed_segment = final_trimmed_segment / max_abs * 0.95

    sf.write(output_filepath, final_trimmed_segment, sr)
    print(result_msg)

In [4]:
# --- Configuration for the trimming / filtering pass ---
SOURCE_DIR = "vs_release_16k/audio_16k"   # raw VocalSound clips
OUTPUT_DIR = "audio/audio_processed"      # destination for processed clips
MAX_DURATION = 1.0                        # seconds; longer clips are cut down

# Passed to librosa.effects.trim as top_db: audio more than this many dB below
# the clip's peak counts as silence, so a LOWER value trims more aggressively.
TOP_DB_THRESHOLD = 40
# Minimum duration in seconds; forwarded to trim_with_librosa as min_duration.
MIN_ACTIVE_DURATION = 0.2
# Clips whose overall RMS level (dB) is below this are discarded as silent.
SILENCE_RMS_THRESHOLD_DB = -50

print("--- Starting Librosa-based Dynamic Audio Trimming and Filtering Process ---")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the settings so the cell output records how the data was produced.
settings_banner = (
    f"Source Directory: {SOURCE_DIR}",
    f"Output Directory: {OUTPUT_DIR}",
    f"Maximum Duration: {MAX_DURATION} seconds",
    f"Silence Threshold (top_db): {TOP_DB_THRESHOLD} dB",
    f"Minimum Active Duration to Keep: {MIN_ACTIVE_DURATION} seconds",
)
for banner_line in settings_banner:
    print(banner_line)

# Only .wav files directly inside SOURCE_DIR are processed; each output keeps
# the name of its input file.
wav_filenames = [entry for entry in os.listdir(SOURCE_DIR) if entry.endswith(".wav")]

for filename in wav_filenames:
    print(f"\nProcessing: {filename}")

    input_file = os.path.join(SOURCE_DIR, filename)
    output_file = os.path.join(OUTPUT_DIR, filename)

    # Trim, filter, normalize and save (or skip) this clip.
    trim_with_librosa(
        input_file,
        output_file,
        MAX_DURATION,
        TOP_DB_THRESHOLD,
        MIN_ACTIVE_DURATION,
        SILENCE_RMS_THRESHOLD_DB,
    )

print("\n--- Process Complete ---")

--- Starting Librosa-based Dynamic Audio Trimming and Filtering Process ---
Source Directory: vs_release_16k/audio_16k
Output Directory: audio/audio_processed
Maximum Duration: 1.0 seconds
Silence Threshold (top_db): 40 dB
Minimum Active Duration to Keep: 0.2 seconds

Processing: f1236_0_sigh.wav
Trimmed Core (1.472s) -> MAX DURATION (1.0s).

Processing: f1541_0_throatclearing.wav
Trimmed Core (2.336s) -> MAX DURATION (1.0s).

Processing: m1679_0_sniff.wav
Trimmed Core (3.072s) -> MAX DURATION (1.0s).

Processing: m1783_0_throatclearing.wav
Trimmed Core (1.856s) -> MAX DURATION (1.0s).

Processing: f1291_0_laughter.wav
Trimmed Core (2.136s) -> MAX DURATION (1.0s).

Processing: m0209_0_sneeze.wav
Kept natural duration: 0.608s (less than 1.0s max).

Processing: f1284_0_laughter.wav
Trimmed Core (1.856s) -> MAX DURATION (1.0s).

Processing: m1037_0_sigh.wav
Trimmed Core (1.216s) -> MAX DURATION (1.0s).

Processing: m0019_0_cough.wav
Trimmed Core (2.475s) -> MAX DURATION (1.0s).

Processin

#### Display Sample Audio

Each clip should be trimmed, between 0.2 and 1 second long, and not silent.

In [13]:
AUDIO_DIR = "audio/audio_processed"
N_SAMPLES = 25 # The target number of files to sample

# File gathering: every regular .wav file directly inside AUDIO_DIR
all_audio_paths = [
    f for f in glob(os.path.join(AUDIO_DIR, '*.*'))
    if os.path.isfile(f) and f.lower().endswith(('.wav'))
]

num_total_files = len(all_audio_paths)

# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the number of clips actually available.
sample_size = min(N_SAMPLES, num_total_files)
if sample_size < N_SAMPLES:
    print(f"Only {num_total_files} .wav file(s) found in {AUDIO_DIR}; showing all of them.")

sampled_files = random.sample(all_audio_paths, sample_size)

for i, file_path in enumerate(sampled_files):
    filename = os.path.basename(file_path)

    # Display the filename as a header
    print(f"{i+1}. {filename}")

    # Display the in-line audio player widget
    display(Audio(filename=file_path))

    # Horizontal rule between players
    display(Markdown("---"))


1. f2872_0_cough.wav


---

2. m0023_0_sigh.wav


---

3. f3064_0_laughter.wav


---

4. f2462_0_sigh.wav


---

5. f0061_0_laughter.wav


---

6. m2190_0_sigh.wav


---

7. m3224_0_sniff.wav


---

8. f1480_0_sigh.wav


---

9. m1169_0_throatclearing.wav


---

10. m2727_0_sigh.wav


---

11. m2797_0_sniff.wav


---

12. m3302_0_cough.wav


---

13. m1664_0_laughter.wav


---

14. f0737_0_sniff.wav


---

15. m2790_0_laughter.wav


---

16. m1150_0_sigh.wav


---

17. f1344_0_sigh.wav


---

18. m1920_0_sneeze.wav


---

19. f1803_0_sigh.wav


---

20. f2459_0_laughter.wav


---

21. f3215_0_sigh.wav


---

22. m2341_0_sneeze.wav


---

23. m1823_0_cough.wav


---

24. m2343_0_sniff.wav


---

25. f0689_0_sigh.wav


---

### Create Spectrograms

For our visualization, we want to display each audio clip to the user. We follow a similar style to the Bird Sounds Visualization, using a spectrogram. To make the images look like thin, dark lines on a white background, we aggressively limit the dynamic range.

In [16]:
def create_sparse_spectrogram_image(audio_filepath, output_image_filepath, sr=16000, n_fft=2048, hop_length=256, n_mels=128, db_range_min=-30, figsize=(2, 2), dpi=100):
    """Render one audio clip as a borderless, high-contrast mel spectrogram image.

    The clip is loaded as mono at ``sr``, converted to a mel power spectrogram
    and then to dB relative to the clip's own maximum (so the loudest bin is
    0 dB). The colour scale is clamped to ``[db_range_min, 0]`` dB and drawn
    with the 'Greys' colormap, then saved without axes or padding.

    Args:
        audio_filepath: Path of the audio file to read.
        output_image_filepath: Path the image is written to.
        sr: Sample rate the audio is loaded at.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop length (samples) between spectrogram frames.
        n_mels: Number of mel bands.
        db_range_min: Lower bound of the colour scale in dB (``vmin``).
        figsize: Figure size in inches, ``(width, height)``.
        dpi: Figure resolution in dots per inch.

    Returns:
        None. The image is written to ``output_image_filepath``.
    """
    audio, sr = librosa.load(audio_filepath, sr=sr, mono=True)

    # Compute Mel Spectrogram
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    # Convert to log-amplitude (dB). Use np.max(S) as the reference (0 dB)
    S_db = librosa.power_to_db(S, ref=np.max)

    # Aggressive Dynamic Range Compression
    # VMAX is always 0 (loudest point in the clip).
    # VMIN is the lower clamp of the colour scale.
    VMAX = 0
    VMIN = db_range_min

    # Work on an explicit Figure/Axes pair instead of pyplot's implicit
    # "current figure", and always close it: this function is called once per
    # clip, so a figure left open on an error would accumulate in memory.
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    try:
        # 'Greys' colormap with the strict VMIN/VMAX range
        librosa.display.specshow(S_db, sr=sr, hop_length=hop_length,
                                 x_axis=None, y_axis=None,
                                 cmap='Greys',
                                 vmin=VMIN,
                                 vmax=VMAX,
                                 ax=ax)

        # Strip axes and margins so the image is only the spectrogram itself.
        fig.tight_layout(pad=0)
        ax.set_axis_off()
        fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
        fig.savefig(output_image_filepath, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)

In [18]:
# --- Configuration for spectrogram image generation ---
SOURCE_AUDIO_DIR = "audio/audio_processed"
OUTPUT_IMAGE_DIR = "audio/sparse_spectrograms"

# Figure size in inches is derived from the pixel size and the DPI.
IMAGE_SIZE_PX = 200
IMAGE_DPI = 100
FIG_SIZE_INCHES = IMAGE_SIZE_PX / IMAGE_DPI

# SPECTROGRAM PARAMETERS
# Lower bound of the colour scale in dB, passed as db_range_min
# (-30 dB for sparse, thin lines)
SPARSENESS_THRESHOLD_DB = -30
# Hop length in samples between spectrogram frames (smaller = finer time resolution)
SPECTROGRAM_HOP_LENGTH = 256

print("--- Starting Sparse Spectrogram Image Generation ---")

os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)
print(f"Output Image Directory: {OUTPUT_IMAGE_DIR}")
print(f"Sparseness Threshold (VMIN): {SPARSENESS_THRESHOLD_DB} dB")

# One PNG per .wav file, named after the clip (foo.wav -> foo.png).
wav_filenames = [entry for entry in os.listdir(SOURCE_AUDIO_DIR) if entry.endswith(".wav")]

for filename in wav_filenames:
    audio_filepath = os.path.join(SOURCE_AUDIO_DIR, filename)
    clip_stem, _ = os.path.splitext(filename)
    image_filename = clip_stem + ".png"
    output_image_filepath = os.path.join(OUTPUT_IMAGE_DIR, image_filename)

    create_sparse_spectrogram_image(
        audio_filepath,
        output_image_filepath,
        hop_length=SPECTROGRAM_HOP_LENGTH,
        db_range_min=SPARSENESS_THRESHOLD_DB,
        figsize=(FIG_SIZE_INCHES, FIG_SIZE_INCHES),
        dpi=IMAGE_DPI,
    )

print("\n--- Sparse Spectrogram Generation Complete ---")

--- Starting Sparse Spectrogram Image Generation ---
Output Image Directory: audio/sparse_spectrograms
Sparseness Threshold (VMIN): -30 dB

--- Sparse Spectrogram Generation Complete ---
