# Sauim bicolor Vocalization Classifier and Analysis

This notebook demonstrates how to use the Perch bird vocalization classifier model, loaded through TensorFlow Hub, to extract embeddings from audio recordings of the sauim (*Saguinus bicolor*), a primate, and how to analyze the extracted embeddings, for example with hierarchical clustering.

The notebook covers the following steps:

1.  **Setup**: Installing necessary libraries and loading the Perch model.
2.  **Data Loading**: Downloading example audio files and a CSV file containing labels.
3.  **Feature Extraction**: Defining and using a function to extract embeddings and spectrograms from audio segments.
4.  **Spectrogram Visualization**: Plotting spectrograms to visualize the audio data.
5.  **Clustering Analysis**: Performing hierarchical clustering on the extracted embeddings and visualizing the results as a dendrogram.

In [1]:
%pip install tensorflow_hub -q

In [2]:
import csv
import librosa
import gdown
import zipfile
import shutil
import os
import soundfile as sf
from tqdm.notebook import trange, tqdm
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import LabelEncoder
import tensorflow_hub as hub
import tensorflow as tf
# Switch on NumPy-style behavior for TensorFlow tensors. extract_embeddings()
# transposes the model's 'frontend' output with `.T`, which presumably depends on it.
tf.experimental.numpy.experimental_enable_numpy_behavior()

# Fetch the Perch bird-vocalization classifier (version 8) and keep it in `model`,
# the global that the feature-extraction code calls.
model = hub.load(
    'https://www.kaggle.com/models/google/bird-vocalization-classifier/TensorFlow2/bird-vocalization-classifier/8'
)

In [3]:
# Start from a clean slate: delete every folder a previous run may have left behind.
#   'Parque do Mindú', 'Parque_do_Mindu_38' -> extracted archives (a leftover copy
#       makes the later `unzip` meet already-existing files)
#   'MinduPark', 'MinduPark_38'             -> generated embedding folders (leftover
#       files from an earlier run would otherwise end up in the new zip)
for stale_dir in ('Parque do Mindú', 'Parque_do_Mindu_38', 'MinduPark', 'MinduPark_38'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)
        print(f"Folder '{stale_dir}' removed.")

Automatically download the necessary files

In [4]:
!wget https://github.com/juancolonna/Sauim/raw/main/records/records.csv -O records.csv
!wget https://github.com/juancolonna/Sauim/raw/main/records/Sauim.wav -O Sauim.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Anurans.wav -O Anurans.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Anthrophony.wav -O Anthrophony.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Background.wav -O Background.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Birds.wav -O Birds.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Geophony.wav -O Geophony.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Mindu_Saguinus%20bicolor_02.02.19-000.wav -O Mindu_Saguinus_bicolor_02.02.19-000.wav
!wget https://github.com/juancolonna/Sauim/raw/main/records/Mindu_Saguinus_bicolor_02.02.19-000_bandpass_filtered.wav -O Mindu_Saguinus_bicolor_02.02.19-000_bandpass_filtered.wav


!wget https://github.com/juancolonna/Sauim/raw/main/records/Parque_do_Mindú.zip -O Parque_do_Mindú.zip
!unzip -q Parque_do_Mindú.zip

!wget https://github.com/juancolonna/Sauim/raw/main/records/Parque_do_Mindu_38.zip -O Parque_do_Mindu_38.zip
!unzip -q Parque_do_Mindu_38.zip -d Parque_do_Mindu_38

--2025-08-27 02:33:54--  https://github.com/juancolonna/Sauim/raw/main/records/records.csv
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/juancolonna/Sauim/main/records/records.csv [following]
--2025-08-27 02:33:54--  https://raw.githubusercontent.com/juancolonna/Sauim/main/records/records.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4060 (4.0K) [text/plain]
Saving to: ‘records.csv’


2025-08-27 02:33:55 (63.4 MB/s) - ‘records.csv’ saved [4060/4060]

--2025-08-27 02:33:55--  https://github.com/juancolonna/Sauim/raw/main/records/Sauim.wav
Resolving github.com (github.com)... 20.205.24

In [5]:
def extract_embeddings(file: str, wlen: int, hop_len: int):
    """
    Extracts embedding vectors and spectrograms from an audio file.

    The audio is loaded at 32 kHz, peak-normalized, and cut into windows of
    ``wlen`` samples whose start positions are ``hop_len`` samples apart.
    Every window that fits completely inside the signal (including one that
    ends exactly on the last sample) is passed to the globally loaded Perch
    ``model``. A trailing piece shorter than ``wlen`` is discarded.

    Args:
        file (str): Path to the audio file.
        wlen (int): Window length in samples. Must be positive.
        hop_len (int): Hop length in samples. Must be positive.

    Returns:
        tuple: (list of embedding vectors, list of spectrogram arrays), with
        one entry per window. Both lists are empty when the audio is shorter
        than ``wlen``.

    Raises:
        ValueError: If ``wlen`` or ``hop_len`` is not positive.

    Example:
        >>> sr = 32000  # Sampling rate in Hz
        >>> window_len = int(5 * sr)  # 5-second window
        >>> emb, specs = extract_embeddings('Sauim.wav', wlen=window_len, hop_len=window_len)
        >>> np.save('sauim_embedding_vectors.npy', emb)
        >>> np.save('sauim_spectrograms.npy', specs)
    """
    if wlen <= 0 or hop_len <= 0:
        raise ValueError("wlen and hop_len must be positive numbers of samples")

    y, _ = librosa.load(file, sr=32000)  # Load audio at 32 kHz

    # Normalize amplitude. A silent or empty file has no peak to divide by,
    # so it is left untouched instead of producing NaNs or raising.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak

    embedding_vectors = []
    spectrograms = []

    # Iterate only over start offsets whose window fits inside the signal, so the
    # progress bar total equals the number of windows actually processed.
    last_start = len(y) - wlen
    for start in tqdm(range(0, last_start + 1, hop_len)):
        segment = y[start:start + wlen]
        model_outputs = model.infer_tf(segment[np.newaxis, :])  # add a leading batch axis
        embedding_vectors.append(model_outputs['embedding'].numpy()[0])
        spectrograms.append(model_outputs['frontend'][0].T.numpy())

    return embedding_vectors, spectrograms

def plot_spectrogram(species: str, spectrograms, sr: int = 32000, hop_length: int = 256 + 64):
    """
    Plots a Mel spectrogram in dB scale for a given species.

    Args:
        species (str): Species name, shown in the figure title.
        spectrograms (ndarray): Spectrogram array of a single segment.
        sr (int): Sampling rate in Hz. Defaults to 32000.
        hop_length (int): Number of samples between consecutive spectrogram
            frames, used to scale the time axis. Defaults to 320.

    Example:
        >>> sr = 32000
        >>> window_len = int(5 * sr)
        >>> emb, specs = extract_embeddings('Sauim.wav', wlen=window_len, hop_len=window_len)
        >>> # Plot the first extracted spectrogram
        >>> plot_spectrogram("Saguinus bicolor", specs[0], sr)
    """
    # Imported explicitly because librosa releases before 0.10 do not load the
    # `display` submodule on a plain `import librosa`.
    import librosa.display

    # Convert to decibels relative to the maximum value.
    # NOTE(review): this treats the input as power values; confirm the scale of
    # the model's 'frontend' output.
    S_dB = librosa.power_to_db(spectrograms, ref=np.max)

    plt.figure(figsize=(10, 3))
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel',
                            hop_length=hop_length,  # needed for a correct time axis
                            cmap='viridis')

    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram of a 5-second segment of {species}')
    plt.xlabel('Time (s)')
    plt.ylabel('Mel Frequency (Hz)')

    plt.tight_layout()
    plt.show()

Extract embeddings and spectrograms from the 'Sauim.wav' audio file and save them as NumPy arrays.

In [6]:
# Audio is processed at 32 kHz, in analysis windows that span five seconds.
sr = 32_000
window_len = 5 * sr  # samples per window

In [None]:
# Run the model over consecutive, non-overlapping windows of the sauim recording
# (the hop equals the window length).
sauim_embedding_vectors, sauim_spectrograms = extract_embeddings(
    'Sauim.wav',
    wlen=window_len,
    hop_len=window_len,
)

# Persist both results as .npy files
np.save(file='sauim_embedding_vectors.npy', arr=sauim_embedding_vectors)
np.save(file='sauim_spectrograms.npy', arr=sauim_spectrograms)

In [None]:
# Show the third extracted sauim segment.
plot_spectrogram(species='sauim call', spectrograms=sauim_spectrograms[2])

Background noise segments for anomaly data

In [None]:
# Embed Background.wav in consecutive, non-overlapping windows (hop == window length).
# No placeholder lists are needed: extract_embeddings() returns both results.
anomaly_embedding_vectors, anomaly_spectrograms = extract_embeddings('Background.wav',
                                                     wlen=window_len,
                                                     hop_len=window_len)

np.save('background_embedding_vectors.npy', anomaly_embedding_vectors)
np.save('background_spectrograms.npy', anomaly_spectrograms)

In [None]:
# Show the first background-noise segment.
plot_spectrogram(species='background noise', spectrograms=anomaly_spectrograms[0])

Anuran calls used as negative class

In [None]:
# Embed Anurans.wav with a hop of half a window, i.e. 50 % overlap between windows.
# No placeholder lists are needed: extract_embeddings() returns both results.
anomaly_anurans_vectors, anomaly_anurans_spectrograms = extract_embeddings('Anurans.wav',
                                                     wlen=window_len,
                                                     hop_len=int(window_len/2))

np.save('anurans_embedding_vectors.npy', anomaly_anurans_vectors)
np.save('anurans_spectrograms.npy', anomaly_anurans_spectrograms)

In [None]:
# Show the first anuran segment.
plot_spectrogram(species='anuran call', spectrograms=anomaly_anurans_spectrograms[0])

Bird calls used as negative class

In [None]:
# Embed Birds.wav in consecutive, non-overlapping windows (hop == window length).
# No placeholder lists are needed: extract_embeddings() returns both results.
anomaly_birds_vectors, anomaly_birds_spectrograms = extract_embeddings('Birds.wav',
                                                     wlen=window_len,
                                                     hop_len=window_len)

np.save('birds_embedding_vectors.npy', anomaly_birds_vectors)
np.save('birds_spectrograms.npy', anomaly_birds_spectrograms)

In [None]:
# Show the first bird segment.
plot_spectrogram(species='birds call', spectrograms=anomaly_birds_spectrograms[0])

A soundscape recording used to detect new sauim vocalizations

In [None]:
step = 1  # seconds between the starts of consecutive windows
hop_len = int(step*sr)  # sliding window: one window_len long, advanced in 1 s steps

# No placeholder lists are needed: extract_embeddings() returns both results.
soundscape_vectors, soundscape_spectrograms = extract_embeddings('Mindu_Saguinus_bicolor_02.02.19-000.wav',
                                                     wlen=window_len,
                                                     hop_len=hop_len)

np.save('soundscape_embedding_vectors.npy', soundscape_vectors)
np.save('soundscape_spectrograms.npy', soundscape_spectrograms)

In [7]:
step = 1  # seconds between the starts of consecutive windows
hop_len = int(step*sr)  # sliding window: one window_len long, advanced in 1 s steps

# Same extraction as for the raw soundscape, applied to the band-pass-filtered file.
# No placeholder lists are needed: extract_embeddings() returns both results.
soundscape_vectors_filtered, soundscape_spectrograms_filtered = extract_embeddings('Mindu_Saguinus_bicolor_02.02.19-000_bandpass_filtered.wav',
                                                     wlen=window_len,
                                                     hop_len=hop_len)

np.save('soundscape_embedding_vectors_filtered.npy', soundscape_vectors_filtered)
np.save('soundscape_spectrograms_filtered.npy', soundscape_spectrograms_filtered)

  0%|          | 0/564 [00:00<?, ?it/s]

In [None]:
step = 2  # seconds between the starts of consecutive windows
hop_len = int(step*sr)  # sliding window: one window_len long, advanced in 2 s steps

# No placeholder lists are needed: extract_embeddings() returns both results.
anthrophony_vectors, anthrophony_spectrograms = extract_embeddings('Anthrophony.wav',
                                                     wlen=window_len,
                                                     hop_len=hop_len)

np.save('anthrophony_embedding_vectors.npy', anthrophony_vectors)
np.save('anthrophony_spectrograms.npy', anthrophony_spectrograms)

In [None]:
step = 2  # seconds between the starts of consecutive windows
hop_len = int(step*sr)  # sliding window: one window_len long, advanced in 2 s steps

# No placeholder lists are needed: extract_embeddings() returns both results.
geophony_vectors, geophony_spectrograms = extract_embeddings('Geophony.wav',
                                                     wlen=window_len,
                                                     hop_len=hop_len)

np.save('geophony_embedding_vectors.npy', geophony_vectors)
np.save('geophony_spectrograms.npy', geophony_spectrograms)

## Audio files for the Occupancy model

In [None]:
os.makedirs("MinduPark", exist_ok=True)  # cria se não existir

step = 2
hop_len = int(step*sr) # emulates an sliding window of 5 sec length and 1 sec step

for i, file in enumerate(os.listdir('Parque do Mindú')):
    MinduPark_vectors = []
    MinduPark_spectrograms = []

    MinduPark_vectors, MinduPark_spectrograms = extract_embeddings("Parque do Mindú/"+file,
                                                        wlen=window_len,
                                                        hop_len=hop_len)

    np.save('MinduPark/MinduPark_embedding_vectors_'+str(i)+'.npy', MinduPark_vectors)
    # np.save('MinduPark/MinduPark_spectrograms_'+str(i)+'.npy', MinduPark_spectrograms)

!zip -r MinduPark.zip MinduPark

In [None]:
os.makedirs("MinduPark_38", exist_ok=True)  # cria se não existir

step = 1
hop_len = int(step*sr) # emulates an sliding window of 5 sec length and 1 sec step

for i, file in enumerate(os.listdir('Parque_do_Mindu_38/Parque do Mindú')):
    MinduPark_vectors = []
    MinduPark_spectrograms = []

    MinduPark_vectors, MinduPark_spectrograms = extract_embeddings("Parque_do_Mindu_38/Parque do Mindú/"+file,
                                                        wlen=window_len,
                                                        hop_len=hop_len)

    np.save('MinduPark_38/MinduPark_embedding_vectors_'+str(i)+'.npy', MinduPark_vectors)

!zip -r MinduPark_38.zip MinduPark_38

Load labels, compute cosine distance matrix from embedding vectors, perform hierarchical clustering, and plot a dendrogram.

In [None]:
# 1. Load labels from CSV
records = pd.read_csv('records.csv')  # adjust path if needed
labels = records['Bosque_S.bicolor_12.4.19-000'].tolist()

# Every dendrogram leaf needs exactly one label. Fail early with a clear message
# instead of a shape error raised from inside the plotting call.
if len(labels) != len(sauim_embedding_vectors):
    raise ValueError(
        f"records.csv provides {len(labels)} labels but there are "
        f"{len(sauim_embedding_vectors)} sauim embedding vectors"
    )

# Encode labels as integers
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

# 2. Compute cosine distance matrix (condensed form)
cosine_distances = pdist(sauim_embedding_vectors, metric='cosine')

# 3. Perform hierarchical clustering.
# 'ward' (like 'centroid' and 'median') is only correctly defined for Euclidean
# distances, so average linkage is used with the precomputed cosine distances.
# 'complete' or 'single' are also valid choices with this metric.
Z = linkage(cosine_distances, method='average')

# 4. Plot dendrogram
plt.figure(figsize=(20, 5))
dendrogram(Z, labels=encoded_labels, leaf_font_size=10)
plt.xlabel("Records segments with sauim vocalizations")
plt.ylabel("Cosine distances")
plt.tight_layout()
# NOTE(review): the output name is spelled "dendongram"; kept unchanged in case
# something else refers to this file.
plt.savefig("dendongram.pdf", format='pdf', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# List, for every segment, the integer code next to the original label it stands for.
for code_and_label in zip(encoded_labels, labels):
    print(code_and_label)