## Import Libraries 📚

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch"

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Set interactive backend
%matplotlib inline


cmap = mpl.cm.get_cmap('coolwarm')

  cmap = mpl.cm.get_cmap('coolwarm')


### Data Exploration 💥💥

In [2]:
DATASET_PATH = 'content/birdclef-2024'

#### View the species name and construct a dictionary to hold their values 📃

In [3]:
class_names = sorted(os.listdir(f"{DATASET_PATH}/train_audio/"))
num_classes = len(class_names)
class_labels = list(range(num_classes))
label2name = dict(zip(class_labels, class_names))
name2label = {v:k for k,v in label2name.items()}

In [4]:
## Print out the first 5 items in the label2name and name2label dictionaries
print(f"Number of classes: {num_classes}")
print({k: label2name[k] for k in list(label2name)[:5]})
print({k: name2label[k] for k in list(name2label)[:5]})

Number of classes: 182
{0: 'asbfly', 1: 'ashdro1', 2: 'ashpri1', 3: 'ashwoo2', 4: 'asikoe2'}
{'asbfly': 0, 'ashdro1': 1, 'ashpri1': 2, 'ashwoo2': 3, 'asikoe2': 4}


### Load the dataframe 🔃

In [5]:
df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
df['filepath'] = DATASET_PATH + '/train_audio/' + df.filename
df['target'] = df.primary_label.map(name2label)
df['filename'] = df.filepath.map(lambda x: x.split('/')[-1])
df['xc_id'] = df.filepath.map(lambda x: x.split('/')[-1].split('.')[0])

## display a few rows
df.sample(5)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,filepath,target,xc_id
8424,crseag1,[],"['call', 'flight call']",11.4081,107.4119,Spilornis cheela,Crested Serpent-Eagle,Jelle Scharringa,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/665788,XC665788.ogg,content/birdclef-2024/train_audio/crseag1/XC66...,50,XC665788
24021,zitcis1,[],"['male', 'song']",37.4128,-5.916,Cisticola juncidis,Zitting Cisticola,José Carlos Sires,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/268680,XC268680.ogg,content/birdclef-2024/train_audio/zitcis1/XC26...,181,XC268680
15356,ingori1,[],"['adult', 'male', 'song']",21.7177,79.3275,Oriolus kundoo,Indian Golden Oriole,Rajgopal Patil,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/571750,XC571750.ogg,content/birdclef-2024/train_audio/ingori1/XC57...,88,XC571750
14861,houspa,[],['call'],37.1551,-7.6945,Passer domesticus,House Sparrow,Nelson Conceição,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/527803,XC527803.ogg,content/birdclef-2024/train_audio/houspa/XC527...,82,XC527803
6740,compea,[],['song'],28.5057,77.2202,Pavo cristatus,Indian Peafowl,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/116747,XC116747.ogg,content/birdclef-2024/train_audio/compea/XC116...,42,XC116747


### Function to retreive an audio file 🎵
**librosa is a python package for music and audio analysis. It provides the building blocks necessary to create music information retrieval systems**
[Documentation here](https://librosa.org/doc/latest/index.html)

In [6]:
## Load the audio as a waveform `y`
# Store the sampling rate as `sr`
def load_audio(filepath):
    audio, sr = librosa.load(filepath)
    return audio, sr

### Get the audio spectrogram 🌊. 
**A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs, voiceprints, or voicegrams**

In [7]:
# Define the sampling rate of the audio signal (32 kHz)
sample_rate = 32000

# Define the maximum frequency to include in the spectrogram (16 kHz)
fmax = 16000

# Define the minimum frequency to include in the spectrogram (20 Hz)
fmin = 20

# Function to compute the Mel-spectrogram of an audio signal
def get_spectrogram(audio):
    # Compute the Mel-spectrogram
    spec = librosa.feature.melspectrogram(
        y=audio,  # Input audio signal
        sr=sample_rate,  # Sampling rate of the audio
        n_mels=256,  # Number of Mel bands (frequency bins)
        n_fft=2048,  # Size of the FFT window (determines frequency resolution)
        hop_length=512,  # Number of samples between successive frames (determines time resolution)
        fmax=fmax,  # Maximum frequency to include in the spectrogram
        fmin=fmin,  # Minimum frequency to include in the spectrogram
    )

    # Convert the power spectrogram to decibel (dB) scale
    # This makes the values more perceptually meaningful
    spec = librosa.power_to_db(spec, ref=1.0)  # ref=1.0 is the reference value for dB calculation

    # Normalize the spectrogram to the range [0, 1]
    min_ = spec.min()  # Minimum value in the spectrogram
    max_ = spec.max()  # Maximum value in the spectrogram
    if max_ != min_:  # Avoid division by zero if the spectrogram is constant
        spec = (spec - min_) / (max_ - min_)  # Normalize using min-max scaling

    # Return the normalized Mel-spectrogram
    return spec

### Display a few audio files with spectograms and their associated df details ⚡

In [8]:
duration = 15
audio_len = duration * sample_rate
def display_audio(row):
    caption = f'Id: {row.filename} | Name: {row.common_name} | Sci.Name: {row.scientific_name}'
    
    audio, sr = load_audio(row.filepath)
    audio = audio[:audio_len]
    spec = get_spectrogram(audio)
    
    # Audio output widget
    audio_output = widgets.Output()
    with audio_output:
        display(ipd.Audio(audio, rate=sample_rate))
    
    # Plot output widget
    plot_output = widgets.Output()
    with plot_output:
        fig, ax = plt.subplots(2, 1, figsize=(12, 6), sharex=True, tight_layout=True)
        fig.suptitle(caption)
        
        # Plot waveform
        lid.waveshow(audio, sr=sample_rate, ax=ax[0], color='b')
        
        # Plot spectrogram
        lid.specshow(spec, sr=sample_rate, hop_length=512, n_fft=2048,
                     fmin=fmin, fmax=fmax, x_axis='time', y_axis='mel', 
                     cmap='coolwarm', ax=ax[1])
        
        ax[0].set_xlabel('')
        plt.show()

    # Display side-by-side
    display(widgets.HBox([audio_output, plot_output]))


In [9]:
## Display a few audio samples
for i in range(3):
    display_audio(df.sample(1).iloc[0])

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))

### Build a decoder parse files into spectrograms🚀 

**The build_decoder() function constructs a decoder that can process audio files into spectrograms.
It loads, normalizes, and converts the audio into a Mel-spectrogram.
If with_labels=True, it also converts labels into one-hot vectors.
The output is an RGB-like spectrogram image that can be used as input to CNNs.**
[Tensorflow Documentation here](https://www.tensorflow.org/io/api_docs/python/tfio/audio/spectrogram)

In [10]:
# Image and audio parameters
img_size = [128, 384]  # Spectrogram image size (height, width)
batch_size = 64  # Batch size for training
hop_length = audio_len // (img_size[1] - 1)  # Hop length for spectrogram computation
nfft = 2028  # FFT window size for computing the spectrogram

def build_decoder(with_labels=True, dim=1024):
    """
    Builds a function to decode and preprocess audio files into spectrograms.
    
    Parameters:
    - with_labels (bool): Whether to return labels along with spectrograms.
    - dim (int): Target audio length (number of samples).
    
    Returns:
    - Function to decode audio files (with or without labels).
    """

    def get_audio(filepath):
        """Loads and decodes an audio file from a given filepath."""
        file_bytes = tf.io.read_file(filepath)  # Read the audio file as bytes
        audio = tfio.audio.decode_vorbis(file_bytes)  # Decode .ogg Vorbis file
        audio = tf.cast(audio, tf.float32)  # Convert to float32

        # Convert stereo to mono by selecting only one channel
        if tf.shape(audio)[1] > 1:
            audio = audio[..., 0:1]
        audio = tf.squeeze(audio, axis=-1)  # Remove redundant dimensions
        return audio

    def crop_or_pad(audio, target_len, pad_mode="constant"):
        """Ensures the audio is of fixed length by either cropping or padding."""
        audio_len = tf.shape(audio)[0]  # Get current length of audio
        diff_len = abs(target_len - audio_len)  # Difference from target length

        if audio_len < target_len:
            # If audio is shorter, pad it randomly on both sides
            pad1 = tf.random.uniform([], maxval=diff_len, dtype=tf.int32)
            pad2 = diff_len - pad1
            audio = tf.pad(audio, paddings=[[pad1, pad2]], mode=pad_mode)

        elif audio_len > target_len:
            # If audio is longer, randomly crop a section
            idx = tf.random.uniform([], maxval=diff_len, dtype=tf.int32)
            audio = audio[idx : (idx + target_len)]

        return tf.reshape(audio, [target_len])  # Ensure fixed shape

    def apply_preproc(spec):
        """Applies standardization and normalization to the spectrogram."""
        # Standardization: Zero mean and unit variance
        mean = tf.math.reduce_mean(spec)
        std = tf.math.reduce_std(spec)
        spec = tf.where(tf.math.equal(std, 0), spec - mean, (spec - mean) / std)

        # Min-Max Normalization: Scale values between 0 and 1
        min_val = tf.math.reduce_min(spec)
        max_val = tf.math.reduce_max(spec)
        spec = tf.where(
            tf.math.equal(max_val - min_val, 0), 
            spec - min_val, 
            (spec - min_val) / (max_val - min_val)
        )

        return spec

    def get_target(target):
        """Converts a label into a one-hot encoded vector."""
        target = tf.reshape(target, [1])  # Reshape to single element tensor
        target = tf.cast(tf.one_hot(target, num_classes), tf.float32)  # One-hot encoding
        return tf.reshape(target, [num_classes])  # Reshape to match the output format

    def decode(path):
        """Processes an audio file into a spectrogram image."""
        # Load and preprocess the audio
        audio = get_audio(path)
        audio = crop_or_pad(audio, dim)  # Ensure fixed length
        
        # Convert audio to a Mel-spectrogram
        spec = keras.layers.MelSpectrogram(
            num_mel_bins=img_size[0],  # Number of Mel frequency bins (height of image)
            fft_length=nfft,  # FFT window size
            sequence_stride=hop_length,  # Step size between spectrogram columns
            sampling_rate=sample_rate,  # Sample rate of audio
        )(audio)

        spec = apply_preproc(spec)  # Apply normalization and standardization
        
        # Convert spectrogram into a 3-channel image (for compatibility with CNNs)
        spec = tf.tile(spec[..., None], [1, 1, 3])  # Repeat values along the last axis
        return tf.reshape(spec, [*img_size, 3])  # Reshape to (height, width, 3)

    def decode_with_labels(path, label):
        """Processes an audio file into a spectrogram and returns it with its label."""
        return decode(path), get_target(label)

    return decode_with_labels if with_labels else decode


### Image Augmentation ♻
##### augmentation involves applying a variety of transformations to the original dataset, generating new samples that are similar but not identical to the original data. Common augmentations include rotation, flipping, scaling, changes in brightness and contrast, color space adjustments, and geometric transformations

In [11]:
def build_augmenter():
    """
    Creates an augmentation pipeline for spectrogram images.
    Uses MixUp, time masking, and frequency masking to improve model generalization.
    
    Returns:
        A function that applies random augmentations to images and labels.
    """

    # Define a list of augmentation techniques to apply
    augmenters = [
        keras_cv.layers.MixUp(alpha=0.4),  # MixUp augmentation for blending two images
        keras_cv.layers.RandomCutout(
            height_factor=(1.0, 1.0), width_factor=(0.06, 0.12)
        ),  # Time-masking: Randomly removes sections along the time axis
        keras_cv.layers.RandomCutout(
            height_factor=(0.06, 0.1), width_factor=(1.0, 1.0)
        ),  # Frequency-masking: Randomly removes sections along the frequency axis
    ]

    def augment(img, label):
        """
        Applies the augmentation pipeline to an image-label pair.

        Args:
            img (tf.Tensor): Input spectrogram image.
            label (tf.Tensor): Corresponding label for the image.

        Returns:
            Augmented image and label.
        """

        # Wrap image and label in a dictionary for compatibility with keras_cv augmenters
        data = {"images": img, "labels": label}

        # Apply augmentations with a 35% probability for each augmenter
        for augmenter in augmenters:
            if tf.random.uniform([]) < 0.35:
                data = augmenter(data, training=True)

        # Extract and return augmented image and label
        return data["images"], data["labels"]

    return augment

### Build the dataset for training 💰


In [12]:
seed = 42
def build_dataset(
    paths, 
    labels=None, 
    batch_size=32,
    decode_fn=None, 
    augment_fn=None, 
    cache=True,
    augment=False, 
    shuffle=2048
):
    """
    Builds a TensorFlow dataset pipeline for audio processing.

    Args:
        paths (list or tf.Tensor): List of file paths to audio files.
        labels (list or tf.Tensor, optional): Corresponding labels for classification. Defaults to None.
        batch_size (int, optional): Number of samples per batch. Defaults to 32.
        decode_fn (function, optional): Function to decode audio files. Defaults to None.
        augment_fn (function, optional): Function to apply augmentations. Defaults to None.
        cache (bool, optional): Whether to cache the dataset in memory. Defaults to True.
        augment (bool, optional): Whether to apply data augmentation. Defaults to False.
        shuffle (int or bool, optional): Buffer size for shuffling. Set to False to disable shuffling. Defaults to 2048.

    Returns:
        tf.data.Dataset: Preprocessed dataset ready for training.
    """

    # Use default decoder if none is provided
    if decode_fn is None:
        decode_fn = build_decoder(with_labels=(labels is not None), dim=audio_len)

    # Use default augmentation function if none is provided
    if augment_fn is None:
        augment_fn = build_augmenter()

    # Set automatic tuning for dataset performance optimization
    AUTO = tf.data.experimental.AUTOTUNE

    # Create dataset from file paths (with or without labels)
    slices = (paths,) if labels is None else (paths, labels)
    print(f"Labels: {labels}")
    ds = tf.data.Dataset.from_tensor_slices(slices)

    # Apply decoding function to process audio files
    ds = ds.map(decode_fn, num_parallel_calls=AUTO)

    # Cache dataset in memory to speed up subsequent iterations
    if cache:
        ds = ds.cache()

    # Shuffle dataset if required
    if shuffle:
        opt = tf.data.Options()
        ds = ds.shuffle(shuffle, seed=seed)  # Shuffle with seed for reproducibility
        opt.experimental_deterministic = False  # Improve performance by allowing non-deterministic order
        ds = ds.with_options(opt)

    # Batch dataset with a fixed size, ensuring even batch sizes
    ds = ds.batch(batch_size, drop_remainder=True)

    # Apply augmentation if enabled
    if augment:
        ds = ds.map(augment_fn, num_parallel_calls=AUTO)

    # Prefetch data to improve training performance
    ds = ds.prefetch(AUTO)

    return ds


### Split the dataset to a test and train set 🚂
***We used a test size of 0.2***

In [13]:
## Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size=0.2)

print(f"Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")


Num Train: 19567 | Num Valid: 4892


In [None]:
# Prepare training dataset
train_paths = train_df.filepath.values  # Extract file paths from training DataFrame
train_labels = train_df.target.values   # Extract corresponding labels

train_ds = build_dataset(
    paths=train_paths, 
    labels=train_labels, 
    batch_size=batch_size,
    shuffle=True,  # Enable shuffling for training dataset
    augment=True  # Apply augmentation for training dataset
)

# Prepare validation dataset
valid_paths = valid_df.filepath.values  # Extract file paths from validation DataFrame
valid_labels = valid_df.target.values   # Extract corresponding labels

valid_ds = build_dataset(
    paths=valid_paths, 
    labels=valid_labels, 
    batch_size=batch_size,
    shuffle=False,  # No shuffling for validation to ensure consistency
    augment=False  # No augmentation for validation dataset
)
