## Import Libraries 📚

In [2]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch"

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Render matplotlib figures as static images inline in the notebook output
%matplotlib inline


# Look the colormap up through the `mpl.colormaps` registry:
# `mpl.cm.get_cmap` is deprecated since Matplotlib 3.7 and removed in 3.9.
cmap = mpl.colormaps['coolwarm']

  cmap = mpl.cm.get_cmap('coolwarm')


### Data Exploration 💥💥

In [3]:
# Root folder of the BirdCLEF 2024 dataset (relative to the working directory)
DATASET_PATH = "content/birdclef-2024"

#### View the species name and construct a dictionary to hold their values 📃

In [4]:
# Each sub-folder of train_audio/ is one species code; sorting keeps ids stable across runs
class_names = sorted(os.listdir(f"{DATASET_PATH}/train_audio/"))
num_classes = len(class_names)
class_labels = list(range(num_classes))

# Integer id -> species code, and the inverse lookup
label2name = dict(enumerate(class_names))
name2label = {name: idx for idx, name in label2name.items()}

In [5]:
## Preview the class count and the first 5 entries of each lookup table
print(f"Number of classes: {num_classes}")
print(dict(list(label2name.items())[:5]))
print(dict(list(name2label.items())[:5]))

Number of classes: 182
{0: 'asbfly', 1: 'ashdro1', 2: 'ashpri1', 3: 'ashwoo2', 4: 'asikoe2'}
{'asbfly': 0, 'ashdro1': 1, 'ashpri1': 2, 'ashwoo2': 3, 'asikoe2': 4}


### Load the dataframe 🔃

In [6]:
# Read the metadata and derive the helper columns in one chain.
# Each callable sees the columns produced by the entries above it.
df = (
    pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
    .assign(
        # Full path to the audio file on disk
        filepath=lambda d: DATASET_PATH + '/train_audio/' + d.filename,
        # Integer class id of the primary species
        target=lambda d: d.primary_label.map(name2label),
        # Bare file name without the species folder
        filename=lambda d: d.filepath.map(lambda p: p.rsplit('/', 1)[-1]),
        # File name up to the first dot
        xc_id=lambda d: d.filename.map(lambda name: name.split('.')[0]),
    )
)

## display a few rows
df.sample(5)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,filepath,target,xc_id
23106,whiter2,[],[''],40.6828,0.8371,Chlidonias hybrida,Whiskered Tern,Pere Josa,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/761899,XC761899.ogg,content/birdclef-2024/train_audio/whiter2/XC76...,174,XC761899
16822,lirplo,[],[''],40.071,-4.288,Charadrius dubius,Little Ringed Plover,Javier García Sáez,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/735108,XC735108.ogg,content/birdclef-2024/train_audio/lirplo/XC735...,105,XC735108
12879,grtdro1,['asikoe2'],['song'],13.8549,100.4735,Dicrurus paradiseus,Greater Racket-tailed Drongo,Werzik,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/396411,XC396411.ogg,content/birdclef-2024/train_audio/grtdro1/XC39...,73,XC396411
21712,spoowl1,[],['call'],12.9912,80.2363,Athene brama,Spotted Owlet,Vivek Puliyeri,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/276818,XC276818.ogg,content/birdclef-2024/train_audio/spoowl1/XC27...,156,XC276818
414,asikoe2,[],"['call', 'female', 'male']",8.7055,81.1875,Eudynamys scolopaceus,Asian Koel,Hugo Wieleman,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/330496,XC330496.ogg,content/birdclef-2024/train_audio/asikoe2/XC33...,4,XC330496


### Function to retrieve an audio file 🎵
**librosa is a python package for music and audio analysis. It provides the building blocks necessary to create music information retrieval systems**
[Documentation here](https://librosa.org/doc/latest/index.html)

In [7]:
## Load the audio as a waveform and return it with its sampling rate
def load_audio(filepath, sr=32000):
    """Load an audio file as a waveform at a known sampling rate.

    `librosa.load` resamples to 22050 Hz when `sr` is left at its default,
    while the rest of this notebook (playback, spectrograms, clip length)
    works at 32 kHz. The target rate is therefore passed explicitly. The
    default is a literal because the notebook-level `sample_rate` constant is
    defined in a later cell.

    Parameters:
    - filepath (str): Path of the audio file to read.
    - sr (int | None): Sampling rate to resample to; `None` keeps the file's
      native rate.

    Returns:
    - (audio, sr): The waveform and the sampling rate it is expressed in.
    """
    audio, sr = librosa.load(filepath, sr=sr)
    return audio, sr

### Get the audio spectrogram 🌊. 
**A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs, voiceprints, or voicegrams**

In [8]:
# Sampling rate assumed for every audio signal in this notebook (32 kHz)
sample_rate = 32000

# Highest frequency kept in the Mel-spectrogram (16 kHz)
fmax = 16000

# Lowest frequency kept in the Mel-spectrogram (20 Hz)
fmin = 20

def get_spectrogram(audio):
    """Compute a min-max normalized, dB-scaled Mel-spectrogram of `audio`.

    Parameters:
    - audio: Waveform passed to `librosa.feature.melspectrogram`; it is
      interpreted at `sample_rate`.

    Returns:
    - The dB-scaled Mel-spectrogram rescaled to the range [0, 1]. A constant
      spectrogram (for example pure silence) is returned as all zeros so the
      output never leaves that range.
    """
    # Mel power spectrogram: 256 Mel bands, 2048-sample FFT window, 512-sample hop
    spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_mels=256,
        n_fft=2048,
        hop_length=512,
        fmax=fmax,
        fmin=fmin,
    )

    # Power -> decibels, relative to a reference power of 1.0
    spec = librosa.power_to_db(spec, ref=1.0)

    # Min-max scaling to [0, 1]
    min_ = spec.min()
    max_ = spec.max()
    if max_ != min_:
        spec = (spec - min_) / (max_ - min_)
    else:
        # Constant input: dividing would be 0/0, so shift to zero instead of
        # handing raw dB values back to the caller.
        spec = spec - min_

    return spec

### Display a few audio files with spectrograms and their associated df details ⚡

In [9]:
duration = 15  # Seconds of audio shown per recording
audio_len = duration * sample_rate  # Clip length in samples at `sample_rate`
def display_audio(row):
    """Show an audio player next to the waveform and Mel-spectrogram of one recording.

    Parameters:
    - row: A metadata row exposing `filename`, `common_name`,
      `scientific_name` and `filepath` attributes.

    The clip is limited to the first `duration` seconds. Playback, the waveform
    plot and the spectrogram all interpret the samples at `sample_rate`, so the
    signal is resampled first whenever the loader reports a different rate.
    """
    caption = f'Id: {row.filename} | Name: {row.common_name} | Sci.Name: {row.scientific_name}'

    audio, sr = load_audio(row.filepath)
    if sr != sample_rate:
        # Trim before resampling so long recordings stay cheap, then bring the
        # clip to the rate every step below assumes. Otherwise it would play
        # at the wrong speed/pitch and the axes would be mislabeled.
        audio = audio[:duration * sr]
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    audio = audio[:audio_len]
    spec = get_spectrogram(audio)

    # Audio output widget
    audio_output = widgets.Output()
    with audio_output:
        display(ipd.Audio(audio, rate=sample_rate))

    # Plot output widget
    plot_output = widgets.Output()
    with plot_output:
        fig, ax = plt.subplots(2, 1, figsize=(12, 6), sharex=True, tight_layout=True)
        fig.suptitle(caption)

        # Plot waveform
        lid.waveshow(audio, sr=sample_rate, ax=ax[0], color='b')

        # Plot spectrogram (hop_length / n_fft mirror the values used in get_spectrogram)
        lid.specshow(spec, sr=sample_rate, hop_length=512, n_fft=2048,
                     fmin=fmin, fmax=fmax, x_axis='time', y_axis='mel',
                     cmap='coolwarm', ax=ax[1])

        # The shared x-axis is labeled on the bottom panel only
        ax[0].set_xlabel('')
        plt.show()

    # Display side-by-side
    display(widgets.HBox([audio_output, plot_output]))


In [10]:
## Show three randomly drawn recordings (one independent draw per iteration)
for _ in range(3):
    random_row = df.sample(n=1).iloc[0]
    display_audio(random_row)

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))

### Build a decoder to parse audio files into spectrograms 🚀

**The build_decoder() function constructs a decoder that can process audio files into spectrograms.
It loads, normalizes, and converts the audio into a Mel-spectrogram.
If with_labels=True, it also converts labels into one-hot vectors.
The output is an RGB-like spectrogram image that can be used as input to CNNs.**
[Tensorflow Documentation here](https://www.tensorflow.org/io/api_docs/python/tfio/audio/spectrogram)

In [11]:
# Spectrogram image geometry and pipeline settings
img_size = [128, 384]  # [height, width] of the spectrogram image

# Stride between spectrogram frames: audio_len samples are split into (width - 1) hops
hop_length = audio_len // (img_size[1] - 1)

# FFT length used when computing the spectrogram.
# NOTE(review): 2028 is not a power of two and differs from the n_fft=2048 used
# in get_spectrogram -- possibly a typo for 2048. Confirm before changing,
# since it alters the features the model sees.
nfft = 2028

batch_size = 64  # Number of samples per training batch

def build_decoder(with_labels=True, dim=1024):
    """
    Create a function that turns an audio file path into a 3-channel spectrogram image.

    Parameters:
    - with_labels (bool): When True, the returned function takes `(path, label)`
      and returns `(image, one_hot_label)`; otherwise it takes `path` only and
      returns the image.
    - dim (int): Number of audio samples each clip is cropped or padded to
      before the spectrogram is computed.
      NOTE(review): the final reshape to `[*img_size, 3]` only succeeds when
      `dim` produces a spectrogram with exactly that many elements for the
      configured `hop_length`; callers presumably pass `dim=audio_len` --
      confirm, the default of 1024 looks unlikely to satisfy it.

    Returns:
    - `decode_with_labels` if `with_labels` is truthy, else `decode`.
    """

    def get_audio(filepath):
        """Read a Vorbis (.ogg) file and return its first channel as a float32 tensor."""
        raw = tf.io.read_file(filepath)
        waveform = tf.cast(tfio.audio.decode_vorbis(raw), tf.float32)

        # The decoded tensor is indexed as (samples, channels): keep channel 0
        # when there is more than one, then drop the channel axis.
        if tf.shape(waveform)[1] > 1:
            waveform = waveform[..., 0:1]
        return tf.squeeze(waveform, axis=-1)

    def crop_or_pad(audio, target_len, pad_mode="constant"):
        """Return `audio` with exactly `target_len` samples via random padding or cropping."""
        n_samples = tf.shape(audio)[0]
        gap = abs(target_len - n_samples)  # Samples missing or in excess

        if n_samples < target_len:
            # Too short: split the missing samples randomly between front and back.
            front = tf.random.uniform([], maxval=gap, dtype=tf.int32)
            back = gap - front
            audio = tf.pad(audio, paddings=[[front, back]], mode=pad_mode)

        elif n_samples > target_len:
            # Too long: keep a window of `target_len` samples at a random offset.
            start = tf.random.uniform([], maxval=gap, dtype=tf.int32)
            audio = audio[start : (start + target_len)]

        # Pin the static shape for downstream ops
        return tf.reshape(audio, [target_len])

    def apply_preproc(spec):
        """Standardize the spectrogram, then min-max scale it."""
        # Zero mean / unit variance; only centered when the std is exactly 0
        centered = spec - tf.math.reduce_mean(spec)
        std = tf.math.reduce_std(spec)
        spec = tf.where(tf.math.equal(std, 0), centered, centered / std)

        # Min-max scaling; only shifted when the value range is exactly 0
        min_val = tf.math.reduce_min(spec)
        value_range = tf.math.reduce_max(spec) - min_val
        shifted = spec - min_val
        return tf.where(tf.math.equal(value_range, 0), shifted, shifted / value_range)

    def get_target(target):
        """Turn a scalar class id into a float32 one-hot vector of length `num_classes`."""
        one_hot = tf.one_hot(tf.reshape(target, [1]), num_classes)
        one_hot = tf.cast(one_hot, tf.float32)
        return tf.reshape(one_hot, [num_classes])

    def decode(path):
        """Load one audio file and return its spectrogram as an image tensor."""
        # Fixed-length mono waveform
        waveform = crop_or_pad(get_audio(path), dim)

        # Mel-spectrogram layer configured from the notebook-level settings
        mel_layer = keras.layers.MelSpectrogram(
            num_mel_bins=img_size[0],
            fft_length=nfft,
            sequence_stride=hop_length,
            sampling_rate=sample_rate,
        )
        spec = apply_preproc(mel_layer(waveform))

        # Repeat the single channel three times so image models accept the input
        image = tf.tile(spec[..., None], [1, 1, 3])
        return tf.reshape(image, [*img_size, 3])

    def decode_with_labels(path, label):
        """Return the spectrogram image for `path` together with its one-hot label."""
        return decode(path), get_target(label)

    if with_labels:
        return decode_with_labels
    return decode
