## Import Libraries 📚

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch"

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Set interactive backend
%matplotlib inline


cmap = mpl.cm.get_cmap('coolwarm')

  cmap = mpl.cm.get_cmap('coolwarm')


### Data Exploration 💥💥

In [2]:
DATASET_PATH = 'content/birdclef-2024'

#### View the species name and construct a dictionary to hold their values 📃

In [3]:
class_names = sorted(os.listdir(f"{DATASET_PATH}/train_audio/"))
num_classes = len(class_names)
class_labels = list(range(num_classes))
label2name = dict(zip(class_labels, class_names))
name2label = {v:k for k,v in label2name.items()}

In [4]:
## Print out the first 5 items in the label2name and name2label dictionaries
print({k: label2name[k] for k in list(label2name)[:5]})
print({k: name2label[k] for k in list(name2label)[:5]})

{0: 'asbfly', 1: 'ashdro1', 2: 'ashpri1', 3: 'ashwoo2', 4: 'asikoe2'}
{'asbfly': 0, 'ashdro1': 1, 'ashpri1': 2, 'ashwoo2': 3, 'asikoe2': 4}


### Load the dataframe 🔃

In [5]:
df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
df['filepath'] = DATASET_PATH + '/train_audio/' + df.filename
df['target'] = df.primary_label.map(name2label)
df['filename'] = df.filepath.map(lambda x: x.split('/')[-1])
df['xc_id'] = df.filepath.map(lambda x: x.split('/')[-1].split('.')[0])

## display a few rows
df.sample(5)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,filepath,target,xc_id
11566,grejun2,[],['call'],18.446,73.4266,Gallus sonneratii,Gray Junglefowl,Rajgopal Patil,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/574339,XC574339.ogg,content/birdclef-2024/train_audio/grejun2/XC57...,68,XC574339
17302,litegr,[],[''],22.4939,114.0404,Egretta garzetta,Little Egret,Geoff Carey,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/795967,XC795967.ogg,content/birdclef-2024/train_audio/litegr/XC795...,106,XC795967
8716,eaywag1,[],['nocturnal flight call'],39.3131,-9.2168,Motacilla flava,Western Yellow Wagtail,Helder Cardoso,Creative Commons Attribution-NonCommercial-Sha...,1.0,https://www.xeno-canto.org/586874,XC586874.ogg,content/birdclef-2024/train_audio/eaywag1/XC58...,53,XC586874
22882,whcbar1,[],['call'],13.3193,74.8111,Psilopogon viridis,White-cheeked Barbet,ARUN PRABHU,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/354627,XC354627.ogg,content/birdclef-2024/train_audio/whcbar1/XC35...,173,XC354627
10083,forwag1,['comtai1'],['call'],13.6596,79.4575,Dendronanthus indicus,Forest Wagtail,SUYASH SAWANT,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/548380,XC548380.ogg,content/birdclef-2024/train_audio/forwag1/XC54...,58,XC548380


### Function to retreive an audio file 🎵
**librosa is a python package for music and audio analysis. It provides the building blocks necessary to create music information retrieval systems**
[Documentation here](https://librosa.org/doc/latest/index.html)

In [6]:
## Load the audio as a waveform `y`
# Store the sampling rate as `sr`
def load_audio(filepath):
    audio, sr = librosa.load(filepath)
    return audio, sr

### Get the audio spectrogram 🌊. 
**A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs, voiceprints, or voicegrams**

In [7]:
# Define the sampling rate of the audio signal (32 kHz)
sample_rate = 32000

# Define the maximum frequency to include in the spectrogram (16 kHz)
fmax = 16000

# Define the minimum frequency to include in the spectrogram (20 Hz)
fmin = 20

# Function to compute the Mel-spectrogram of an audio signal
def get_spectrogram(audio):
    # Compute the Mel-spectrogram
    spec = librosa.feature.melspectrogram(
        y=audio,  # Input audio signal
        sr=sample_rate,  # Sampling rate of the audio
        n_mels=256,  # Number of Mel bands (frequency bins)
        n_fft=2048,  # Size of the FFT window (determines frequency resolution)
        hop_length=512,  # Number of samples between successive frames (determines time resolution)
        fmax=fmax,  # Maximum frequency to include in the spectrogram
        fmin=fmin,  # Minimum frequency to include in the spectrogram
    )

    # Convert the power spectrogram to decibel (dB) scale
    # This makes the values more perceptually meaningful
    spec = librosa.power_to_db(spec, ref=1.0)  # ref=1.0 is the reference value for dB calculation

    # Normalize the spectrogram to the range [0, 1]
    min_ = spec.min()  # Minimum value in the spectrogram
    max_ = spec.max()  # Maximum value in the spectrogram
    if max_ != min_:  # Avoid division by zero if the spectrogram is constant
        spec = (spec - min_) / (max_ - min_)  # Normalize using min-max scaling

    # Return the normalized Mel-spectrogram
    return spec

### Display a few audio files with spectograms and their associated df details ⚡

In [11]:
duration = 15
audio_len = duration * sample_rate
def display_audio(row):
    caption = f'Id: {row.filename} | Name: {row.common_name} | Sci.Name: {row.scientific_name}'
    
    audio, sr = load_audio(row.filepath)
    audio = audio[:audio_len]
    spec = get_spectrogram(audio)
    
    # Audio output widget
    audio_output = widgets.Output()
    with audio_output:
        display(ipd.Audio(audio, rate=sample_rate))
    
    # Plot output widget
    plot_output = widgets.Output()
    with plot_output:
        fig, ax = plt.subplots(2, 1, figsize=(12, 6), sharex=True, tight_layout=True)
        fig.suptitle(caption)
        
        # Plot waveform
        lid.waveshow(audio, sr=sample_rate, ax=ax[0], color='b')
        
        # Plot spectrogram
        lid.specshow(spec, sr=sample_rate, hop_length=512, n_fft=2048,
                     fmin=fmin, fmax=fmax, x_axis='time', y_axis='mel', 
                     cmap='coolwarm', ax=ax[1])
        
        ax[0].set_xlabel('')
        plt.show()

    # Display side-by-side
    display(widgets.HBox([audio_output, plot_output]))


In [12]:
## Display a few audio samples
for i in range(3):
    display_audio(df.sample(1).iloc[0])

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))

HBox(children=(Output(), Output()))