## Import Libraries 📚

In [2]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch"

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl

# Colormap used for plots. `mpl.cm.get_cmap` is deprecated (matplotlib 3.7) and
# removed in newer releases; `plt.get_cmap` returns the same registered colormap.
cmap = plt.get_cmap('coolwarm')

  from .autonotebook import tqdm as notebook_tqdm
  cmap = mpl.cm.get_cmap('coolwarm')


### Data Exploration 💥💥

In [3]:
# Root folder of the dataset; the cells below read `train_audio/` and
# `train_metadata.csv` from here.
DATASET_PATH = "content/birdclef-2024"

#### View the species names and build dictionaries mapping labels to names and back 📃

In [9]:
# Each entry under train_audio/ is treated as one class; sorting keeps the
# integer label assignment stable across runs.
class_names = sorted(os.listdir(f"{DATASET_PATH}/train_audio/"))
num_classes = len(class_names)
class_labels = list(range(num_classes))

# Integer label -> class name, and the inverse lookup.
label2name = dict(enumerate(class_names))
name2label = {name: label for label, name in label2name.items()}

In [11]:
## Preview the first 5 entries of each lookup dictionary
print(dict(list(label2name.items())[:5]))
print(dict(list(name2label.items())[:5]))

{0: 'asbfly', 1: 'ashdro1', 2: 'ashpri1', 3: 'ashwoo2', 4: 'asikoe2'}
{'asbfly': 0, 'ashdro1': 1, 'ashpri1': 2, 'ashwoo2': 3, 'asikoe2': 4}


### Load the dataframe 🔃

In [6]:
# Read the training metadata and derive the helper columns in one chain.
# Later keyword arguments to `assign` can use columns created by earlier ones.
df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv').assign(
    # Full path to the audio file
    filepath=lambda frame: DATASET_PATH + '/train_audio/' + frame.filename,
    # Integer class label of the primary species
    target=lambda frame: frame.primary_label.map(name2label),
    # Bare file name (last path component)
    filename=lambda frame: frame.filepath.map(lambda path: path.split('/')[-1]),
    # File name up to its first dot
    xc_id=lambda frame: frame.filepath.map(lambda path: path.split('/')[-1].split('.')[0]),
)

## show a random handful of rows
df.sample(5)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,filepath,target,xc_id
9526,eucdov,[],[''],48.7589,2.4047,Streptopelia decaocto,Eurasian Collared-Dove,Pablo Bolaños Sittler,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/816462,XC816462.ogg,content/birdclef-2024/train_audio/eucdov/XC816...,55,XC816462
17384,litgre1,[],['call'],47.9801,10.9586,Tachybaptus ruficollis,Little Grebe,Beatrix Saadi-Varchmin,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/326376,XC326376.ogg,content/birdclef-2024/train_audio/litgre1/XC32...,107,XC326376
8317,crseag1,[],['song'],,,Spilornis cheela,Crested Serpent-Eagle,Heru Cahyono,Creative Commons Attribution-NonCommercial-Sha...,1.5,https://www.xeno-canto.org/144914,XC144914.ogg,content/birdclef-2024/train_audio/crseag1/XC14...,50,XC144914
7548,comsan,[],['nocturnal flight call'],42.3285,2.5512,Actitis hypoleucos,Common Sandpiper,Robert Manzano,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/590100,XC590100.ogg,content/birdclef-2024/train_audio/comsan/XC590...,44,XC590100
1729,bkcbul1,[],[''],11.8486,75.9394,Rubigula gularis,Flame-throated Bulbul,JISHNU KIZHAKKILLAM,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/783987,XC783987.ogg,content/birdclef-2024/train_audio/bkcbul1/XC78...,11,XC783987


### Function to retrieve an audio file 🎵
**librosa is a Python package for music and audio analysis. It provides the building blocks necessary to create music information retrieval systems.**
[Documentation here](https://librosa.org/doc/latest/index.html)

In [12]:
## Load the audio as a waveform `audio`
# and return it together with its sampling rate `sr`
def load_audio(filepath, sr=32000):
    """Load an audio file as a waveform at a known sampling rate.

    `librosa.load` resamples to 22050 Hz when no rate is given, but the
    spectrogram code in this notebook builds its Mel filterbank for 32 kHz
    audio. Loading at 32 kHz by default keeps the two consistent.

    Args:
        filepath: Path to the audio file to read.
        sr: Target sampling rate in Hz. Pass ``None`` to keep the file's
            native sampling rate instead of resampling.

    Returns:
        A tuple ``(audio, sr)``: the waveform returned by librosa and the
        sampling rate it is actually stored at.
    """
    audio, sr = librosa.load(filepath, sr=sr)
    return audio, sr

### Get the audio spectrogram 🌊
**A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs, voiceprints, or voicegrams.**

In [14]:
# Sampling rate of the audio signal, in Hz (32 kHz)
sample_rate = 32_000

# Frequency band kept in the spectrogram, in Hz: upper bound (16 kHz) ...
fmax = 16_000

# ... and lower bound (20 Hz)
fmin = 20

# Function to compute the Mel-spectrogram of an audio signal
def get_spectrogram(audio, n_mels=256, n_fft=2048, hop_length=512):
    """Compute a log-scaled Mel-spectrogram rescaled to the range [0, 1].

    Reads the module-level settings ``sample_rate``, ``fmin`` and ``fmax``.

    Args:
        audio: Waveform passed to librosa as ``y``. It is assumed to be
            sampled at ``sample_rate``; this is not checked here.
        n_mels: Number of Mel bands (frequency bins).
        n_fft: Size of the FFT window (determines frequency resolution).
        hop_length: Number of samples between successive frames
            (determines time resolution).

    Returns:
        The dB-scaled Mel-spectrogram after min-max scaling. If every value
        is identical (for example a silent clip), an all-zero array of the
        same shape is returned so the output always stays within [0, 1].
    """
    # Compute the Mel-scaled power spectrogram
    spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        fmax=fmax,
        fmin=fmin,
    )

    # Convert power to decibels; ref=1.0 is the reference value for the dB scale
    spec = librosa.power_to_db(spec, ref=1.0)

    min_ = spec.min()
    max_ = spec.max()

    # A constant spectrogram cannot be min-max scaled (division by zero).
    # Return zeros rather than raw dB values, which would leave [0, 1].
    if max_ == min_:
        return np.zeros_like(spec)

    # Min-max scale to [0, 1]
    return (spec - min_) / (max_ - min_)