# BirdCLEF 2023 🐦
> Identify bird calls in soundscapes

<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/44224/logos/header.png?t=2023-03-06-18-30-53">

# Methodology  🎯
* This notebook will demonstrate **Bird Call Identification** with `TensorFlow`. Specifically, this notebook shows inference for [BirdCLEF23: Pretraining is All you Need [Train]](https://www.kaggle.com/awsaf49/birdclef23-pretraining-is-all-you-need-train/), where the model is initially **pretrained** on the BirdCLEF 2021 & 2022 datasets, and then **fine-tuned** on the BirdCLEF 2023 dataset.
* This notebook differs from the previous [notebook](https://www.kaggle.com/awsaf49/birdclef23-effnet-fsr-cutmixup-train/): there the model was fed spectrograms, whereas the current notebook feeds raw audio to the model.
* Raw audio is processed using pre-processing layers from the [tensorflow-extra](https://github.com/awsaf49/tensorflow_extra) library.
* This notebook will use `5sec` audio recordings as per the requirements, although training was done on much longer recordings. Dynamic input shape is utilized to infer at a different resolution.
* This notebook will consider one recording as one batch which will speed up the processing.

# Notebooks 📓

* Pretraining is All you Need
    * Train: [BirdCLEF23: Pretraining is All you Need [Train]](https://www.kaggle.com/awsaf49/birdclef23-pretraining-is-all-you-need-train/)
    * Infer: [BirdCLEF23: Pretraining is All you Need [Infer]](https://www.kaggle.com/awsaf49/birdclef23-pretraining-is-all-you-need-infer/)
    
    
* EffNet + FSR + CutMixUp
    * Train: [BirdCLEF23: EffNet + FSR + CutMixUp [Train]](https://www.kaggle.com/awsaf49/birdclef23-effnet-fsr-cutmixup-train/)
    * Infer: [BirdCLEF23: EffNet + FSR + CutMixUp [Infer]](https://www.kaggle.com/awsaf49/birdclef23-effnet-fsr-cutmixup-infer/)


# Update 🆕
* `v4`:
    * BirdCLEF 2020 & Xeno-Canto Extend Dataset added

# Install Libraries 🛠

In [None]:
import sys, os
sys.path.append('/kaggle/input/efficientnet-keras-dataset/efficientnet_kaggle')
!pip install -q /kaggle/input/tensorflow-extra-lib-ds/tensorflow_extra-1.0.2-py3-none-any.whl --no-deps

# Import Libraries 📚

In [None]:
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)
import os
import pandas as pd
import numpy as np
import random
from glob import glob
from tqdm import tqdm
tqdm.pandas()
import gc
import librosa
import sklearn
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import librosa.display as lid
import IPython.display as ipd

import tensorflow as tf
tf.config.optimizer.set_jit(True) # enable xla for speed up
import tensorflow_io as tfio
import tensorflow.keras.backend as K

import efficientnet.tfkeras as efn
import tensorflow_extra as tfe

## Library Version

In [None]:
# Report the version of each core library, one "<label>: <version>" line per library
for _label, _module in (
    ('np', np),
    ('pd', pd),
    ('sklearn', sklearn),
    ('librosa', librosa),
    ('tf', tf),
    ('tfio', tfio),
):
    print(f'{_label}:', _module.__version__)

# Configuration ⚙️

In [None]:
class CFG:
    """Static settings for this notebook: runtime, input size, audio/STFT and labels.

    All values are plain class attributes computed once, when the class body runs.
    """

    # --- Runtime ---
    debug = False
    verbose = 0
    device = 'CPU'
    seed = 42

    # --- Input image size and batch size ---
    img_size = [128, 384]
    batch_size = 16
    infer_bs = 2
    tta = 1
    drop_remainder = True

    # --- STFT parameters ---
    sample_rate = 32000
    duration = 5  # duration for test
    train_duration = 10
    downsample = 1
    # Number of samples in one test clip
    audio_len = sample_rate * duration
    # NOTE(review): nfft (2028) differs from window (2048) — possibly a typo;
    # confirm against the training configuration before changing it.
    nfft = 2028
    window = 2048
    # Hop derived from the training clip length and the image width (img_size[1])
    hop_length = (train_duration * sample_rate) // (img_size[1] - 1)
    fmin = 20
    fmax = 16000
    normalize = True

    # --- Data preprocessing settings ---
    # Class names are the sorted entries of the training audio directory
    class_names = sorted(os.listdir('/kaggle/input/birdclef-2023/train_audio/'))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    # Integer label <-> class name lookups
    label2name = dict(enumerate(class_names))
    name2label = {name: label for label, name in label2name.items()}

    target_col = ['target']
    tab_cols = ['filename', 'common_name', 'rate']

# Reproducibility ♻️
Sets the random seed so that each run produces similar results.

In [None]:
# Seed the run from CFG.seed so that repeated runs give similar results
tf.keras.utils.set_random_seed(CFG.seed)

# Set Up Device  📱
The following code automatically detects the hardware (TPU, TPU-VM or GPU) and falls back to CPU when none is found.

In [None]:
def get_device():
    """Detect the available accelerator and build a matching strategy.

    A TPU connection is attempted first (with the ``'local'`` address when
    ``CFG.device`` is ``'TPU-VM'``). If that fails, the logical GPUs are used
    through a mirrored strategy; with no GPU, the default strategy is returned.

    Returns:
        tuple: ``(strategy, device, tpu)`` where ``strategy`` is the
        ``tf.distribute`` strategy, ``device`` is ``CFG.device`` when a TPU
        was connected and ``'GPU'`` / ``'CPU'`` otherwise, and ``tpu`` is the
        connected cluster resolver, or ``None`` when no TPU is in use.
    """
    # Check TPU category
    tpu_address = 'local' if CFG.device == 'TPU-VM' else None
    try:
        # Connect to TPU and build its strategy
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu=tpu_address)
        strategy = tf.distribute.TPUStrategy(tpu)
    except Exception:
        # Any failure here means "no usable TPU". Exception (rather than a bare
        # except) keeps KeyboardInterrupt/SystemExit from being swallowed.
        tpu = None
    else:
        print(f'> Running on {CFG.device} ', tpu.master(), end=' | ')
        print('Num of TPUs: ', strategy.num_replicas_in_sync)
        return strategy, CFG.device, tpu

    # If TPU is not available, detect GPUs
    gpus = tf.config.list_logical_devices('GPU')
    ngpu = len(gpus)
    if ngpu:
        # Set GPU strategy (single-GPU or multi-GPU)
        strategy = tf.distribute.MirroredStrategy(gpus)
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", ngpu)
        device = 'GPU'
    else:
        # If no GPUs are available, use CPU
        print("> Running on CPU")
        strategy = tf.distribute.get_strategy()
        device = 'CPU'
    return strategy, device, tpu

In [None]:
# Initialize GPU/TPU/TPU-VM
# get_device() returns (strategy, device name, tpu); the detected device name
# replaces the configured CFG.device.
strategy, CFG.device, tpu = get_device()
# Number of replicas the strategy keeps in sync, stored on CFG as a new attribute
CFG.replicas = strategy.num_replicas_in_sync

# Dataset Path 📁

In [None]:
# Root directory of the competition data
BASE_PATH = '/kaggle/input/birdclef-2023'
# GCS_PATH is simply an alias of the local path in this notebook
GCS_PATH = BASE_PATH

# Meta Data 📖
* **test_soundscapes/** - directory contains $\sim 200$ recordings to be used for scoring when a notebook is submitted. Without submission only $1$ recording is accessible.  All recordings are $10$ minutes long and in `.ogg` audio format.
* **sample_submission.csv** - is the valid sample submission.
    * `row_id`: A slug of [soundscape_id]_[end_time] for the prediction.
    * `[bird_id]`: There are $264$ bird ID columns. The probability of the presence of each bird for each row needs to be predicted.

In [None]:
# Collect the test soundscape recordings. glob() returns paths in an arbitrary,
# filesystem-dependent order, so sort them to keep the row order reproducible.
test_paths = sorted(glob('/kaggle/input/birdclef-2023/test_soundscapes/*ogg'))
test_df = pd.DataFrame(test_paths, columns=['filepath'])
# Recording id: the file name without its directory and without '.ogg'
test_df['filename'] = test_df.filepath.map(lambda x: x.split('/')[-1].replace('.ogg',''))
test_df.head()

In [None]:
# Sanity check: confirm the first test recording is reachable through tf.io.gfile
# NOTE(review): .iloc[0] presumably fails when no test files were found — confirm
tf.io.gfile.exists(test_df.filepath.iloc[0])

# Data Loader 🍚

In [None]:
def load_audio(filepath, sr=32000, normalize=True):
    """Load an audio file as a flat float32 tensor at the requested sample rate.

    Args:
        filepath: Path of the audio file to read.
        sr: Target sample rate. The file is read at its native rate and
            resampled only when that rate differs from ``sr``.
        normalize: Currently unused; kept so existing callers keep working.

    Returns:
        tf.Tensor: the flattened float32 samples.
    """
    # sr=None keeps the file's native sample rate
    audio, orig_sr = librosa.load(filepath, sr=None)
    if sr != orig_sr:
        # Resample the loaded signal; the rates are passed by keyword
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr)
    audio = audio.astype('float32').ravel()
    return tf.convert_to_tensor(audio)

@tf.function(jit_compile=True)
def MakeFrame(audio, duration=5, sr=32000):
    """Cut ``audio`` into back-to-back frames of ``duration`` seconds.

    Frame length and frame step are both ``int(duration * sr)`` samples, so
    consecutive frames do not overlap. ``pad_end=True`` is passed to
    ``tf.signal.frame``.

    Args:
        audio: Signal to split.
        duration: Frame duration, multiplied by ``sr`` to get samples.
        sr: Sample rate of ``audio``.

    Returns:
        The framed signal produced by ``tf.signal.frame``.
    """
    samples_per_frame = int(duration * sr)
    return tf.signal.frame(
        audio,
        frame_length=samples_per_frame,
        frame_step=samples_per_frame,
        pad_end=True,
    )

> **Note**: `librosa.util.frame()` provides comparable framing for NumPy arrays.


# EDA 🎨

## Utility

In [None]:
def display_audio(row):
    """Play and plot the leading ``CFG.audio_len`` samples of one recording.

    Args:
        row: Record exposing ``filename`` and ``filepath`` attributes
            (for example one row of ``test_df``).
    """
    title = f'Id: {row.filename}'
    # Load the recording and keep only the fixed-length head
    clip = load_audio(row.filepath)[:CFG.audio_len]
    samples = clip.numpy()

    # Audio player
    print("# Audio:")
    display(ipd.Audio(samples, rate=CFG.sample_rate))

    # Waveform plot
    print('# Visualization:')
    plt.figure(figsize=(12, 3))
    plt.title(title)
    lid.waveshow(samples, sr=CFG.sample_rate)
    plt.xlabel('')
    plt.show()

## Check

In [None]:
# Preview the first test recording (audio player and waveform)
display_audio(test_df.iloc[0])

# Inference Configs 🔧

In [None]:
# Folder holding the trained checkpoints
CKPT_DIR = '/kaggle/input/birdclef23-pretraining-is-all-you-need-train-ds'
# Every fold checkpoint found in that folder, in sorted order
CKPT_PATHS = sorted(glob(f'{CKPT_DIR}/fold-*h5'))
print("Checkpoints: ", CKPT_PATHS)
# Keep all models in memory so each one is loaded only once
CKPTS = [
    tf.keras.models.load_model(path, compile=False)
    for path in tqdm(CKPT_PATHS, desc="Loading ckpts ")
]
# How many of the loaded checkpoints take part in inference
NUM_CKPTS = 1

# Submit or Interactive mode: decided by the row count of the sample submission
# (presumably it has exactly 3 rows outside of a scoring run — confirm)
SUBMIT = len(pd.read_csv('/kaggle/input/birdclef-2023/sample_submission.csv')) != 3

# Inference 🧪

In [None]:
# Start stopwatch
tick = time.time()

# Models that actually take part in the ensemble. The average below divides by
# this count (not by the number of loaded checkpoints), so scores are not
# scaled down when NUM_CKPTS is smaller than len(CKPTS).
models = CKPTS[:NUM_CKPTS]

# Row ids of every frame, in prediction order
ids = []
# Per-recording prediction blocks; concatenated once after the loop instead of
# re-copying the growing array on every iteration
pred_blocks = []

# Iterate over each audio file in the test dataset
for filepath in tqdm(test_df.filepath.tolist(), 'test '):
    # Extract the filename without the extension
    filename = filepath.split('/')[-1].replace('.ogg','')

    # Load audio and split it into frames; one recording is one batch
    audio = load_audio(filepath)
    chunks = MakeFrame(audio, duration=CFG.duration, sr=CFG.sample_rate)

    # Average the predictions of the selected models for all frames
    chunk_preds = np.zeros(shape=(len(chunks), CFG.num_classes), dtype=np.float32)
    for model in models:
        chunk_preds += model(chunks, training=False).numpy() / len(models)

    # Row id = "<filename>_<end time of the frame in seconds>"
    ids += [f'{filename}_{(frame_id+1)*CFG.duration}' for frame_id in range(len(chunks))]
    pred_blocks.append(chunk_preds)

# Stack all recordings; keep an empty (0, num_classes) array when nothing was processed
if pred_blocks:
    preds = np.concatenate(pred_blocks, axis=0)
else:
    preds = np.empty(shape=(0, CFG.num_classes), dtype='float32')

# Stop stopwatch
tock = time.time()

# Submission 📮

In [None]:
# Submit prediction
# One row per predicted frame, keyed by the row ids built during inference
pred_df = pd.DataFrame(ids, columns=['row_id'])
# Add one score column per class, named after CFG.class_names.
# Assumes the columns of preds follow the same class order — TODO confirm
# against the label order used in training.
pred_df.loc[:, CFG.class_names] = preds
# Write the submission file to the working directory without the index column
pred_df.to_csv('submission.csv',index=False)
pred_df.head()

## Check Submission

In [None]:
if not SUBMIT:
    # Interactive run only: print the top-scoring class of every row.
    # Column 0 is row_id, so the scores start at column 1.
    pred_labels = pred_df.iloc[:, 1:].to_numpy().argmax(axis=1)
    pred_classes = [CFG.label2name[label] for label in pred_labels]
    print(pred_classes)

# Submission Time ⏰
Estimated time to complete the submission.
> **Note**: There are approximately $200$ recordings in the test data.

In [None]:
# Estimate the full submission time: average wall-clock time per locally
# processed recording, scaled to the ~200 recordings of the test data.
# max(..., 1) avoids dividing by zero when no recording was processed.
n_local = max(len(test_df), 1)
sub_time = (tock - tick) * 200 / n_local
# Format by hand so that estimates above 24 hours do not wrap around
hours, remainder = divmod(int(sub_time), 3600)
minutes, seconds = divmod(remainder, 60)
sub_time = f"{hours:02d} hr: {minutes:02d} min : {seconds:02d} sec"
print(f">> Time for submission: ~ {sub_time}")