# BirdCLEF 2024 [Inference]

## Features
- PyTorch `Dataset` and `DataLoader`
- Uses PyTorch Lightning to build the model
- Data slicing is based on @MARK WIJKHUIZEN's [notebook](https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-inference).

# Import Packages

In [1]:
%%capture
# !pip install torch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0
# !pip install pytorch_lightning==2.1
# !pip install pandas librosa opencv-python matplotlib  #cupy-cuda110 
# !pip install -U albumentations

In [2]:
import time

s = time.time()

In [3]:
import re
import os
import gc
import sys
import cv2
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import librosa
from scipy import signal as sci_signal
from tqdm import tqdm
import torch
from torch import nn
from torchvision.models import efficientnet

import albumentations as albu

import pytorch_lightning as pl
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# Configuration

In [4]:
class CONFIG:
    """Static settings read by the preprocessing and inference cells.

    NOTE(review): ``BirdModel.configure_optimizers`` reads ``CONFIG.lr``,
    ``CONFIG.weight_decay`` and ``CONFIG.epochs``, none of which are defined
    here. That method is not called during inference, but it would raise
    ``AttributeError`` if this class were used for training as-is.
    """

    # == GENERAL ==
    seed = 42                           # random seed passed to pl.seed_everything
    device = 'cpu'                         # device the loaded models are moved to

    # == DATA ==
    # preprocessed_data = '../../preprocessed_data/imgs_v0/'                  # Path where processed data would be stored (keep it in .gitignore so it is not pushed to the repo)
    checkpoint_dir = '../chpks/effnet_3fold_rgb'   # Checkpoints path (keep it in .gitignore so it is not pushed to the repo)
    data_dir_2024 = '../../data/2024'# root folder of the 2024 competition data
    sr = 32000                              # sampling rate used when loading audio
    n_fft = 1095                            # FFT size (scipy `nfft` / librosa `n_fft`)
    win_len = 412                           # window length, passed as scipy `nperseg`
    hop_len = 100                           # passed as scipy `noverlap` (overlap, not hop) and as librosa `hop_length`
    min_freq = 40                           # min frequency kept in the spectrogram
    max_freq = 15000                        # max frequency kept in the spectrogram
    n_mels = 128                            # number of mel filters

    # == MODEL ==
    model = 'efficientnet_b0'               # model architecture name understood by EffNet

    # == DATASET ==
    batch_size = 64                         # batch size of each step (not referenced in the visible cells)
    n_workers = 4                           # number of workers (not referenced in the visible cells)

# Seed the random number generators through Lightning; workers=True also
# asks Lightning to seed dataloader worker processes.
print('fix seed')
pl.seed_everything(CONFIG.seed, workers=True)

Seed set to 42


fix seed


42

In [5]:
# Class labels are the entry names found in <data_dir_2024>/train_audio.
# Sorting gives every label a reproducible integer id.
label_list = sorted(os.listdir(os.path.join(CONFIG.data_dir_2024, 'train_audio')))
label_id_list = list(range(len(label_list)))
# Lookup tables in both directions: name -> id and id -> name.
label2id = {name: idx for idx, name in zip(label_id_list, label_list)}
id2label = {idx: name for idx, name in zip(label_id_list, label_list)}

# Dataset & Dataloader

## Pre-Processing

In [6]:
def opposite_melspectrogram(data):
    """Project a log-power STFT onto a frequency-mirrored mel filter bank.

    The mel filter matrix is built with librosa and then reversed along its
    frequency-bin axis before being applied, so the filters that normally
    sit at the low end of the spectrum are applied to the high end and vice
    versa.

    Args:
        data: 1-D audio signal accepted by ``librosa.stft``.

    Returns:
        The projected spectrogram after ``librosa.amplitude_to_db`` with
        ``ref=np.max`` (one row per mel filter, one column per STFT frame).
    """
    n_fft = CONFIG.n_fft
    hop_length = CONFIG.hop_len
    rate = CONFIG.sr
    # Use the shared setting instead of a second hard-coded copy of 128.
    n_mels = CONFIG.n_mels

    # Mel filter bank, one row per filter and one column per FFT bin.
    mel_window = librosa.filters.mel(sr=rate, n_fft=n_fft, n_mels=n_mels, htk=True)

    # Reverse the order of the FFT-bin columns (mirror along frequency).
    mel_window = mel_window[:, ::-1]

    # Log power spectrum; the small constant keeps log() finite on silence.
    stft_magnitude = np.abs(librosa.stft(data, n_fft=n_fft, hop_length=hop_length))
    log_power = np.log(stft_magnitude ** 2 + 1e-20)

    # Apply the mirrored filter bank to the log power spectrum.
    S = np.dot(mel_window, log_power)

    spec_data = librosa.amplitude_to_db(S, ref=np.max)

    return spec_data

def _min_max_normalize(values):
    """Shift *values* so the minimum is 0 and scale so the maximum is 1.

    A constant input has zero range; it is returned as all zeros instead of
    being divided by zero (which would fill the array with NaN).
    """
    shifted = values - values.min()
    peak = shifted.max()
    if peak > 0:
        shifted = shifted / peak
    return shifted


def oog2spec_via_scipy(audio_data):
    """Turn a waveform into three min/max-normalised spectrogram views.

    Args:
        audio_data: 1-D audio signal sampled at ``CONFIG.sr``.

    Returns:
        A tuple ``(spec_data, melspec, oposite_melspec)``:

        * ``spec_data`` - log10 spectrogram restricted to the bins between
          ``CONFIG.min_freq`` and ``CONFIG.max_freq``;
        * ``melspec`` - mel filter bank applied to the log10 spectrogram;
        * ``oposite_melspec`` - the same filter bank mirrored along the
          frequency axis and applied to the log10 spectrogram.

        Each array is independently rescaled to the range [0, 1].
    """
    # HANDLE NaNs: replace them with the mean of the valid samples, or use
    # silence when every sample is NaN. The cleaned signal is what gets
    # analysed below (previously it was computed and then ignored).
    if np.isnan(audio_data).mean() < 1:
        input_audio = np.nan_to_num(audio_data, nan=np.nanmean(audio_data))
    else:
        input_audio = np.zeros_like(audio_data)

    # SPECTROGRAM
    frequencies, _, spec_data = sci_signal.spectrogram(
        input_audio,
        fs=CONFIG.sr,
        nfft=CONFIG.n_fft,
        nperseg=CONFIG.win_len,
        noverlap=CONFIG.hop_len,
        window='hann'
    )

    # Mel filter bank limited to the configured frequency band.
    mel_window = librosa.filters.mel(
        sr=CONFIG.sr,
        n_fft=CONFIG.n_fft,
        n_mels=CONFIG.n_mels,
        htk=True,
        fmin=CONFIG.min_freq,
        fmax=CONFIG.max_freq,
    )

    # Log spectrum shared by all three outputs; the small constant keeps
    # log10 finite where the spectrogram is exactly zero.
    log_spec = np.log10(np.abs(spec_data) + 1e-20)

    # Mel projection of the log spectrum.
    melspec = np.dot(mel_window, log_spec)
    # melspec = librosa.amplitude_to_db(melspec, ref=np.max)

    # Same projection with the filter bank mirrored along the FFT-bin axis.
    oposite_melspec = np.dot(mel_window[:, ::-1], log_spec)
    # oposite_melspec = librosa.amplitude_to_db(oposite_melspec, ref=np.max)

    # FILTER LOWER AND HIGHER FREQUENCIES of the linear-frequency view.
    valid_freq = (frequencies >= CONFIG.min_freq) & (frequencies <= CONFIG.max_freq)
    spec_data = log_spec[valid_freq, :]

    # MIN/MAX NORMALIZATION of each view on its own.
    spec_data = _min_max_normalize(spec_data)
    melspec = _min_max_normalize(melspec)
    oposite_melspec = _min_max_normalize(oposite_melspec)

    return spec_data, melspec, oposite_melspec

In [7]:
# Smoke test: run the preprocessing on a single unlabeled soundscape.
file_path = '../../data/2024/unlabeled_soundscapes/1000170626.ogg'
audio_data, _ = librosa.load(file_path, sr=CONFIG.sr)
# Returns the linear-frequency, mel and mirrored-mel views, in that order.
spec_data, melspec, oposite_melspec = oog2spec_via_scipy(audio_data)

In [8]:
melspec.shape, melspec.min(), melspec.max()

((128, 24615), 0.0, 1.0)

In [9]:
oposite_melspec.shape, oposite_melspec.min(), oposite_melspec.max()

((128, 24615), 0.0, 1.0)

In [10]:
spec_data.shape, spec_data.min(), spec_data.max()

((512, 24615), 0.0, 1.0)

In [11]:
from functools import partial
from joblib import Parallel, delayed

In [12]:
# Run a trivial identity job on every core and discard the result.
# NOTE(review): presumably a warm-up so joblib's worker start-up cost is paid here — confirm intent.
_ = Parallel(n_jobs=os.cpu_count())(delayed(lambda x: x)(i) for i in range(10))


In [13]:
# all_bird_data = dict()

# # https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-inference
# if len(glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')) > 0:
#     ogg_file_paths = glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')
# else:
#     ogg_file_paths = sorted(glob(f'{CONFIG.data_dir_2024}/unlabeled_soundscapes/*.ogg'))[:10]

# for i, file_path in tqdm(enumerate(ogg_file_paths)):

#     row_id = re.search(r'/([^/]+)\.ogg$', file_path).group(1)  # filename

#     audio_data, _ = librosa.load(file_path, sr=CONFIG.sr)

#     for i in range(48):
#         input_data = audio_data[5*i*CONFIG.sr:5*(i+1)*CONFIG.sr]
#         spec = oog2spec_via_scipy(input_data)
#         # print(spec.shape)
#         all_bird_data[f'{row_id}_{(i+1)*5}'] = spec
#         # break
#     # print(R.shape, G.shape, B.shape)
    
#     # pad
#     # pad = 512 - (R.shape[1] % 512)
#     # if pad > 0:
#     #     R = np.pad(R, ((0,0), (0,pad)))
#     #     G = np.pad(G, ((0,0), (0,pad)))
#     #     B = np.pad(B, ((0,0), (0,pad)))
#     #     # print(spec.shape)
#     # # reshape
#     # R = R.reshape(512,-1,512).transpose([0, 2, 1])
#     # G = G.reshape(512,-1,512).transpose([0, 2, 1])
#     # B = B.reshape(512,-1,512).transpose([0, 2, 1])
    
    
#     # # print(spec.shape)
#     # # spec = cv2.resize(spec, (256, 256), interpolation=cv2.INTER_AREA)
#     # R = cv2.resize(R, (256, 256), interpolation=cv2.INTER_AREA)
#     # G = cv2.resize(G, (256, 256), interpolation=cv2.INTER_AREA)
#     # B = cv2.resize(B, (256, 256), interpolation=cv2.INTER_AREA)

#     # print(G.shape)
#     # # spec = np.array([R,G,B]).transpose(1,2,0) # (256,256,3)

#     # break
#     # print(spec.shape)
#     # for j in range(48):
#         # all_bird_data[f'{row_id}_{(j+1)*5}'] = spec[:, :, j]

In [14]:
def get_batched_specs(file_path, all_bird_data=None):
    """Slice one recording into 48 three-channel 256x256 spectrogram images.

    Args:
        file_path: Path of the audio file to load with librosa.
        all_bird_data: Optional dict to add the images to. A new dict is
            created when omitted. (The function used to write to a global
            of this name that is not defined in the notebook.)

    Returns:
        ``all_bird_data`` with one entry per 5-second step, keyed
        ``"<file stem>_<5, 10, ..., 240>"``; each value has shape
        ``(256, 256, 3)``.

    Raises:
        IndexError: if the recording yields fewer than 48 segments.
    """
    if all_bird_data is None:
        all_bird_data = {}

    audio_data, _ = librosa.load(file_path, sr=CONFIG.sr)
    # File name without directory and extension.
    row_id = os.path.splitext(os.path.basename(file_path))[0]

    R, G, B = oog2spec_via_scipy(audio_data)

    # All three views come from the same spectrogram and therefore share one
    # time axis, so they must be cut into segments of the same width. The
    # pad is always between 1 and 512 frames (a full extra segment is added
    # when the length is already a multiple of 512).
    pad = 512 - (R.shape[1] % 512)
    R = np.pad(R, ((0, 0), (0, pad)))
    G = np.pad(G, ((0, 0), (0, pad)))
    B = np.pad(B, ((0, 0), (0, pad)))

    # (freq, time) -> (freq, n_segments, 512) -> (freq, 512, n_segments)
    R = R.reshape(R.shape[0], -1, 512).transpose([0, 2, 1])
    G = G.reshape(G.shape[0], -1, 512).transpose([0, 2, 1])
    B = B.reshape(B.shape[0], -1, 512).transpose([0, 2, 1])

    # Resize every segment to 256x256; cv2 treats the last axis (segments)
    # as image channels.
    R = cv2.resize(R, (256, 256), interpolation=cv2.INTER_AREA)
    G = cv2.resize(G, (256, 256), interpolation=cv2.INTER_AREA)
    B = cv2.resize(B, (256, 256), interpolation=cv2.INTER_AREA)

    # (3, 256, 256, n_segments) -> (256, 256, 3, n_segments)
    spec = np.array([R, G, B]).transpose(1, 2, 0, 3)

    for j in range(48):
        all_bird_data[f'{row_id}_{(j+1)*5}'] = spec[:, :, :, j]

    return all_bird_data

In [15]:
# import time

# start = time.time()
# all_bird_data_ = dict()

# # https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-inference
# if len(glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')) > 0:
#     ogg_file_paths = glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')
# else:
#     ogg_file_paths = sorted(glob(f'{CONFIG.data_dir_2024}/unlabeled_soundscapes/*.ogg'))[:10]

# for i, file_path in tqdm(enumerate(ogg_file_paths)):
#     all_bird_data_ = get_batched_specs(file_path, all_bird_data_)

# end = time.time()

# print(f"It took {end-start}s to run")

# New One

In [16]:
class EffNet(nn.Module):
    """torchvision EfficientNet (B0 to B3) with a replaced classification layer."""

    def __init__(self, model_type, n_classes, pretrained=False):
        """Build the requested backbone and resize its last linear layer.

        Args:
            model_type: One of ``'efficientnet_b0'`` ... ``'efficientnet_b3'``.
            n_classes: Number of outputs of the new classification layer.
            pretrained: Load torchvision's ``DEFAULT`` weights when true,
                otherwise start from random initialisation.

        Raises:
            ValueError: if ``model_type`` is not one of the supported names.
        """
        super().__init__()

        supported = (
            'efficientnet_b0',
            'efficientnet_b1',
            'efficientnet_b2',
            'efficientnet_b3',
        )
        if model_type not in supported:
            raise ValueError('model type not supported')

        # 'efficientnet_b2' -> builder `efficientnet_b2`, enum `EfficientNet_B2_Weights`
        variant = model_type.rsplit('_', 1)[1].upper()
        weights = None
        if pretrained:
            weights = getattr(efficientnet, f'EfficientNet_{variant}_Weights').DEFAULT
        build_backbone = getattr(efficientnet, model_type)
        self.base_model = build_backbone(weights=weights)

        # Swap the final linear layer for one with `n_classes` outputs.
        old_head = self.base_model.classifier[1]
        self.base_model.classifier[1] = nn.Linear(
            old_head.in_features, n_classes, dtype=torch.float32
        )

    def forward(self, x):
        """Move the last axis of ``x`` to position 1 and run the backbone."""
        channels_first = x.permute(0, 3, 1, 2)
        return self.base_model(channels_first)

class BirdModel(pl.LightningModule):
    """Lightning module wrapping an :class:`EffNet` classifier.

    In this notebook only ``__init__``, ``forward`` and ``load_state_dict``
    are exercised (inference). The training/validation hooks are kept so the
    class matches the checkpoints.

    NOTE(review): ``configure_optimizers`` reads ``CONFIG.lr``,
    ``CONFIG.weight_decay`` and ``CONFIG.epochs``, and
    ``on_validation_epoch_end`` calls ``score``; none of these are defined
    in the visible part of this file.
    """

    def __init__(self):
        super().__init__()

        # == backbone ==
        self.backbone = EffNet(CONFIG.model, n_classes=len(label_list))

        # == loss function ==
        self.loss_fn = nn.CrossEntropyLoss()

        # == record ==
        # Per-batch validation outputs, collected until the epoch ends.
        self.validation_step_outputs = []

    def forward(self, images):
        """Return the backbone's raw (unnormalised) class scores."""
        return self.backbone(images)

    def configure_optimizers(self):
        """Create Adam plus a cosine-annealing-with-restarts schedule."""

        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=CONFIG.lr,
            weight_decay=CONFIG.weight_decay
        )

        # == define learning rate scheduler ==
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=CONFIG.epochs,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )

        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""

        # == obtain input and target ==
        image, target = batch
        image = image.to(self.device)
        target = target.to(self.device)

        # == pred ==
        y_pred = self(image)

        # == compute loss ==
        train_loss = self.loss_fn(y_pred, target)

        # == record ==
        self.log('train_loss', train_loss, prog_bar=True)

        return train_loss

    def validation_step(self, batch, batch_idx):
        """Store the logits and targets of one validation batch."""

        # == obtain input and target ==
        image, target = batch
        image = image.to(self.device)
        target = target.to(self.device)

        # == pred ==
        with torch.no_grad():
            y_pred = self(image)

        self.validation_step_outputs.append({"logits": y_pred, "targets": target})

    def train_dataloader(self):
        # `_train_dataloader` is not set in this class; it has to be
        # attached by the caller before training.
        return self._train_dataloader

    def validation_dataloader(self):
        # NOTE(review): Lightning's hook is named `val_dataloader`; a method
        # called `validation_dataloader` is presumably never invoked by the
        # Trainer — confirm against the training code.
        return self._validation_dataloader

    def on_validation_epoch_end(self):
        """Aggregate the epoch's validation batches into loss and score.

        Returns:
            dict with ``val_loss`` (cross-entropy on the logits) and
            ``val_score`` (result of ``score`` on one-hot targets versus
            softmax probabilities).
        """

        # = merge batch data =
        outputs = self.validation_step_outputs

        logits_val = torch.cat([x['logits'] for x in outputs], dim=0).cpu().detach()
        target_val = torch.cat([x['targets'] for x in outputs], dim=0).cpu().detach()

        # Drop the stored batches so the next validation epoch starts empty
        # instead of mixing in (and keeping alive) this epoch's tensors.
        self.validation_step_outputs.clear()

        # = compute validation loss =
        # CrossEntropyLoss applies log-softmax itself, so it must be given
        # the raw logits, not the softmax probabilities.
        val_loss = self.loss_fn(logits_val, target_val)

        # Probabilities are only needed for the metric below.
        output_val = torch.softmax(logits_val, dim=1)

        # target to one-hot
        target_val = torch.nn.functional.one_hot(target_val, len(label_list))

        # = val with ROC AUC =
        gt_df = pd.DataFrame(target_val.numpy().astype(np.float32), columns=label_list)
        pred_df = pd.DataFrame(output_val.numpy().astype(np.float32), columns=label_list)

        gt_df['id'] = [f'id_{i}' for i in range(len(gt_df))]
        pred_df['id'] = [f'id_{i}' for i in range(len(pred_df))]

        val_score = score(gt_df, pred_df, row_id_column_name='id')

        # Log both values; `val_loss` is the name the scheduler config monitors.
        self.log("val_loss", val_loss, prog_bar=True)
        self.log("val_score", val_score, prog_bar=True)

        return {'val_loss': val_loss, 'val_score': val_score}

def predict(spec, models):
    """Average the softmax outputs of several models for one batch.

    Args:
        spec: Array-like whose last axis indexes the samples; that axis is
            moved to the front before the batch is given to each model.
        models: Iterable of callables mapping the batch to a 2-D tensor of
            class scores (one row per sample).

    Returns:
        numpy array: element-wise mean over ``models`` of the per-model
        softmax (taken along dim 1).
    """
    # Move the last axis to the front once; every model gets the same view.
    batch = torch.tensor(spec, dtype=torch.float32).permute(3, 0, 1, 2)

    pred = []
    with torch.no_grad():
        for model in models:
            probabilities = torch.softmax(model(batch), dim=1)
            pred.append(probabilities.detach().cpu().numpy())

    gc.collect()
    return np.mean(pred, axis=0)

# glob() returns files in arbitrary order, so sort before indexing; without
# this, "the second checkpoint" could be a different file on each machine.
ckpt_list = sorted(glob(f'{CONFIG.checkpoint_dir}/*.ckpt'))
print(f'find {len(ckpt_list)} ckpts in {CONFIG.checkpoint_dir}.')
if len(ckpt_list) < 2:
    raise FileNotFoundError(
        f'expected at least 2 checkpoints in {CONFIG.checkpoint_dir}, found {len(ckpt_list)}'
    )
# Only the second checkpoint (in sorted order) is used for inference.
ckpt_list = [ckpt_list[1]]
predictions = []

models = []
for ckpt in ckpt_list:

    # == init model ==
    bird_model = BirdModel()

    # == load ckpt ==
    # Load onto CPU first; the model is moved to CONFIG.device afterwards.
    weights = torch.load(ckpt, map_location=torch.device('cpu'))['state_dict']
    bird_model.load_state_dict(weights)

    bird_model.to(CONFIG.device)
    bird_model.eval()  # inference mode for dropout / batch-norm layers
    models.append(bird_model)
    gc.collect()

# predictions = np.mean(predictions, axis=0)

# sub_pred = pd.DataFrame(predictions, columns=label_list)
# sub_id = pd.DataFrame({'row_id': list(all_bird_data.keys())})

# sub = pd.concat([sub_id, sub_pred], axis=1)

# sub.to_csv('submission.csv',index=False)
# print(f'Submissionn shape: {sub.shape}')
# sub.head(5)

find 3 ckpts in ../chpks/effnet_3fold_rgb.


In [17]:
# preds = predict(all_bird_data_[0]['1001358022_5'], models)

In [18]:
# preds.shape

In [19]:
def partial_predict(file_path, birds_dict_preds, models):
    """Predict every 5-second step of one recording and store the results.

    Args:
        file_path: Path of the audio file to load with librosa.
        birds_dict_preds: Dict that receives the predictions; it is updated
            in place and also returned.
        models: Models forwarded to ``predict``.

    Returns:
        ``birds_dict_preds`` with 48 added entries keyed
        ``"<file stem>_<5, 10, ..., 240>"``, each holding the averaged class
        probabilities of that step.

    Raises:
        IndexError: if the recording yields fewer than 48 segments.
    """
    ## GET BATCHED SPECS
    audio_data, _ = librosa.load(file_path, sr=CONFIG.sr)
    # File name without directory and extension.
    row_id = os.path.splitext(os.path.basename(file_path))[0]

    R, G, B = oog2spec_via_scipy(audio_data)

    # Pad the shared time axis up to a multiple of 512 frames. The pad is
    # always between 1 and 512, so all three views are padded unconditionally
    # and by the same amount.
    pad = 512 - (R.shape[1] % 512)
    R = np.pad(R, ((0, 0), (0, pad)))
    G = np.pad(G, ((0, 0), (0, pad)))
    B = np.pad(B, ((0, 0), (0, pad)))

    # (freq, time) -> (freq, n_segments, 512) -> (freq, 512, n_segments)
    R = R.reshape(R.shape[0], -1, 512).transpose([0, 2, 1])
    G = G.reshape(G.shape[0], -1, 512).transpose([0, 2, 1])
    B = B.reshape(B.shape[0], -1, 512).transpose([0, 2, 1])

    # Resize every segment to 256x256; cv2 treats the last axis (segments)
    # as image channels.
    R = cv2.resize(R, (256, 256), interpolation=cv2.INTER_AREA)
    G = cv2.resize(G, (256, 256), interpolation=cv2.INTER_AREA)
    B = cv2.resize(B, (256, 256), interpolation=cv2.INTER_AREA)

    # (3, 256, 256, n_segments) -> (256, 256, 3, n_segments)
    spec = np.array([R, G, B]).transpose(1, 2, 0, 3)

    preds = predict(spec, models)

    for j in range(48):
        birds_dict_preds[f'{row_id}_{(j+1)*5}'] = preds[j]

    # The former one-second sleep per file was removed: it only added idle
    # time to every recording.
    return birds_dict_preds

In [20]:
file_path

'../../data/2024/unlabeled_soundscapes/1000170626.ogg'

In [21]:
# Try the full pipeline on the single file loaded earlier, starting from an
# empty prediction dict.
birds_dict_preds = {}
birds_dict_preds = partial_predict(file_path, birds_dict_preds, models)

In [22]:
# Getting file order
def get_key_names(file_path):
    """Return the 48 row ids of one recording, in time order.

    Args:
        file_path: Path (or bare file name) of the audio file.

    Returns:
        List of ``"<file stem>_<seconds>"`` strings for 5, 10, ..., 240
        seconds. The stem is the file name without directory and extension,
        so a name without any directory part works as well.
    """
    row_id = os.path.splitext(os.path.basename(file_path))[0]
    return [f'{row_id}_{(j+1)*5}' for j in range(48)]

In [23]:
# get_key_names(file_path)[2]


# Use the test soundscapes when any exist; otherwise fall back to the first
# ten unlabeled soundscapes (sorted by path).
ogg_file_paths = (
    glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')
    or sorted(glob(f'{CONFIG.data_dir_2024}/unlabeled_soundscapes/*.ogg'))[:10]
)

# Row ids of every file, in file order.
indices = []
for file_path in ogg_file_paths:
    indices += get_key_names(file_path)

In [24]:
from concurrent.futures import ThreadPoolExecutor
import time

# Function that will be executed in parallel
def minha_funcao(arg):
    time.sleep(1)  # Simulates a slow operation
    return arg * 2

# List of arguments for the function
argumentos = [1, 2, 3, 4, 5]

# Create a ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    # Map the function over each argument
    resultados = list(executor.map(minha_funcao, argumentos))

    # Wait until all pending tasks have finished
    # (redundant here: leaving the `with` block performs the same shutdown)
    executor.shutdown(wait=True)

print("Resultados:", resultados)

Resultados: [2, 4, 6, 8, 10]


In [25]:
import time
import threading
start = time.time()
birds_dict_preds = {}

# Use the test soundscapes when any exist; otherwise fall back to the first
# ten unlabeled soundscapes (sorted by path).
ogg_file_paths = (
    glob(f'{CONFIG.data_dir_2024}/test_soundscapes/*.ogg')
    or sorted(glob(f'{CONFIG.data_dir_2024}/unlabeled_soundscapes/*.ogg'))[:10]
)

# Bind everything except the file path so joblib only has to vary that.
_convert = partial(partial_predict, birds_dict_preds=birds_dict_preds, models=models)

# One task per file; the result is a list with one dict per task.
# NOTE(review): `timeout=10` makes joblib raise if a single task takes more
# than ten seconds — confirm that is enough for one recording on this device.
birds_dict_preds = Parallel(n_jobs=2, timeout=10)(
    delayed(_convert)(file_path) for file_path in tqdm(ogg_file_paths)
)

# Row ids in file order; this fixes the row order of the submission.
indices = []
for file_path in ogg_file_paths:
    indices.extend(get_key_names(file_path))

# Merge the per-task dicts into a single lookup table.
dicionario_final = {}
for dicionario in birds_dict_preds:
    dicionario_final.update(dicionario)

predictions = [dicionario_final[idx] for idx in indices]

# Assemble the submission: row_id column followed by one column per label.
sub_id = pd.DataFrame({'row_id': indices})
sub_pred = pd.DataFrame(predictions, columns=label_list)
sub = pd.concat([sub_id, sub_pred], axis=1)

sub.to_csv('submission.csv',index=False)
print(f'Submissionn shape: {sub.shape}')

end = time.time()

print(f"It took {end-start}s to run")

  0%|                                                    | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Scale the measured inference time by 110 to estimate a run over ~1100 recordings.
# gmtime() turns the seconds into a time tuple that strftime() can format.
sub_time = time.strftime("%H hr: %M min : %S sec", time.gmtime((end - start) * 110))
print(f">> Time for submission: ~ {sub_time}")  # Print estimated submission time

In [None]:
sub.head(5)

In [None]:
len(ogg_file_paths)

In [None]:
e = time.time()

# `s` was taken in the first timing cell, so this covers the whole notebook.
print(f'Notebook runtime: {e-s}')

# Scale the full runtime by 110 to estimate a run over ~1100 recordings.
# gmtime() turns the seconds into a time tuple that strftime() can format.
sub_time = time.strftime("%H hr: %M min : %S sec", time.gmtime((e - s) * 110))
print(f">> Time for submission: ~ {sub_time}")  # Print estimated submission time