# Import

In [1]:
# Misc
import os
import json
import warnings

# Data management
import numpy as np
import pandas as pd

# Sound treatments
import librosa
import soundfile as sf

# Model
import tensorflow as tf

from keras.applications.efficientnet import EfficientNetB0
from keras.applications.efficientnet import preprocess_input as penb0

# Metrics
import tensorflow_addons as tfa

# Environment

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# imported libraries are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
# Kaggle locations, kept for reference: uncomment these (and comment out the
# local ones below) when the notebook runs on Kaggle.
#DATA_PATH = '/kaggle/input/birdclef-2022/'
#WORKING_PATH = '/kaggle/working/'
#MODEL_PATH = '/kaggle/input/kernel-efficientnetb0-melspec-multilabel/'

# Local locations. Every path ends with '/' because file names are appended
# to it by plain string concatenation.
DATA_PATH = './data/'
WORKING_PATH = './working/kernel/'
MODEL_PATH = './working/kernel/'

# Data load

In [4]:
# Read the training metadata table
train_meta = pd.read_csv(DATA_PATH + 'train_metadata.csv')

# Read the list of scored species
with open(DATA_PATH + 'scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)

# Scored species that occur as a primary label, in order of first appearance
# in the metadata.
# NOTE(review): create_cnn() sizes the output layer from this list and the
# prediction loop maps output indexes back through it, so the order presumably
# has to match the one used at training time — confirm.
is_scored = train_meta['primary_label'].isin(scored_birds)
labels = train_meta.loc[is_scored, 'primary_label'].unique().tolist()
labels

['akiapo',
 'aniani',
 'apapan',
 'barpet',
 'crehon',
 'elepai',
 'ercfra',
 'hawama',
 'hawcre',
 'hawgoo',
 'hawhaw',
 'hawpet1',
 'houfin',
 'iiwi',
 'jabwar',
 'maupar',
 'omao',
 'puaioh',
 'skylar',
 'warwhe1',
 'yefcan']

# Model

In [5]:
def create_cnn():
    """Build the EfficientNetB0-based multi-label classifier.

    The network is an EfficientNetB0 backbone without its classification top
    (no pre-trained weights, global average pooling) followed by a 142-unit
    ReLU layer and a sigmoid layer with one unit per entry of ``labels``.
    The backbone is flagged as non-trainable.

    Returns:
        The assembled Keras model (not compiled).
    """
    input_shape = (conf.num_rows, conf.num_columns, conf.num_channels)

    # Instantiate the backbone; weights=None means nothing is downloaded here
    backbone = EfficientNetB0(
        include_top=False,
        input_shape=input_shape,
        weights=None,
        pooling='avg',
    )

    # Classification head on top of the pooled backbone features
    hidden = tf.keras.layers.Dense(142, activation='relu')(backbone.output)
    outputs = tf.keras.layers.Dense(len(labels), activation='sigmoid')(hidden)

    backbone.trainable = False

    return tf.keras.models.Model(inputs=backbone.input, outputs=outputs)

# Configuration

In [6]:
class conf:
    """Static settings shared by the preprocessing and the model code."""

    # --- Mel-spectrogram extraction (arguments of librosa.feature.melspectrogram) ---
    # NOTE(review): the prediction loop calls librosa.load without `sr`, so the
    # decoded audio is presumably not at `sampling_rate` — confirm this matches
    # what was done at training time.
    sampling_rate = 44_100   # passed as `sr`
    n_mels = 224             # passed as `n_mels`
    hop_length = 494         # passed as `hop_length`
    n_fft = 10 * n_mels      # passed as `n_fft` (2240)
    fmin = 20                # passed as `fmin`
    fmax = 16_000            # passed as `fmax`

    # --- Shape of one model input: (num_rows, num_columns, num_channels) ---
    num_rows = 224
    num_columns = 224
    num_channels = 3

# Preprocessing

In [7]:
def audio_to_melspectrogram(audio):
    """Convert audio samples into a dB-scaled mel spectrogram.

    All spectrogram parameters (sampling rate, number of mel bands, hop
    length, FFT size, frequency range) come from ``conf``.

    Args:
        audio: array of audio samples (the callers in this notebook pass a
            1-D array).

    Returns:
        The output of ``librosa.power_to_db`` applied to the mel spectrogram,
        cast to float32.
    """
    # The waveform is passed by keyword: recent librosa releases made the
    # arguments of melspectrogram keyword-only, so a positional call raises
    # TypeError there. `y=` works on older releases as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)

In [8]:
def extractFeatures(y, sr):
    """Turn one block of audio into a single-sample batch for the model.

    The block is converted to a mel spectrogram, brought to exactly
    ``conf.num_columns`` columns (zero-padded on the right when shorter,
    cropped on the right when longer), replicated over ``conf.num_channels``
    channels and passed through the EfficientNet ``preprocess_input``.

    Args:
        y: audio samples of the block.
        sr: sampling rate of ``y``. Not used by this function (the spectrogram
            is computed with ``conf.sampling_rate``); kept so existing callers
            keep working.

    Returns:
        Array of shape ``(1, conf.num_rows, conf.num_columns,
        conf.num_channels)`` with NumPy's default float dtype.

    Raises:
        ValueError: if the spectrogram cannot be stored in that shape, e.g.
            its number of rows differs from ``conf.num_rows``.
    """
    # Extract features
    feat = audio_to_melspectrogram(y)

    # Force the column count to conf.num_columns. Previously a spectrogram
    # with too many columns was left as is and the assignment into X failed.
    n_frames = feat.shape[1]
    if n_frames < conf.num_columns:
        feat = np.pad(feat,
                      pad_width=((0, 0), (0, conf.num_columns - n_frames)),
                      mode='constant')
    elif n_frames > conf.num_columns:
        feat = feat[:, :conf.num_columns]

    # Replicate the single-channel spectrogram over the channel axis
    feat = np.stack([feat] * conf.num_channels, axis=-1)

    # Model-specific preprocessing
    feat = penb0(feat)

    # Batch of one; direct assignment replaces the former tolist() round trip
    X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
    X[0] = feat

    return X

# Submission

## Load model

In [9]:
# Rebuild the architecture, then restore the saved weights into it
weights_file = MODEL_PATH + 'finetune_ms_da_EfficientNetB0.h5'
pred_model = create_cnn()
pred_model.load_weights(weights_file)

## Check model

## Process

In [10]:
# Folder holding the soundscapes to predict on. Joining with a final ''
# keeps the trailing separator (file names are appended to this string later)
# without doubling the '/' that DATA_PATH already ends with.
test_path = os.path.join(DATA_PATH, 'test_soundscapes', '')

# Recording names without their extension, in sorted order. Only .ogg files
# are kept because the prediction loop re-appends '.ogg' to each name, and
# splitext (unlike split('.')[0]) keeps names that contain extra dots intact.
files = [os.path.splitext(f)[0]
         for f in sorted(os.listdir(test_path))
         if f.endswith('.ogg')]
print('Number of test soundscapes:', len(files))

Number of test soundscapes: 1


In [11]:
# Per-class weights keyed by class index (the position in the prediction
# vector). testscore() reads them to choose the score threshold applied to
# each class: the larger the weight, the lower the threshold.
# NOTE(review): presumably derived from class frequencies in the training
# set — confirm against the training notebook.
Class_weights = {0: 3.1419637033987953, 1: 3.669718389265234, 2: 1.063421846889857, 3: 3.079225362765372,
                 4: 5.243224292473271, 5: 2.9406391994792256, 6: 5.51148827906795, 7: 1.684520523136885,
                 8: 2.173804303128121, 9: 4.338768018246119, 10: 4.362865569825179, 11: 4.985395183171171,
                 12: 1, 13: 1.2640925423520082, 14: 1.5514079786860249, 15: 4.550077111913326, 16: 1.9301083788605897,
                 17: 5.879213059193268, 18: 1, 19: 1.3671332962711884, 20: 2.218504482046028}

In [12]:
def testscore(prediction):   
    result = []

    for i in range(0, len(prediction)):
        if Class_weights[i] < 2:
            if prediction[i] >= 1e-2:
                result.append(prediction[i])
        elif Class_weights[i] < 3:
            if prediction[i] >= 1e-4:
                result.append(prediction[i])
        elif Class_weights[i] < 4:
            if prediction[i] >= 1e-8:
                result.append(prediction[i])
        elif Class_weights[i] < 5:
            if prediction[i] >= 1e-12:
                result.append(prediction[i])
        elif Class_weights[i] > 5:
            if prediction[i] >= 1e-16:
                result.append(prediction[i])
                
    return result

In [13]:
# One [row_id, target] entry per (soundscape, 5-second segment, scored bird)
data = []

for f in files:
    # Decode the recording; no `sr` is given, so librosa chooses the rate
    audio, sr = librosa.load(test_path + f + '.ogg')

    # Number of samples in a 5-second segment
    block_len = 5 * sr
    samples_total = len(audio)

    # Walk through the recording segment by segment; `counter` starts at 1
    for counter, start in enumerate(range(0, samples_total, block_len), start=1):
        block = audio[start:start + block_len]

        # The last segment may be shorter than the others: fill it with zeros
        missing = block_len - block.shape[0]
        if missing > 0:
            block = np.hstack([block, np.zeros(missing, dtype=int)])

        # Features extraction
        features = extractFeatures(block, sr)

        # Prediction
        pred = pred_model.predict(features)

        # How many scores reach their class-specific threshold
        n_hits = len(testscore(pred[0]))

        # Indexes of the n_hits best-scoring classes, highest score first.
        # NOTE(review): these are the top-N classes by score, which are not
        # necessarily the classes that passed their threshold in testscore()
        # — confirm this is the intended selection.
        ranking = np.argsort(np.max(pred, axis=0))
        label_indexes = [ranking[-(rank + 1)] for rank in range(n_hits)]

        print(label_indexes)

        # One row per scored bird, flagged True when it is among the selection
        segment_end = counter * 5
        for b in scored_birds:
            row_id = '_'.join([f, b, str(segment_end)])
            target = any(labels[idx] == b for idx in label_indexes)
            data.append([row_id, target])

submission_df = pd.DataFrame(data, columns=['row_id', 'target'])
submission_df.head(21)

[18, 12, 14, 11, 20, 16]
[12, 14, 18, 11]
[18, 12, 14, 16, 1, 2, 19, 11]
[18, 12, 14, 19, 11]
[12, 18, 14, 11, 20, 16]
[18, 12, 14]
[18, 12, 14, 11]
[12, 18, 14, 16, 2, 1]
[12, 18, 14, 11]
[12, 18, 14, 11, 16]
[12, 18, 14, 19, 16]
[18, 12, 11, 19]


Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
5,soundscape_453028782_elepai_5,False
6,soundscape_453028782_ercfra_5,False
7,soundscape_453028782_hawama_5,False
8,soundscape_453028782_hawcre_5,False
9,soundscape_453028782_hawgoo_5,False


In [14]:
# Write the predictions under WORKING_PATH, without the DataFrame index column
submission_df.to_csv(WORKING_PATH + 'submission.csv', index=False)