# Import

In [1]:
# Misc
import os
import gc
import joblib
import warnings

# Data management
import pandas as pd
import numpy as np

# Sound treatments
import librosa
from scipy import signal
import resampy

# TRILL
import tensorflow as tf
import tensorflow_hub as hub
import keras.backend as K 

## Metrics
import tensorflow_addons as tfa
from tensorflow_addons.layers.netvlad import NetVLAD

# Environment

In [3]:
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides warnings that could point at
# real problems; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Alternative locations under /kaggle/ (swap with the local block below to
# use them).
#DATA_PATH = '/kaggle/input/birdclef-2022/'
#WORKING_PATH = '/kaggle/working/'
#TRILL_PATH = '/kaggle/input/ziptrill/'
#VGGISH_PATH = '/kaggle/input/vggishfull/'
#MODEL_PATH = '/kaggle/input/models/'

# Local locations. Other cells build file names by plain string
# concatenation, so every value must keep its trailing slash.
DATA_PATH = './data/'  # the test_soundscapes/ folder is read from here
WORKING_PATH = './working/stacking/'  # submission.csv is written here
TRILL_PATH = './working/stacking/trill/'  # handle given to hub.KerasLayer
VGGISH_PATH = './working/stacking/'  # holds VGGish_full.h5
MODEL_PATH = './working/stacking/'  # holds trill.h5, EfficientNetB0.h5, meta_model.jl

# Data load

In [4]:
# Species codes that get one row per 5-second segment in the submission.
# The order matters: the inference loop maps the position of each meta-model
# output to the code at the same index in this list (21 entries, matching the
# 21 output units of the base networks).
scored_birds = ['akiapo', 'aniani', 'apapan', 'barpet', 'crehon',
                'elepai', 'ercfra', 'hawama', 'hawcre', 'hawgoo',
                'hawhaw', 'hawpet1', 'houfin', 'iiwi', 'jabwar',
                'maupar', 'omao', 'puaioh', 'skylar', 'warwhe1',
                'yefcan']

# Model

In [5]:
def create_cnn(model_name):
    """
    Builds one of the three base networks of the stacking ensemble
    Arguments:
        model_name {str} -- 'trill', 'efficientnetb0' or 'vggish'
    Returns:
        tf.keras model -- 'trill' and 'efficientnetb0' are returned without
            trained weights (the caller loads them); 'vggish' is restored
            completely from VGGISH_PATH + 'VGGish_full.h5'
    Raises:
        ValueError -- if model_name is not one of the supported names
    """
    if model_name == 'trill':
        # Raw waveform in (80000 samples) -> frozen TRILL embedding ->
        # NetVLAD pooling -> small dense head with 21 sigmoid outputs.
        model = tf.keras.models.Sequential()
        model.add(tf.keras.Input((80000,)))

        trill_layer = hub.KerasLayer(
            handle=TRILL_PATH,
            trainable=False,
            arguments={'sample_rate': 16000},
            output_key='embedding',
            output_shape=[None, 2048]
        )

        model.add(trill_layer)
        model.add(NetVLAD(num_clusters=8))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dense(256, activation='relu'))
        model.add(tf.keras.layers.Dense(21, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(l=1e-5)))

    elif model_name == 'efficientnetb0':
        # weights=None: the backbone is created empty, nothing is downloaded.
        base_model = tf.keras.applications.efficientnet.EfficientNetB0(include_top=False, input_shape=(224, 224, 3), weights=None, pooling='avg')
        dense = tf.keras.layers.Dense(142, activation='relu')(base_model.output)
        outputs = tf.keras.layers.Dense(21, activation='sigmoid')(dense)
        base_model.trainable = False
        model = tf.keras.models.Model(inputs=base_model.input, outputs=outputs)

    elif model_name == 'vggish':
        # The architecture is not rebuilt here; the saved file carries both
        # the graph and the weights.
        model = tf.keras.models.load_model(VGGISH_PATH + 'VGGish_full.h5')

    else:
        # Without this branch an unknown name reached `return model` with the
        # variable unbound and failed with an unrelated UnboundLocalError.
        raise ValueError(
            "Unknown model_name %r; expected 'trill', 'efficientnetb0' or 'vggish'" % (model_name,))

    return model

# Submission

## Load models

In [6]:
# Rebuild the TRILL-based network, then restore its weights from trill.h5.
Trill = create_cnn('trill')
Trill.load_weights(MODEL_PATH + 'trill.h5')
# compile() attaches optimizer, loss and a macro F1 metric; the rest of the
# notebook only calls the model directly for forward passes.
Trill.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=[
              tfa.metrics.F1Score(name='f1macro', num_classes=21, average='macro')], run_eagerly=True)
# NOTE(review): trainable is switched off after compile(); presumably fine
# because no training happens here - confirm if fit() is ever added.
Trill.trainable = False

In [7]:
# Rebuild the EfficientNetB0-based network, then restore its weights from
# EfficientNetB0.h5.
EfficientNetB0 = create_cnn('efficientnetb0')
EfficientNetB0.load_weights(MODEL_PATH + 'EfficientNetB0.h5')
# compile() attaches optimizer, loss and a macro F1 metric; the rest of the
# notebook only calls the model directly for forward passes.
EfficientNetB0.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=[
                       tfa.metrics.F1Score(name='f1macro', num_classes=21, average='macro')], run_eagerly=True)
# Mark every layer (head included) as non-trainable.
EfficientNetB0.trainable = False

In [8]:
# create_cnn('vggish') restores the complete saved model from
# VGGISH_PATH + 'VGGish_full.h5', so no separate load_weights() call is made.
VGGish = create_cnn('vggish')
# compile() attaches optimizer, loss and a macro F1 metric; the rest of the
# notebook only calls the model directly for forward passes.
VGGish.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=[
               tfa.metrics.F1Score(name='f1macro', num_classes=21, average='macro')], run_eagerly=True)
VGGish.trainable = False

In [9]:
# Second-level (stacking) estimator; it is later queried with predict_proba
# on the concatenated outputs of the three base networks.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code -
# only load a meta_model.jl that comes from a trusted source.
meta_model = joblib.load(MODEL_PATH + 'meta_model.jl')

In [10]:
# Drop the module/class names that were only needed to build and load the
# models, then force a garbage-collection pass.
# NOTE: after this cell the model-building and loading cells above cannot be
# re-run (they reference hub, NetVLAD, tfa and joblib) until the import cell
# is executed again.
del hub
del NetVLAD
del tfa
del joblib
gc.collect()

3540

## Feature extraction

### Trill

In [11]:
# Sound noise reduction
def f_high_trill(y, sr):
    """
    High-pass filters a waveform before it is fed to the TRILL pipeline
    Arguments:
        y {array-like} -- audio samples
        sr {number} -- sample rate of y in Hz
    Returns:
        numpy array -- y filtered by a 10th-order Butterworth high-pass with
            a 1000 Hz cutoff
    """
    nyquist = sr / 2
    numerator, denominator = signal.butter(10, 1000 / nyquist, btype='highpass')
    return signal.lfilter(numerator, denominator, y)

In [12]:
def extractFeatures_trill(y, sr):
    """
    Prepares a waveform for the TRILL-based network
    Arguments:
        y {numpy array} -- audio samples
        sr {number} -- sample rate of y in Hz
    Returns:
        numpy array -- high-pass filtered waveform resampled to 16000 Hz
    """
    # Sound noise reduction
    y = f_high_trill(y, sr)
    # Resample. The rates are passed by keyword: recent librosa releases no
    # longer accept them positionally, while older ones accept both forms.
    y = librosa.resample(y, orig_sr=sr, target_sr=16000)

    return y

### EfficientNetB0

In [13]:
class conf:
    """Constants shared by the EfficientNetB0 feature pipeline."""

    # --- Mel-spectrogram extraction (see audio_to_melspectrogram) ---
    sampling_rate = 44100  # value handed to librosa as `sr`
    n_mels = 224
    n_fft = 10 * n_mels
    hop_length = 494
    fmin, fmax = 20, 16000

    # --- Shape of one network input (rows x columns x channels) ---
    num_rows, num_columns, num_channels = 224, 224, 3

In [14]:
def audio_to_melspectrogram(audio):
    """
    Computes a dB-scaled mel spectrogram with the settings stored in conf
    Arguments:
        audio {numpy array} -- mono audio samples
    Returns:
        numpy array [n_mels x frames] -- float32 spectrogram in decibels
    """
    # `y=` is given by keyword: recent librosa releases made the audio
    # argument keyword-only, older releases accept the keyword as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

In [15]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array to a 3 channel one in [0, 255]
    Arguments:
        X {numpy array [H x W]} -- 2D array to convert
    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None, float or np array} -- Mean for normalization; computed
            from the stacked array when None (default: {None})
        std {None, float or np array} -- Std for normalization; computed
            from the stacked array when None (default: {None})
    Returns:
        numpy array [H x W x 3] -- uint8 RGB array (channels last); all zeros
            when the standardized values span no more than eps
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` is used instead of `mean or ...`: the `or` form
    # raises on multi-element arrays and silently discards an explicit 0.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # Flat input: avoid dividing by a (near) zero range.
        V = np.zeros_like(X, dtype=np.uint8)

    return V

In [16]:
def extractFeatures_EfficientNetB0(y, sr):
    """
    Turns a waveform into a single-item batch for the EfficientNetB0 network
    Arguments:
        y {numpy array} -- mono audio samples
        sr {number} -- unused; the spectrogram is computed with
            conf.sampling_rate (NOTE(review): confirm this matches how the
            network was trained)
    Returns:
        numpy array [1 x num_rows x num_columns x num_channels] -- float64
            batch holding the 3-channel spectrogram image
    """
    # Extract features (mono_to_color already returns uint8, so no extra
    # cast is needed)
    feat = audio_to_melspectrogram(y)
    feat = mono_to_color(feat)

    # EfficientNet preprocess
    feat = tf.keras.applications.efficientnet.preprocess_input(feat)

    # Copy straight into the batch buffer; the previous tolist()/np.array
    # round trip built ~150k Python objects per block for the same values.
    X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
    X[0] = np.asarray(feat)

    return X

### VGGish

In [17]:
# Sound noise reduction
def f_high_VGGish(y,sr):
    """
    High-pass filters a waveform before it is fed to the VGGish pipeline
    Arguments:
        y {array-like} -- audio samples
        sr {number} -- sample rate of y in Hz
    Returns:
        numpy array -- y filtered by a 10th-order Butterworth high-pass with
            a 2000 Hz cutoff
    """
    nyquist = sr / 2
    numerator, denominator = signal.butter(10, 2000 / nyquist, btype='highpass')
    return signal.lfilter(numerator, denominator, y)

In [18]:
def waveform_to_examples(data, sample_rate):
    """
    Converts a waveform into non-overlapping log-mel examples for VGGish
    Arguments:
        data {numpy array} -- samples, either 1D or [samples x channels]
        sample_rate {number} -- sample rate of data in Hz
    Returns:
        numpy array [examples x 96 x 64] -- log-mel patches of 96 frames by
            64 mel bands each
    """
    vggish_rate = 16000
    stft_hop_secs = 0.010

    # Collapse channels to mono, then bring the audio to the VGGish rate.
    mono = data if len(data.shape) <= 1 else np.mean(data, axis=1)
    if sample_rate != vggish_rate:
        mono = resampy.resample(mono, sample_rate, vggish_rate)

    log_mel = log_mel_spectrogram(
        mono,
        audio_sample_rate=vggish_rate,
        log_offset=0.01,
        window_length_secs=0.025,
        hop_length_secs=stft_hop_secs,
        num_mel_bins=64,
        lower_edge_hertz=125,
        upper_edge_hertz=7500)

    # Window and hop are both 0.96 s worth of spectrogram frames, so the
    # examples do not overlap.
    frames_per_second = 1.0 / stft_hop_secs
    frames_per_example = int(round(0.96 * frames_per_second))
    return frame(
        log_mel,
        window_length=frames_per_example,
        hop_length=frames_per_example)

In [19]:
def log_mel_spectrogram(data,
                        audio_sample_rate=8000,
                        log_offset=0.0,
                        window_length_secs=0.025,
                        hop_length_secs=0.010,
                        **kwargs):
    """
    Computes the log of a mel-scaled magnitude spectrogram
    Arguments:
        data {numpy array} -- 1D waveform
    Keyword Arguments:
        audio_sample_rate {number} -- sample rate of data (default: {8000})
        log_offset {float} -- added before the log to avoid log(0) (default: {0.0})
        window_length_secs {float} -- STFT window duration (default: {0.025})
        hop_length_secs {float} -- STFT hop duration (default: {0.010})
        **kwargs -- forwarded to spectrogram_to_mel_matrix
    Returns:
        numpy array [frames x mel bins] -- log mel spectrogram
    """
    win_samples = int(round(audio_sample_rate * window_length_secs))
    hop_samples = int(round(audio_sample_rate * hop_length_secs))
    # Smallest power of two that holds one window.
    fft_length = 2 ** int(np.ceil(np.log(win_samples) / np.log(2.0)))

    magnitudes = stft_magnitude(
        data,
        fft_length=fft_length,
        hop_length=hop_samples,
        window_length=win_samples)
    mel_matrix = spectrogram_to_mel_matrix(
        num_spectrogram_bins=magnitudes.shape[1],
        audio_sample_rate=audio_sample_rate, **kwargs)

    return np.log(np.dot(magnitudes, mel_matrix) + log_offset)

In [20]:
def frame(data, window_length, hop_length):
    """
    Slices an array into (possibly overlapping) windows along its first axis
    Arguments:
        data {numpy array} -- array whose first axis is split
        window_length {int} -- number of entries per window
        hop_length {int} -- advance between the starts of two windows
    Returns:
        numpy array [windows x window_length x ...] -- strided view on data;
            trailing entries that do not fill a whole window are dropped
    """
    total = data.shape[0]
    n_windows = 1 + int(np.floor((total - window_length) / hop_length))
    framed_shape = (n_windows, window_length, *data.shape[1:])
    # Stepping one window forward skips hop_length rows of the source.
    framed_strides = (data.strides[0] * hop_length, *data.strides)
    return np.lib.stride_tricks.as_strided(
        data, shape=framed_shape, strides=framed_strides)

In [21]:
def stft_magnitude(signal, fft_length,
                   hop_length=None,
                   window_length=None):
    """
    Computes the magnitude of a short-time Fourier transform
    Arguments:
        signal {numpy array} -- 1D waveform
        fft_length {int} -- FFT size applied to every windowed frame
    Keyword Arguments:
        hop_length {int} -- samples between frame starts (default: {None})
        window_length {int} -- samples per frame (default: {None})
    Returns:
        numpy array [frames x (fft_length // 2 + 1)] -- magnitudes
    """
    # A periodic Hann (cosine of period window_length) is used rather than
    # the symmetric Hann of np.hanning (period window_length-1).
    taper = periodic_hann(window_length)
    tapered_frames = frame(signal, window_length, hop_length) * taper
    spectrum = np.fft.rfft(tapered_frames, int(fft_length))
    return np.abs(spectrum)

In [22]:
def periodic_hann(window_length):
    """
    Builds a periodic Hann window
    Arguments:
        window_length {int} -- number of points, also the cosine period
    Returns:
        numpy array [window_length] -- window values
    """
    radians_per_sample = 2 * np.pi / window_length
    phase = radians_per_sample * np.arange(window_length)
    return 0.5 - (0.5 * np.cos(phase))

In [23]:
def spectrogram_to_mel_matrix(num_mel_bins=20,
                              num_spectrogram_bins=129,
                              audio_sample_rate=8000,
                              lower_edge_hertz=125.0,
                              upper_edge_hertz=3800.0):
    """
    Builds the matrix that maps linear spectrogram bins onto mel bands
    Keyword Arguments:
        num_mel_bins {int} -- number of mel bands (default: {20})
        num_spectrogram_bins {int} -- number of input bins (default: {129})
        audio_sample_rate {number} -- sample rate in Hz (default: {8000})
        lower_edge_hertz {float} -- lower edge of the first band (default: {125.0})
        upper_edge_hertz {float} -- upper edge of the last band (default: {3800.0})
    Returns:
        numpy array [num_spectrogram_bins x num_mel_bins] -- triangular
            weights; post-multiply a [frames x bins] spectrogram with it
    Raises:
        ValueError -- if the edges are negative, not increasing, or above
            the Nyquist frequency
    """
    nyquist_hertz = audio_sample_rate / 2.
    if lower_edge_hertz < 0.0:
        raise ValueError("lower_edge_hertz %.1f must be >= 0" %
                         lower_edge_hertz)
    if lower_edge_hertz >= upper_edge_hertz:
        raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
                         (lower_edge_hertz, upper_edge_hertz))
    if upper_edge_hertz > nyquist_hertz:
        raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
                         (upper_edge_hertz, nyquist_hertz))

    # Mel position of every spectrogram bin, as a column for broadcasting.
    bins_mel = hertz_to_mel(
        np.linspace(0.0, nyquist_hertz, num_spectrogram_bins))[:, np.newaxis]

    # Band i has lower edge edges[i], center edges[i + 1] and upper edge
    # edges[i + 2], hence num_mel_bins + 2 equally spaced mel values.
    edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
                            hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
    lower_edges = edges_mel[:-2]
    centers = edges_mel[1:-1]
    upper_edges = edges_mel[2:]

    # Rising and falling sides of every triangle (linear in mel, not hertz),
    # computed for all bands at once ...
    rising = (bins_mel - lower_edges) / (centers - lower_edges)
    falling = (upper_edges - bins_mel) / (upper_edges - centers)
    # ... then intersected with each other and with zero.
    mel_weights_matrix = np.maximum(0.0, np.minimum(rising, falling))

    # HTK excludes the spectrogram DC bin; force its coefficients to zero.
    mel_weights_matrix[0, :] = 0.0
    return mel_weights_matrix

In [24]:
def hertz_to_mel(frequencies_hertz):
    """
    Converts frequencies from hertz to the mel scale
    Arguments:
        frequencies_hertz {number or numpy array} -- frequencies in Hz
    Returns:
        number or numpy array -- 1127 * ln(1 + f / 700), same shape as input
    """
    break_frequency_hertz = 700.0
    scale = 1127.0
    ratio = frequencies_hertz / break_frequency_hertz
    return scale * np.log(1.0 + ratio)

In [25]:
def extractFeatures_VGGish(y, sr):
    """
    Prepares a waveform for the VGGish network
    Arguments:
        y {numpy array} -- audio samples
        sr {number} -- sample rate of y in Hz
    Returns:
        numpy array -- log-mel examples computed from the high-pass
            filtered waveform
    """
    denoised = f_high_VGGish(y, sr)
    return waveform_to_examples(denoised, sr)

## Process

In [26]:
# create a meta dataset
def create_meta(yhat1, yhat2, yhat3):
    """
    Assembles the meta-model input from the three base-model predictions
    Arguments:
        yhat1 {dict} -- row key -> 21 values, stored in columns tr1..tr21
        yhat2 {dict} -- row key -> 21 values, stored in columns en1..en21
        yhat3 {dict} -- row key -> 21 values, stored in columns vg1..vg21
    Returns:
        pandas DataFrame -- one row per key and 63 columns ordered tr*, en*, vg*
    """
    prefixed_predictions = (('tr', yhat1), ('en', yhat2), ('vg', yhat3))

    per_model_frames = [
        pd.DataFrame.from_dict(
            predictions,
            orient='index',
            columns=[prefix + str(position) for position in range(1, 22)])
        for prefix, predictions in prefixed_predictions
    ]

    # Side-by-side concatenation; verify_integrity rejects duplicate columns.
    return pd.concat(per_model_frames, axis=1, verify_integrity=True)

In [27]:
# Folder with the soundscapes to score. The trailing '' makes os.path.join
# end the path with a separator, which the inference loop relies on when it
# concatenates test_path + name + '.ogg'.
test_path = os.path.join(DATA_PATH, 'test_soundscapes', '')
# Keep only .ogg entries and strip just the extension: splitting on the first
# '.' truncated names containing dots and turned hidden files such as
# '.DS_Store' into an empty name that was later loaded as '<empty>.ogg'.
files = [os.path.splitext(f)[0]
         for f in sorted(os.listdir(test_path))
         if f.endswith('.ogg')]
print('Number of test soundscapes:', len(files))

Number of test soundscapes: 1


In [28]:
# The file listing is done, so the os module name is dropped and a
# garbage-collection pass is forced.
# NOTE: the listing cell above cannot be re-run after this without
# re-running the import cell.
del os
gc.collect()

0

In [29]:
# Rows of the submission table: one [row_id, target] pair per
# (soundscape, 5-second segment, scored bird).
data = []

for f in files:
    file_path = test_path + f + '.ogg'

    # Load audio file (no sr argument is given, so the rate returned in `sr`
    # is whatever librosa.load chooses)
    audio, sr = librosa.load(file_path)
    del file_path

    # Get number of samples for 5 seconds.
    # buffer: length of the next slice (shrinks for the final, shorter one);
    # block_min: length every block is zero-padded up to.
    buffer = 5 * sr
    block_min = 5 * sr

    samples_total = len(audio)
    samples_wrote = 0  # number of samples consumed so far
    counter = 1  # 1-based segment number; the segment ends at counter * 5 seconds

    while samples_wrote < samples_total:
        # Shrink the slice when fewer than `buffer` samples remain
        if buffer > (samples_total - samples_wrote):
            buffer = samples_total - samples_wrote

        block = audio[samples_wrote: (samples_wrote + buffer)]

        # Zero-pad the last block so it is as long as the previous ones
        if block.shape[0] < (block_min):
            listofzeros = np.array([0] * (block_min - block.shape[0]))
            block = np.hstack([block, listofzeros])

        # --- Base model 1: TRILL ---
        # Each *_yhat dict holds a single row keyed 0, the layout
        # create_meta() turns into a one-row DataFrame.
        trill_yhat = {}
        block_trill = extractFeatures_trill(block, sr)
        # 80000 samples matches the Input((80000,)) of the TRILL network
        X = np.empty((1, 80000))
        X[0] = np.array(block_trill)
        tensor = tf.convert_to_tensor(X, dtype=tf.float32)
        pred_trill = Trill(tensor)
        del block_trill
        del X
        #print('pred_trill', pred_trill)
        trill_yhat[0] = pred_trill[0]
        del pred_trill

        # Memory is released eagerly after every model call
        gc.collect() 
        K.clear_session()

        # --- Base model 2: EfficientNetB0 ---
        EfficientNetB0_yhat = {}
        block_EfficientNetB0 = extractFeatures_EfficientNetB0(block, sr)
        X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
        X[0] = np.array(block_EfficientNetB0)
        tensor = tf.convert_to_tensor(X, dtype=tf.float32)
        pred_EfficientNetB0 = EfficientNetB0(tensor)
        del block_EfficientNetB0
        del X
        #print('pred_EfficientNetB0', pred_EfficientNetB0)
        EfficientNetB0_yhat[0] = pred_EfficientNetB0[0]
        del pred_EfficientNetB0

        gc.collect() 
        K.clear_session()

        # --- Base model 3: VGGish ---
        VGGish_yhat = {}
        block_VGGish = extractFeatures_VGGish(block, sr)
        # The 5 examples of 96 x 64 are merged into one 480 x 64 single-channel image
        X = np.empty((1, 5, 96, 64))
        X[0] = np.array(block_VGGish)
        X = X.reshape(1, 480, 64, 1)
        tensor = tf.convert_to_tensor(X, dtype=tf.float32)
        pred_VGGish = VGGish(tensor)
        del block_VGGish
        del X
        #print('pred_VGGish', pred_VGGish)
        VGGish_yhat[0] = pred_VGGish[0]
        del pred_VGGish
        del block

        gc.collect() 
        K.clear_session()

        # construct meta dataset (one row, 63 columns)
        meta_X = create_meta(trill_yhat, EfficientNetB0_yhat, VGGish_yhat)
        del trill_yhat, EfficientNetB0_yhat, VGGish_yhat
        gc.collect()

        # Prediction
        pred_meta = meta_model.predict_proba(meta_X)
        del meta_X
        gc.collect()
        #print('pred_meta', pred_meta)

        # Collect the positions whose score reaches the 0.1 threshold.
        # NOTE(review): presumably pred_meta holds one (n_samples, 2)
        # probability array per species and column 1 is the positive class -
        # confirm against how meta_model was trained.
        index = 0
        label_indexes = []
        for score in pred_meta:
            #print(index, score[0][1])
            if score[0][1] >= 0.1:
                label_indexes.append(index)
            index += 1

        print('label_indexes', label_indexes)

        # One row per scored bird: <file>_<bird>_<segment end in seconds>;
        # target is True when the bird's position is among label_indexes
        for b in scored_birds:
            segment_end = counter * 5
            row_id = f + '_' + b + '_' + str(segment_end)
            target = False
            for label_index in label_indexes:
                if scored_birds[label_index] == b:
                    target = True
            data.append([row_id, target])

        del label_indexes
        gc.collect()

        counter += 1
        samples_wrote += buffer

# Build the submission table and preview the rows of the first segment
# (21 birds)
submission_df = pd.DataFrame(data, columns=['row_id', 'target'])
submission_df.head(21)

label_indexes [5, 11, 14, 18, 19]
label_indexes [14, 19]
label_indexes [1, 7, 13, 14, 16, 19]
label_indexes [11, 13, 14, 19]
label_indexes [7, 18, 19]
label_indexes [1, 2, 7, 13, 16, 19]
label_indexes [2, 7, 13, 14, 19]
label_indexes [5, 7, 11, 14]
label_indexes [2, 7, 13, 16, 19]
label_indexes [5, 12, 14, 18, 19]
label_indexes [2, 12, 14, 19]
label_indexes [2, 7, 13, 14, 18, 19]


Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
5,soundscape_453028782_elepai_5,True
6,soundscape_453028782_ercfra_5,False
7,soundscape_453028782_hawama_5,False
8,soundscape_453028782_hawcre_5,False
9,soundscape_453028782_hawgoo_5,False


In [30]:
# Write the submission without the DataFrame index so the file contains only
# the row_id and target columns.
submission_df.to_csv(WORKING_PATH + 'submission.csv', index=False)