# Import

In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Misc
import os
import json
import joblib
import warnings
from ipywidgets import IntProgress
from IPython.display import display

# Data management
import numpy as np
import pandas as pd

# Sound treatments
import librosa
import soundfile as sf
from scipy import signal

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical

# Class weight
from sklearn.utils.class_weight import compute_class_weight

# Model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# TRILL
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
assert tf.executing_eagerly()
import tensorflow_hub as hub

# EfficientNetB0
from keras.applications.efficientnet import EfficientNetB0
from keras.applications.efficientnet import preprocess_input

# VGGish
from vggish import vggish_input
from vggish import vggish_params as params
import vggish_keras as vgk

# Meta model
from sklearn.ensemble import RandomForestClassifier

## Metrics
import tensorflow_addons as tfa
from tensorflow_addons.layers.netvlad import NetVLAD
from sklearn.metrics import f1_score

# Plot
import matplotlib.pyplot as plt

# Environment

In [3]:
# Silence library warnings to keep the notebook output readable
warnings.filterwarnings('ignore')

# Display every column when a dataframe is rendered
pd.set_option("display.max_columns", None)

# Display TensorFlow version
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU (the device list is queried once and reused)
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print('No GPU found. Please ensure you have installed TensorFlow correctly')
else:
    print('Default GPU Device: {}'.format(physical_devices))

    # Allow memory growth on every visible GPU: TensorFlow requires the
    # setting to be the same across devices, so configuring only the first
    # one breaks on multi-GPU machines.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

TensorFlow Version: 2.6.0
Default GPU Device: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
# Kaggle locations, kept commented out for reference
#DATA_PATH = '/kaggle/input/birdclef-2022/'
#WORKING_PATH = '/kaggle/working/'
#MODEL_PATH = '/kaggle/input/kernel-efficientnetb0-melspec/'

# Local locations used by this notebook
# DATA_PATH: competition inputs (train_metadata.csv, scored_birds.json)
DATA_PATH = './data/'
# WORKING_PATH: intermediate dataframes, saved weights and out-of-fold predictions
WORKING_PATH = './working/stacking/'
# MODEL_PATH: root under which the TRILL hub module ('trill/') is looked up
MODEL_PATH = './working/stacking/'

# Data generator

In [5]:
class DataGenerator_trill(Sequence):
    """Keras `Sequence` serving batches of cached waveforms for the TRILL model.

    Each batch is a pair `(X, y)`. `X` holds one fixed-length feature vector
    per sample, read from the module-level `data_mem` cache (keyed by the
    'filename' column). `y` holds the matching multi-hot targets produced by
    the module-level `mlb` binarizer from the 'target' column.

    Args:
        _X: DataFrame with at least the 'filename' and 'target' columns.
        batch_size: Number of samples per batch.
        n_channels, n_columns, n_rows: Stored for signature parity with the
            other generators; they do not shape the TRILL batches.
        shuffle: Whether the sample order is reshuffled after each epoch.
        n_samples: Length of every cached feature vector (presumably 5 s of
            audio at 16 kHz -- TODO confirm against the feature extractor).
    """

    def __init__(self,
                 _X,
                 batch_size=32,
                 n_channels=1,
                 n_columns=470,
                 n_rows=120,
                 shuffle=True,
                 n_samples=80000):
        self.batch_size = batch_size
        self.X = _X
        self.n_channels = n_channels
        self.n_columns = n_columns
        self.n_rows = n_rows
        self.shuffle = shuffle
        self.n_samples = n_samples
        self.img_indexes = np.arange(len(self.X))
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.img_indexes) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an `(X, y)` tuple."""
        # Positions of the samples belonging to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temps = [self.img_indexes[k] for k in indexes]

        return self.__data_generation(list_IDs_temps)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when `shuffle` is enabled."""
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temps):
        """Assemble features and multi-hot targets for the given row positions."""
        X = np.empty((self.batch_size, self.n_samples))
        # Size the target from the fitted binarizer instead of a literal 21 so
        # this generator stays consistent with the other two.
        y = np.empty((self.batch_size, len(mlb.classes_)), dtype=int)
        for i, ID in enumerate(list_IDs_temps):
            row = self.X.iloc[ID]
            # Cached features are assigned directly; no list round-trip needed
            X[i] = data_mem[row['filename']]
            y[i] = mlb.transform([row['target']])

        return X, y

In [6]:
class DataGenerator_EfficientNetB0(Sequence):
    """Keras `Sequence` serving batches of cached image-like features.

    Features come from the module-level `data_mem` dictionary (keyed by the
    'filename' column of the dataframe) and targets are the multi-hot encoding
    of the 'target' column produced by the module-level `mlb`.
    """

    def __init__(self,
                 _X,
                 batch_size=32,
                 n_channels=1,
                 n_columns=470,
                 n_rows=120,
                 shuffle=True):
        self.X = _X
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Shape of a single sample: (n_rows, n_columns, n_channels)
        self.n_rows = n_rows
        self.n_columns = n_columns
        self.n_channels = n_channels
        self.img_indexes = np.arange(len(self.X))
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        n_batches = np.floor(len(self.img_indexes) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Return batch number `index` as an `(X, y)` tuple."""
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        batch_ids = [self.img_indexes[pos] for pos in self.indexes[start:stop]]
        features, targets = self.__data_generation(batch_ids)
        return features, targets

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when `shuffle` is enabled."""
        order = np.arange(len(self.X))
        if self.shuffle == True:
            np.random.shuffle(order)
        self.indexes = order

    def __data_generation(self, list_IDs_temps):
        """Assemble features and multi-hot targets for the given row positions."""
        batch_shape = (self.batch_size, self.n_rows, self.n_columns, self.n_channels)
        features = np.empty(batch_shape)
        targets = np.empty((self.batch_size, len(mlb.classes_)), dtype=int)

        for slot, row_pos in enumerate(list_IDs_temps):
            file_path = self.X.iloc[row_pos]['filename']
            label = self.X.iloc[row_pos]['target']
            # Features are looked up in the in-memory cache, not recomputed
            features[slot] = data_mem[file_path]
            targets[slot] = mlb.transform([label])

        return features, targets

In [7]:
class DataGenerator_VGGish(Sequence):
    """Keras `Sequence` serving batches of cached VGGish input examples.

    Features come from the module-level `data_mem` dictionary (keyed by the
    'filename' column of the dataframe) and targets are the multi-hot encoding
    of the 'target' column produced by the module-level `mlb`.

    Args:
        _X: DataFrame with at least the 'filename' and 'target' columns.
        batch_size: Number of samples per batch.
        n_channels, n_columns, n_rows: Shape of one sample, laid out as
            (n_rows, n_columns, n_channels).
        shuffle: Whether the sample order is reshuffled after each epoch.
    """

    def __init__(self,
                 _X,
                 batch_size=32,
                 n_channels=1,
                 n_columns=470,
                 n_rows=120,
                 shuffle=True):
        self.batch_size = batch_size
        self.X = _X
        self.n_channels = n_channels
        self.n_columns = n_columns
        self.n_rows = n_rows
        self.shuffle = shuffle
        self.img_indexes = np.arange(len(self.X))
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.img_indexes) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an `(X, y)` tuple."""
        # Positions of the samples belonging to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temps = [self.img_indexes[k] for k in indexes]

        return self.__data_generation(list_IDs_temps)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when `shuffle` is enabled."""
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temps):
        """Assemble features and multi-hot targets for the given row positions."""
        X = np.empty((self.batch_size, self.n_rows, self.n_columns, self.n_channels))
        y = np.empty((self.batch_size, len(mlb.classes_)), dtype=int)
        for i, ID in enumerate(list_IDs_temps):
            row = self.X.iloc[ID]
            # Assign the cached array directly: the former tolist()/np.array()
            # round-trip copied every value through Python objects for no gain.
            X[i] = data_mem[row['filename']]
            y[i] = mlb.transform([row['target']])

        # X is allocated with its final 4-D shape, so no reshape is needed
        return X, y

In [8]:
data_mem = {}

def LoadRAM():
    """Extract the features of every file in `data_df` and return them as a dict.

    The module-level `data_mem` cache is emptied in place first. Each path in
    `data_df['filename']` is then loaded with librosa and passed through the
    `extractFeatures` function that is defined when this runs. An ipywidgets
    progress bar tracks the number of files processed.

    Returns:
        dict mapping each filename to its extracted features.
    """
    # Drop the previously cached features before building the new set
    data_mem.clear()

    # One progress-bar tick per file
    progress = IntProgress(min=0, max=data_df.shape[0])
    display(progress)

    features_by_file = {}
    for file_path in data_df['filename']:
        progress.value += 1

        # Decode the audio, then run the current feature extractor on it
        audio, sr = librosa.load(file_path)
        features_by_file[file_path] = extractFeatures(audio, sr)

    return features_by_file

# Data load

In [9]:
# Load meta data
train_meta = pd.read_csv(DATA_PATH + 'train_metadata.csv')

# Load scored birds
with open(DATA_PATH + 'scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)

# Keep only the scored species, in order of first appearance in the metadata
is_scored = train_meta['primary_label'].isin(scored_birds)
labels = train_meta.loc[is_scored, 'primary_label'].unique().tolist()
labels

['akiapo',
 'aniani',
 'apapan',
 'barpet',
 'crehon',
 'elepai',
 'ercfra',
 'hawama',
 'hawcre',
 'hawgoo',
 'hawhaw',
 'hawpet1',
 'houfin',
 'iiwi',
 'jabwar',
 'maupar',
 'omao',
 'puaioh',
 'skylar',
 'warwhe1',
 'yefcan']

In [10]:
# Load the dataframe listing the audio clips to process (one row per clip)
data_df = pd.read_csv(WORKING_PATH + 'data_5_df.csv')

In [11]:
def to_list(df):
    """Build the multi-label target of one row.

    The primary label always comes first. Secondary labels are parsed from the
    string form of a Python list and kept only when they belong to the
    module-level `labels` list and are not already present.

    Args:
        df: A row exposing 'primary_label' and 'secondary_labels' by key.

    Returns:
        tuple of label names, primary label first.
    """
    targets = [df['primary_label']]
    raw = df['secondary_labels']

    if raw != '[]':
        # Strip list punctuation and spaces so that only "a,b,c" remains
        for char in ('[', ']', "'", ' '):
            raw = raw.replace(char, '')

        for candidate in raw.split(','):
            if candidate in labels and candidate not in targets:
                targets.append(candidate)

    return tuple(targets)

In [12]:
# Create target: one tuple of labels per row (primary label first), then
# persist the dataframe so the tuples survive a reload as Python objects
data_df['target'] = data_df.apply(to_list, axis=1)
data_df.to_pickle(WORKING_PATH + 'data.pkl')

In [13]:
# Reload the dataframe written by the previous cell and display it
data_df = pd.read_pickle(WORKING_PATH + 'data.pkl')
data_df

Unnamed: 0,primary_label,secondary_labels,original_filename,filename,target
0,akiapo,"['apapan', 'hawama', 'iiwi']",akiapo/XC122399.ogg,./working/step3/each5s/split_1_akiapo_XC122399...,"(akiapo, apapan, hawama, iiwi)"
1,akiapo,"['apapan', 'hawama', 'iiwi']",akiapo/XC122399.ogg,./working/step3/each5s/split_2_akiapo_XC122399...,"(akiapo, apapan, hawama, iiwi)"
2,akiapo,"['apapan', 'hawama', 'iiwi']",akiapo/XC122399.ogg,./working/step3/each5s/split_3_akiapo_XC122399...,"(akiapo, apapan, hawama, iiwi)"
3,akiapo,"['apapan', 'hawama', 'iiwi']",akiapo/XC122399.ogg,./working/step3/each5s/split_4_akiapo_XC122399...,"(akiapo, apapan, hawama, iiwi)"
4,akiapo,"['apapan', 'hawama', 'iiwi']",akiapo/XC122399.ogg,./working/step3/each5s/split_5_akiapo_XC122399...,"(akiapo, apapan, hawama, iiwi)"
...,...,...,...,...,...
14075,yefcan,[],yefcan/XC667142.ogg,./working/step3/each5s/split_3_yefcan_XC667142...,"(yefcan,)"
14076,yefcan,[],yefcan/XC667142.ogg,./working/step3/each5s/split_4_yefcan_XC667142...,"(yefcan,)"
14077,yefcan,[],yefcan/XC667142.ogg,./working/step3/each5s/split_5_yefcan_XC667142...,"(yefcan,)"
14078,yefcan,[],yefcan/XC667142.ogg,./working/step3/each5s/split_6_yefcan_XC667142...,"(yefcan,)"


In [14]:
# Fit a MultiLabelBinarizer on the label tuples; the generators and
# class_weight() read this module-level `mlb` to encode targets
mlb = MultiLabelBinarizer()
mlb.fit(data_df['target'].values.tolist())

MultiLabelBinarizer()

In [15]:
# Classes learned by the binarizer; their order defines the target columns
mlb.classes_

array(['akiapo', 'aniani', 'apapan', 'barpet', 'crehon', 'elepai',
       'ercfra', 'hawama', 'hawcre', 'hawgoo', 'hawhaw', 'hawpet1',
       'houfin', 'iiwi', 'jabwar', 'maupar', 'omao', 'puaioh', 'skylar',
       'warwhe1', 'yefcan'], dtype=object)

# Class weight management function

In [16]:
def class_weight(generator, mu=0.15):
    """Compute log-smoothed class weights from a generator's dataframe.

    For each class of the module-level `mlb`, counts the rows of
    `generator.X` whose 'target' contains that class, then assigns
    `max(1, log(0.85 * total / count))`, where `total` is the sum of all
    per-class counts.

    Args:
        generator: Object exposing a dataframe `X` with a 'target' column.
        mu: Accepted for backward compatibility.
            NOTE(review): this value is not used -- the smoothing factor is
            fixed at 0.85. Left unchanged so existing callers keep getting
            the same weights; confirm whether `mu` was meant to replace it.

    Returns:
        dict mapping the class position in `mlb.classes_` to its weight.
        Classes with no sample get the neutral weight 1.
    """
    targets = generator.X['target']

    # One pass over the target column per class (no per-row Series creation)
    counts = {
        class_id: sum(1 for target in targets if name in target)
        for class_id, name in enumerate(mlb.classes_)
    }
    total = sum(counts.values())

    weights = {}
    for class_id in sorted(counts):
        count = counts[class_id]
        if count == 0:
            # A class absent from this fold would divide by zero below
            weights[class_id] = 1
            continue
        score = np.log(0.85 * total / float(count))
        weights[class_id] = score if score > 1 else 1

    return weights

# Ensemble

In [17]:
# Callbacks shared by every training run below

# Stop once val_loss has not improved for 5 epochs and restore the best weights
es_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.8 (never below 1e-5) when val_loss plateaus
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.8,
    patience=1,
    cooldown=1,
    min_delta=0.0001,
    min_lr=0.00001,
    verbose=1,
)

In [18]:
# Split: hold out 20% of the rows. Only the dataframe halves are kept; the
# split copies of data_df['target'] are discarded into `_`.
X_train, X_valid, _, _ = train_test_split(
    data_df, data_df['target'], test_size=0.2, random_state=42)

## Trill-distilled/3

### Preprocessing

In [19]:
# Sound noise reduction
def f_high(y, sr, cutoff=1000, order=10):
    """Apply a Butterworth high-pass filter to a waveform.

    Args:
        y: Input signal.
        sr: Sampling rate of `y` in Hz.
        cutoff: Cut-off frequency in Hz (default 1000).
        order: Filter order (default 10).

    Returns:
        The filtered signal, as returned by `scipy.signal.lfilter`.
    """
    # butter() expects the cut-off as a fraction of the Nyquist frequency
    nyquist = sr / 2
    b, a = signal.butter(order, cutoff / nyquist, btype='highpass')
    return signal.lfilter(b, a, y)

In [20]:
def extractFeatures(y, sr):
    """Prepare a waveform for TRILL: high-pass filter, then resample to 16 kHz.

    Args:
        y: Input waveform.
        sr: Sampling rate of `y` in Hz.

    Returns:
        The filtered waveform resampled to 16000 Hz.
    """
    # Sound noise reduction
    y = f_high(y, sr)
    # Resample. The rates are passed by keyword because recent librosa
    # releases no longer accept them positionally.
    y = librosa.resample(y, orig_sr=sr, target_sr=16000)

    return y

In [21]:
# Generator settings for the TRILL runs.
# NOTE: this rebinds `params`, which the import cell bound to `vggish_params`.
# DataGenerator_trill stores n_rows / n_columns / n_channels but does not use
# them to shape its batches.
params = dict(
    batch_size=32,
    n_rows=224,
    n_columns=216,
    n_channels=3,
)
# Training and validation settings are identical here (no shuffling)
params_train = dict(
    shuffle=False,
    **params
)
params_valid = dict(
    shuffle=False,
    **params
)

In [22]:
# Load data in RAM to speed up training process
# (features come from the `extractFeatures` defined just above)
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

### Model

In [19]:
def get_keras_model(num_classes, input_length, use_batchnorm=True, l2=1e-5,
                    num_clusters=None, trainable=True, pooling='avg', hidden=0):
    """Build a Sequential classifier on top of the TRILL embedding layer.

    Args:
        num_classes: Number of sigmoid output units.
        input_length: Length of the 1-D input vector.
        use_batchnorm: Add batch normalization after NetVLAD (NetVLAD path only).
        l2: L2 regularization factor applied to the output layer kernel.
        num_clusters: When positive, aggregate the embeddings with NetVLAD;
            otherwise `pooling` decides how they are reduced.
        trainable: Whether the TRILL hub layer is trainable.
        pooling: 'avg' uses GlobalAveragePooling1D; any other value uses a
            Lambda layer averaging over axis 1.
        hidden: Size of an optional ReLU hidden layer (0 disables it).

    Returns:
        The uncompiled Keras model.
    """
    keras_layers = tf.keras.layers

    net = tf.keras.models.Sequential()
    net.add(tf.keras.Input((input_length,)))

    # 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3'
    net.add(hub.KerasLayer(
        handle=MODEL_PATH + 'trill/',
        trainable=trainable,
        arguments={'sample_rate': int(16000)},
        output_key='embedding',
        output_shape=[None, 2048]
    ))

    # Reduce the embedding sequence to a single vector
    if num_clusters and num_clusters > 0:
        net.add(NetVLAD(num_clusters=num_clusters))
        if use_batchnorm:
            net.add(keras_layers.BatchNormalization())
    elif pooling == 'avg':
        net.add(keras_layers.GlobalAveragePooling1D())
    else:
        net.add(keras_layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)))

    # Optional hidden layer
    if hidden != 0:
        net.add(keras_layers.Dense(hidden, activation='relu'))

    # Output layer: one independent sigmoid per class
    output_regularizer = tf.keras.regularizers.l2(l=l2)
    net.add(keras_layers.Dense(
        num_classes,
        activation='sigmoid',
        kernel_regularizer=output_regularizer))

    return net

In [20]:
def create_cnn(num_clusters, use_batchnorm, pooling, hidden, fine_tune_at, model_path):
    """Create the TRILL classifier, either frozen or ready for fine-tuning.

    Args:
        num_clusters, use_batchnorm, pooling, hidden: Forwarded to
            `get_keras_model`.
        fine_tune_at: When None, the TRILL layer is frozen and no weights are
            loaded. Any other value makes the TRILL layer trainable and loads
            the weights stored at `model_path`.
        model_path: Weights file, read only when `fine_tune_at` is not None.

    Returns:
        The uncompiled Keras model (its summary is printed as a side effect).
    """
    from_scratch = fine_tune_at is None
    print('fine_tune_at == None' if from_scratch else 'model.load_weights')

    # Both modes build the same architecture; only the trainability of the
    # TRILL layer differs.
    model = get_keras_model(len(labels),
                            80000,
                            use_batchnorm=use_batchnorm,
                            l2=1e-5,
                            num_clusters=num_clusters,
                            trainable=not from_scratch,
                            pooling=pooling,
                            hidden=hidden
                           )

    if not from_scratch:
        # Load existing weights
        model.load_weights(model_path)

    print('')
    model.summary()
    print('')

    return model

In [26]:
# Collect out-of-sample predictions: every row of X_train is predicted by the
# fold model that did not see it during training.
trill_yhat = {}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_ix, test_ix in kfold.split(X_train):
    # KFold yields positional indices; select the matching rows
    train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]
    
    # Instantiate data generators (the held-out fold is also the validation data)
    train_generator = DataGenerator_trill(train_X, **params_train)
    test_generator = DataGenerator_trill(test_X, **params_train)
    
    # Class weights
    # NOTE(review): class_weight() does not read `mu`, so 0.675 has no effect here.
    print('Class weights...')
    class_weights = class_weight(generator=train_generator, mu=0.675)
    
    # Create TRILL model (frozen TRILL layer, NetVLAD aggregation, 256-unit hidden layer)
    print('Create model...')
    Trill = create_cnn(num_clusters=8, use_batchnorm=True,
                       pooling=None, hidden=256,
                       fine_tune_at=None, model_path=None)
    Trill.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])
    
    print('fit...')
    Trill.fit(
        train_generator,
        validation_data=test_generator,
        epochs=50,
        callbacks=[es_callback, reduce_lr],
        verbose=1,
        class_weight=class_weights
    )

    # Predict & store: one sample at a time, keyed by the dataframe index label
    print('predict_on_batch...')
    for index, row in test_generator.X.iterrows():
        pred = Trill.predict_on_batch(data_mem[row['filename']].reshape(1, -1))
        trill_yhat[index] = pred[0]

Class weights...
Create model...
fine_tune_at == None

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, None, 2048)        51964864  
_________________________________________________________________
net_vlad_1 (NetVLAD)         (None, 16384)             32776     
_________________________________________________________________
batch_normalization_1 (Batch (None, 16384)             65536     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               4194560   
_________________________________________________________________
dense_3 (Dense)              (None, 21)                5397      
Total params: 56,263,133
Trainable params: 4,265,501
Non-trainable params: 51,997,632
_________________________________________________________________

fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoc

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 6/50

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 7/50

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 8/50
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 13/50
Restoring model weights from the end of the best epoch.

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 00013: early stopping
predict_on_batch...
Class weights...
Create model...
fine_tune_at == None

Model: "sequential_4"
_____________________

Epoch 2/50
Epoch 3/50
Epoch 4/50

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 5/50
Epoch 6/50
Epoch 7/50

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 13/50

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 14/50
Restoring model weights from the end of the best epoch.

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 00014: early stopping
predict_on_batch...


### Save model

In [29]:
# `Trill` is rebound on every fold of the loop above, so this stores the
# weights of the last fold's model only
Trill.save_weights(WORKING_PATH + 'trill.h5')

In [69]:
# Persist the out-of-fold TRILL predictions (dict: dataframe index -> probabilities)
joblib.dump(trill_yhat, WORKING_PATH + 'trill_yhat.jl')

['./working/stacking/trill_yhat.jl']

## EfficientNetB0

### Preprocessing

In [24]:
class conf:
    """Constants for mel-spectrogram extraction and the EfficientNetB0 input."""

    # Model input geometry
    num_channels = 3
    num_columns = 224
    num_rows = 224

    # Mel-spectrogram settings
    sampling_rate = 44100
    fmin, fmax = 20, 16000
    hop_length = 494
    n_mels = 224
    # FFT window length is tied to the number of mel bands
    n_fft = 10 * n_mels

In [20]:
def audio_to_melspectrogram(audio):
    """Convert a waveform to a float32 mel spectrogram in decibels.

    All spectrogram settings are read from `conf`.
    NOTE(review): the filterbank uses `conf.sampling_rate` regardless of the
    rate the audio was actually loaded at -- confirm the two agree.

    Args:
        audio: 1-D waveform.

    Returns:
        float32 array holding the dB-scaled mel spectrogram.
    """
    # The waveform is passed by keyword because recent librosa releases no
    # longer accept it positionally.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

In [21]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array to a 3 channel one in [0, 255]
    Arguments:
        X {numpy array [H x W]} -- 2D array to convert
    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None, float or np array} -- Mean for normalization; computed
            from the data when None (default: {None})
        std {None, float or np array} -- Std for normalization; computed
            from the data when None (default: {None})
    Returns:
        numpy array [H x W x 3] -- uint8 array, channels on the last axis
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Test against None explicitly: `mean or X.mean()` silently
    # discarded a mean of 0 and raised on array-valued statistics.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Values already lie within [_min, _max], so no clipping is needed
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Near-)constant input: return an all-zero image
        V = np.zeros_like(X, dtype=np.uint8)

    return V

In [22]:
def extractFeatures(y, sr):
    """Turn a waveform into a single-sample EfficientNet input batch.

    Args:
        y: Input waveform.
        sr: Sampling rate of `y`; accepted for signature parity with the other
            extractors and not used here.

    Returns:
        Array of shape (1, conf.num_rows, conf.num_columns, conf.num_channels).
    """
    # Extract features (mono_to_color already returns uint8, no extra cast needed)
    feat = audio_to_melspectrogram(y)
    feat = mono_to_color(feat)

    # EfficientNet preprocess
    feat = preprocess_input(feat)

    # Copy the image straight into the batch array; the former
    # tolist()/np.array() round-trip rebuilt every pixel as a Python object.
    X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
    X[0] = feat

    return X

In [23]:
# Generator settings for the EfficientNetB0 runs: the sample shape follows the
# model input geometry declared in `conf`.
params = dict(
    batch_size=16,
    n_rows=conf.num_rows,
    n_columns=conf.num_columns,
    n_channels=conf.num_channels,
)
# Training and validation settings are identical here (no shuffling)
params_train = dict(
    shuffle=False,
    **params
)
params_valid = dict(
    shuffle=False,
    **params
)

In [24]:
# Load data in RAM to speed up training process
# (LoadRAM() also clears the cache itself, so the explicit clear() is redundant but harmless)
data_mem.clear()
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

### Model

In [22]:
def create_cnn(fine_tune_at=None,
               model_path=None
               ):
    """Create the EfficientNetB0 classifier.

    Args:
        fine_tune_at: When None, the ImageNet backbone stays frozen and only
            the dense head is trainable. Otherwise the weights at
            `model_path` are loaded, the whole model is unfrozen and the
            layers before position `fine_tune_at` are frozen again.
        model_path: Weights file, read only when `fine_tune_at` is not None.

    Returns:
        The uncompiled Keras model.
    """
    # Instantiate model. The import is local because the training loop of
    # this notebook rebinds the module-level name `EfficientNetB0` to a model
    # instance.
    from keras.applications.efficientnet import EfficientNetB0
    base_model = EfficientNetB0(include_top=False, input_shape=(
        conf.num_rows, conf.num_columns, conf.num_channels), weights='imagenet', pooling='avg')
    # Hidden neurons' number (input + output neurons) * 2/3 - 21
    dense = Dense(142, activation='relu')(
        base_model.output)
    outputs = Dense(len(mlb.classes_), activation='sigmoid')(dense)

    base_model.trainable = False

    # The same graph is used in both modes
    model = Model(inputs=base_model.input, outputs=outputs)

    if fine_tune_at is not None:
        # Load existing weights
        model.load_weights(model_path)

        # Unfreeze model layers
        model.trainable = True

        # Freeze all the layers before the `fine_tune_at` layer
        for layer in model.layers[:fine_tune_at]:
            layer.trainable = False

    return model

In [27]:
# Collect out-of-sample predictions: every row of X_train is predicted by the
# fold model that did not see it during training.
EfficientNetB0_yhat = {}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_ix, test_ix in kfold.split(X_train):
    # KFold yields positional indices; select the matching rows
    train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]

    # Instantiate data generators (the held-out fold is also the validation data)
    train_generator = DataGenerator_EfficientNetB0(train_X, **params_train)
    test_generator = DataGenerator_EfficientNetB0(test_X, **params_train)
    
    # Class weights
    # NOTE(review): class_weight() does not read `mu`, so 0.675 has no effect here.
    print('Class weights...')
    class_weights = class_weight(generator=train_generator, mu=0.675)

    # Create EfficientNetB0 model
    # NOTE: this rebinds the module-level name `EfficientNetB0` (imported as
    # the Keras application class) to a model instance.
    print('Create model...')
    EfficientNetB0 = create_cnn()
    EfficientNetB0.compile(optimizer=tf.keras.optimizers.Adam(),
                     loss='binary_crossentropy',
                     metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])

    print('fit...')
    EfficientNetB0.fit(
        train_generator,
        validation_data=test_generator,
        epochs=50,
        callbacks=[es_callback, reduce_lr],
        verbose=1,
        class_weight=class_weights
    )

    # Predict & store
    # Instantiate the progress bar (one tick per held-out row)
    max_count = test_generator.X.shape[0]
    f = IntProgress(min=0, max=max_count)
    # Display the progress bar
    display(f)

    print('predict_on_batch...')
    for index, row in test_generator.X.iterrows():
        # Increment the progress bar
        f.value += 1
        # Format data as a batch holding a single sample
        X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
        X[0] = np.array(data_mem[row['filename']])
        # Predict
        pred = EfficientNetB0.predict_on_batch(X)
        # Store prediction, keyed by the dataframe index label
        EfficientNetB0_yhat[index] = pred[0]

Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 10/50
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 18/50
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 20/50
Epoch 21/50

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 22/50
Epoch 23/50

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 24/50

Epoch 00024: ReduceLROnPlatea

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 21/50

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 22/50

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 23/50
Epoch 24/50

Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 25/5

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 11/50
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 21/50

Epoch 00021: ReduceLROnPlateau reducing learn

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 15/50
Epoch 16/50
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 21/50
Epoch 22/50

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 23/50

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 24/50
Epoch 25/5

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 7/50
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 9/50
Epoch 10/50
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 16/50
Epoch 17/50
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 20/50
Epoch 21/50
Epoch 22/50

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 23/50

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 24/50

Epoch 000

IntProgress(value=0, max=2252)

predict_on_batch...


### Save model

In [28]:
# `EfficientNetB0` is rebound on every fold of the loop above, so this stores
# the weights of the last fold's model only
EfficientNetB0.save_weights(WORKING_PATH + 'EfficientNetB0.h5')

In [29]:
# Persist the out-of-fold EfficientNetB0 predictions (dict: dataframe index -> probabilities)
joblib.dump(EfficientNetB0_yhat, WORKING_PATH + 'EfficientNetB0_yhat.jl')

['./working/stacking/EfficientNetB0_yhat.jl']

## VGGish

### Preprocessing

In [19]:
# Sound noise reduction
def f_high(y, sr, cutoff_hz=2000, order=10):
    """Apply a Butterworth high-pass filter to a signal.

    Parameters
    ----------
    y : array-like
        Signal to filter.
    sr : int or float
        Sampling rate of ``y`` in Hz; ``sr / 2`` is used as the Nyquist frequency.
    cutoff_hz : float, default 2000
        Cut-off frequency in Hz. It is normalised by the Nyquist frequency before
        being handed to ``scipy.signal.butter``.
    order : int, default 10
        Order of the Butterworth filter.

    Returns
    -------
    numpy.ndarray
        Output of ``scipy.signal.lfilter`` applied to ``y``.
    """
    nyquist = sr / 2
    # NOTE(review): the (b, a) form is kept so that features stay identical to the ones the
    # saved models were trained on; SciPy recommends the 'sos' form for high-order filters.
    b, a = signal.butter(order, cutoff_hz / nyquist, btype='highpass')
    return signal.lfilter(b, a, y)

In [20]:
def extractFeatures(y, sr):
    """Return the VGGish input examples of a waveform.

    The signal is first high-pass filtered with ``f_high`` and the result is
    converted with ``vggish_input.waveform_to_examples``.
    """
    filtered = f_high(y, sr)
    return vggish_input.waveform_to_examples(filtered, sr)

In [21]:
# Generator settings shared by the training and validation generators.
# NOTE(review): `params` also shadows the `vggish_params as params` import made at the top
# of the notebook — confirm nothing relies on that alias after this cell.
params = {
    'batch_size': 32,
    'n_rows': 5,
    'n_columns': 96,
    'n_channels': 64,
}
# Same settings, with and without shuffling
params_train = {'shuffle': True, **params}
params_valid = {'shuffle': False, **params}

In [22]:
# Load data in RAM to speed up training process
# The previous cache is emptied first, then rebuilt by LoadRAM()
# (presumably LoadRAM uses the extractFeatures currently in scope — confirm).
data_mem.clear()
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

### Model

In [26]:
def create_cnn(fine_tune_at=None,
               model_path=None
               ):
    """Build the VGGish-based multi-label classifier.

    The VGGish embedding model is topped with a 128-unit ReLU layer and a sigmoid
    output layer holding one unit per class of ``mlb``.

    Parameters
    ----------
    fine_tune_at : int or None
        When None, a new model with all layers trainable is returned.
        Otherwise weights are loaded from ``model_path`` and every layer before
        index ``fine_tune_at`` is frozen.
    model_path : str or None
        Weights file to load; required when ``fine_tune_at`` is given.

    Returns
    -------
    tensorflow.keras.Model
        The (uncompiled) model.

    Raises
    ------
    ValueError
        If ``fine_tune_at`` is given without ``model_path``.
    """
    if fine_tune_at is not None and model_path is None:
        raise ValueError('model_path must be provided when fine_tune_at is set')

    # Instantiate model
    base_model, _, _ = vgk.get_embedding_model(hop_duration=0.25)
    dense = Dense(128, activation='relu')(base_model.output)
    outputs = Dense(len(mlb.classes_), activation='sigmoid')(dense)

    base_model.trainable = True

    # The architecture is the same whether or not fine-tuning is requested
    model = Model(inputs=base_model.input, outputs=outputs)

    if fine_tune_at is not None:
        # Load existing weights
        model.load_weights(model_path)

        # Unfreeze model layers
        model.trainable = True

        # Freeze all the layers before the `fine_tune_at` layer
        for layer in model.layers[:fine_tune_at]:
            layer.trainable = False

    return model

In [24]:
# collect out of sample predictions
# VGGish_yhat maps the X_train index of each sample to the prediction made by the
# model of the fold in which that sample was held out.
VGGish_yhat = {}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_ix, test_ix in kfold.split(X_train):
    # get data
    train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]

    # Instantiate data generators: the held-out fold uses the validation settings
    # (shuffle=False) instead of the training ones
    train_generator = DataGenerator_VGGish(train_X, **params_train)
    test_generator = DataGenerator_VGGish(test_X, **params_valid)

    # Class weights
    print('Class weights...')
    class_weights = class_weight(generator=train_generator, mu=0.675)

    # Create VGGish model (a fresh model for every fold)
    print('Create model...')
    VGGish = create_cnn()
    VGGish.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])

    print('fit...')
    VGGish.fit(
        train_generator,
        validation_data=test_generator,
        epochs=50,
        callbacks=[es_callback, reduce_lr],
        verbose=1,
        class_weight=class_weights
    )

    # Predict & store
    # Instantiate the progress bar
    max_count = test_generator.X.shape[0]
    f = IntProgress(min=0, max=max_count)
    # Display the progress bar
    display(f)

    print('predict_on_batch...')
    for index, row in test_generator.X.iterrows():
        # Increment the progress bar
        f.value += 1
        # Format data: the cached (5, 96, 64) features are stacked into one (480, 64, 1) input
        X = np.empty((1, 5, 96, 64))
        X[0] = np.array(data_mem[row['filename']])
        X = X.reshape(1, 480, 64, 1)
        # Predict
        pred = VGGish.predict_on_batch(X)
        # Store prediction (batch of one -> first row)
        VGGish_yhat[index] = pred[0]

Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 4/50
Epoch 5/50

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 6/50
Epoch 7/50

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 8/50
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 11/50
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 16/50
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00013421773910522462.
Epoch 19/50

Epoch 0

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 7/50
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 18/50
Restoring model weights from the end of the best epoch.

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 00018: early stoppin

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 8/50
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 10/50
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 12/50
Epoch 13/50

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 15/50
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0001342177

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 10/50
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 19/50
Restoring model weights from the end of the best epoch.

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 00019: e

IntProgress(value=0, max=2253)

predict_on_batch...
Class weights...
Create model...
fit...
Epoch 1/50
Epoch 2/50

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 18/50
Restoring model weights from the end of the best epoch.

Epoch 00018: ReduceLROnPl

IntProgress(value=0, max=2252)

predict_on_batch...


### Save model

In [25]:
# Persist the VGGish weights. `VGGish` is rebound on every iteration of the K-fold loop,
# so the weights saved here are those of the model trained in the last fold.
VGGish.save_weights(WORKING_PATH + 'VGGish.h5')

In [26]:
# Save the out-of-fold VGGish predictions (dict: X_train index -> prediction vector)
joblib.dump(VGGish_yhat, WORKING_PATH + 'VGGish_yhat.jl')

['./working/stacking/VGGish_yhat.jl']

## Meta model

In [38]:
# create a meta dataset
def create_meta_dataset(data_x, yhat1, yhat2, yhat3):
    """Assemble the features and targets used by the meta model.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Samples metadata; must contain the columns 'primary_label',
        'secondary_labels', 'original_filename', 'filename' and 'target'.
    yhat1, yhat2, yhat3 : dict
        Predictions of the three sub-models, keyed by the index of ``data_x``.
        Their columns are prefixed 'tr', 'en' and 'vg' respectively and numbered
        from 1.

    Returns
    -------
    X : pandas.DataFrame
        ``data_x`` joined with the three prediction frames, without the metadata
        and target columns listed above.
    y : array
        ``mlb.transform`` of the 'target' column.
    """
    def _to_frame(yhat, prefix):
        # One column per predicted score; the width is read from the predictions
        # themselves and falls back to the number of classes when there are none.
        n_outputs = len(next(iter(yhat.values()))) if yhat else len(mlb.classes_)
        columns = ['{}{}'.format(prefix, i) for i in range(1, n_outputs + 1)]
        return pd.DataFrame.from_dict(yhat, orient='index', columns=columns)

    # convert to dataframes
    df_new1 = _to_frame(yhat1, 'tr')
    df_new2 = _to_frame(yhat2, 'en')
    df_new3 = _to_frame(yhat3, 'vg')

    # create a meta dataset (frames are aligned on their index)
    X = pd.concat([data_x, df_new1, df_new2, df_new3], axis=1, verify_integrity=True)
    y = mlb.transform(X['target'])

    X = X.drop(['primary_label', 'secondary_labels',
                'original_filename', 'filename', 'target'], axis=1)

    return X, y

In [28]:
# Reload yhats
# These files are the ones written by the joblib.dump cells of each sub-model section.
# NOTE: joblib.load unpickles the file content; only load files produced by this notebook.
trill_yhat = joblib.load(WORKING_PATH + 'trill_yhat.jl')
EfficientNetB0_yhat = joblib.load(WORKING_PATH + 'EfficientNetB0_yhat.jl')
VGGish_yhat = joblib.load(WORKING_PATH + 'VGGish_yhat.jl')

In [29]:
# construct meta dataset
# Training features of the meta model: X_train joined with the predictions of the three sub-models
meta_X_train, meta_y_train = create_meta_dataset(X_train, trill_yhat, EfficientNetB0_yhat, VGGish_yhat)

In [30]:
# construct meta classifier
# A fixed seed makes the random forest reproducible from one run to the next;
# 42 is the seed already used for the K-fold split.
meta_model = RandomForestClassifier(random_state=42)
meta_model.fit(meta_X_train, meta_y_train)

RandomForestClassifier()

In [31]:
# Save model
# The fitted meta classifier is reloaded from this file in the evaluation section.
joblib.dump(meta_model, WORKING_PATH + 'meta_model.jl')

['./working/stacking/meta_model.jl']

## Evaluate

### Sub models on hold out dataset

#### Trill

In [33]:
# Hold-out generator for the Trill model, built with the validation settings (shuffle=False).
# NOTE(review): `params_valid` must come from the Trill preprocessing cells — confirm they
# were the last ones executed before this cell.
valid_generator_trill = DataGenerator_trill(X_valid, **params_valid)

In [38]:
# Rebuild the Trill architecture, restore its saved weights and compile it for scoring.
# NOTE(review): `create_cnn` is redefined per model in this notebook (the VGGish version only
# accepts fine_tune_at/model_path), so the Trill definition must be the one in scope here.
Trill = create_cnn(num_clusters=8, use_batchnorm=True,
                   pooling=None, hidden=256,
                   fine_tune_at=None, model_path=None)
Trill.load_weights(WORKING_PATH + 'trill.h5')
Trill.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])

fine_tune_at == None

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, None, 2048)        51964864  
_________________________________________________________________
net_vlad (NetVLAD)           (None, 16384)             32776     
_________________________________________________________________
batch_normalization (BatchNo (None, 16384)             65536     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               4194560   
_________________________________________________________________
dense_3 (Dense)              (None, 21)                5397      
Total params: 56,263,133
Trainable params: 4,265,501
Non-trainable params: 51,997,632
_________________________________________________________________



In [27]:
# Score Trill on the hold-out generator; returns [loss, f1macro] as compiled above.
# Model.evaluate accepts generators and replaces the deprecated evaluate_generator.
pred_trill = Trill.evaluate(valid_generator_trill, verbose=0)
pred_trill

[0.034373924136161804, 0.5638612508773804]

#### EfficientNetB0

In [46]:
# Hold-out generator for the EfficientNetB0 model, built with the validation settings (shuffle=False).
# NOTE(review): `params_valid` must come from the EfficientNetB0 preprocessing cells — confirm
# they were the last ones executed before this cell.
valid_generator_EfficientNetB0 = DataGenerator_EfficientNetB0(X_valid, **params_valid)

In [48]:
# Rebuild the EfficientNetB0-based model, restore its saved weights and compile it for scoring.
# NOTE(review): binding the model to `EfficientNetB0` shadows the keras class of the same name
# imported at the top of the notebook, and the EfficientNetB0 version of `create_cnn` must be
# the one in scope (the VGGish section redefines it) — confirm before re-running.
EfficientNetB0 = create_cnn()
EfficientNetB0.load_weights(WORKING_PATH + 'EfficientNetB0.h5')
EfficientNetB0.compile(optimizer=tf.keras.optimizers.Adam(),
                       loss='binary_crossentropy',
                       metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])

In [28]:
# Score EfficientNetB0 on the hold-out generator; returns [loss, f1macro] as compiled above.
# Model.evaluate accepts generators and replaces the deprecated evaluate_generator.
pred_EfficientNetB0 = EfficientNetB0.evaluate(valid_generator_EfficientNetB0, verbose=0)
pred_EfficientNetB0

[0.0552985779941082, 0.4963090121746063]

#### VGGish

In [56]:
# Hold-out generator for the VGGish model, built with the validation settings (shuffle=False)
valid_generator_VGGish = DataGenerator_VGGish(X_valid, **params_valid)

In [57]:
# Rebuild the VGGish architecture, restore the weights saved after the K-fold loop
# and compile the model so that it can be scored on the hold-out set.
VGGish = create_cnn()
VGGish.load_weights(WORKING_PATH + 'VGGish.h5')
VGGish.compile(optimizer=tf.keras.optimizers.Adam(),
               loss='binary_crossentropy',
               metrics=[tfa.metrics.F1Score(name='f1macro', num_classes=len(labels), average='macro')])

In [26]:
# Score VGGish on the hold-out generator; returns [loss, f1macro] as compiled above.
# Model.evaluate accepts generators and replaces the deprecated evaluate_generator.
pred_VGGish = VGGish.evaluate(valid_generator_VGGish, verbose=0)
pred_VGGish

[0.09559944272041321, 0.2858201563358307]

### Meta model on hold out dataset

In [27]:
# Reload meta_model
# NOTE: joblib.load unpickles the file content; only load files produced by this notebook.
meta_model = joblib.load(WORKING_PATH + 'meta_model.jl')

#### Trill

In [28]:
# Re-execute first the cells corresponding to the preprocessing functions for this model
# Sound noise reduction
def f_high(y, sr, cutoff_hz=1000, order=10):
    """Apply a Butterworth high-pass filter to a signal (Trill preprocessing).

    Parameters
    ----------
    y : array-like
        Signal to filter.
    sr : int or float
        Sampling rate of ``y`` in Hz; ``sr / 2`` is used as the Nyquist frequency.
    cutoff_hz : float, default 1000
        Cut-off frequency in Hz, normalised by the Nyquist frequency before being
        handed to ``scipy.signal.butter``.
    order : int, default 10
        Order of the Butterworth filter.

    Returns
    -------
    numpy.ndarray
        Output of ``scipy.signal.lfilter`` applied to ``y``.
    """
    nyquist = sr / 2
    b, a = signal.butter(order, cutoff_hz / nyquist, btype='highpass')
    return signal.lfilter(b, a, y)

In [29]:
def extractFeatures(y, sr):
    """Prepare a waveform for the Trill model.

    The signal is high-pass filtered with ``f_high`` and then resampled from
    ``sr`` to 16000 Hz.

    Returns the resampled signal.
    """
    # Sound noise reduction
    y = f_high(y, sr)
    # Resample; the sample rates are passed by keyword because librosa >= 0.10
    # no longer accepts them positionally
    y = librosa.resample(y, orig_sr=sr, target_sr=16000)

    return y

In [30]:
# Generator settings shared by both generators of this section
params = {
    'batch_size': 32,
    'n_rows': 224,
    'n_columns': 216,
    'n_channels': 3,
}
# No shuffling in either case: the generators are only used for hold-out scoring here
params_train = {'shuffle': False, **params}
params_valid = {'shuffle': False, **params}

In [31]:
# Load data in RAM to speed up training process
# The previous cache is emptied first, then rebuilt by LoadRAM()
# (presumably with the Trill extractFeatures defined just above — confirm).
data_mem.clear()
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

In [39]:
# Trill predictions on the hold-out set, keyed by dataframe index
meta_pred_trill = {}

for index, filename in valid_generator_trill.X['filename'].items():
    # Each cached entry is flattened into a batch of one
    batch = data_mem[filename].reshape(1, -1)
    meta_pred_trill[index] = Trill.predict_on_batch(batch)[0]

#### EfficientNetB0

In [40]:
# Re-execute first the cells corresponding to the preprocessing functions for this model
class conf:
    """Constants of the EfficientNetB0 pipeline."""

    # --- Mel-spectrogram extraction ---
    sampling_rate = 44100
    n_mels = 224
    hop_length = 494
    n_fft = 10 * n_mels
    fmin = 20
    fmax = 16000

    # --- Shape of the model input ---
    num_rows = 224
    num_columns = 224
    num_channels = 3

In [41]:
def audio_to_melspectrogram(audio):
    """Compute the mel-spectrogram of a waveform, in decibels.

    The extraction settings (sampling rate, number of mel bands, hop length,
    FFT size and frequency range) are read from ``conf``.

    Returns a float32 array as produced by ``librosa.power_to_db``.
    """
    # The audio is passed by keyword: librosa >= 0.10 no longer accepts it positionally
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

In [42]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array to a 3 channel one in [0, 255]
    Arguments:
        X {numpy array [H x W]} -- 2D array to convert
    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None, float or np array} -- Mean for normalization; computed from
            the stacked array when None (default: {None})
        std {None, float or np array} -- Std for normalization; computed from
            the stacked array when None (default: {None})
    Returns:
        numpy array [H x W x 3] -- uint8 array, the channels being stacked on the last axis
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` is used rather than `or` so that array statistics do not
    # raise on truth-value evaluation and an explicit 0 is honoured.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Almost) constant input: there is no range to stretch
        V = np.zeros_like(X, dtype=np.uint8)

    return V

In [43]:
def extractFeatures(y, sr):
    """Turn a waveform into a single-sample EfficientNetB0 input batch.

    The mel-spectrogram of ``y`` is converted to a 3-channel uint8 image, passed
    through ``preprocess_input`` and stored in an array of shape
    (1, conf.num_rows, conf.num_columns, conf.num_channels).

    ``sr`` is not used here: the spectrogram is computed with ``conf.sampling_rate``.
    """
    # Extract features
    feat = audio_to_melspectrogram(y)
    feat = mono_to_color(feat)
    feat = feat.astype(np.uint8)

    # EfficientNet preprocess
    feat = preprocess_input(feat)

    # Copy straight into the batch array (no round-trip through a Python list)
    X = np.empty((1, conf.num_rows, conf.num_columns, conf.num_channels))
    X[0] = np.asarray(feat)

    return X

In [44]:
# Generator settings shared by both generators; the input shape comes from `conf`
params = {
    'batch_size': 16,
    'n_rows': conf.num_rows,
    'n_columns': conf.num_columns,
    'n_channels': conf.num_channels,
}
# No shuffling in either case: the generators are only used for hold-out scoring here
params_train = {'shuffle': False, **params}
params_valid = {'shuffle': False, **params}

In [45]:
# Load data in RAM to speed up training process
# The previous cache is emptied first, then rebuilt by LoadRAM()
# (presumably with the EfficientNetB0 extractFeatures defined just above — confirm).
data_mem.clear()
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

In [50]:
# EfficientNetB0 predictions on the hold-out set, keyed by dataframe index
meta_pred_EfficientNetB0 = {}

input_shape = (1, conf.num_rows, conf.num_columns, conf.num_channels)
for index, filename in valid_generator_EfficientNetB0.X['filename'].items():
    # Wrap the cached features in a batch of one
    X = np.empty(input_shape)
    X[0] = np.array(data_mem[filename])
    # Keep the single row of the batch prediction
    meta_pred_EfficientNetB0[index] = EfficientNetB0.predict_on_batch(X)[0]

#### VGGish

In [51]:
# Re-execute first the cells corresponding to the preprocessing functions for this model
# Sound noise reduction
def f_high(y, sr, cutoff_hz=2000, order=10):
    """Apply a Butterworth high-pass filter to a signal (VGGish preprocessing).

    Parameters
    ----------
    y : array-like
        Signal to filter.
    sr : int or float
        Sampling rate of ``y`` in Hz; ``sr / 2`` is used as the Nyquist frequency.
    cutoff_hz : float, default 2000
        Cut-off frequency in Hz, normalised by the Nyquist frequency before being
        handed to ``scipy.signal.butter``.
    order : int, default 10
        Order of the Butterworth filter.

    Returns
    -------
    numpy.ndarray
        Output of ``scipy.signal.lfilter`` applied to ``y``.
    """
    nyquist = sr / 2
    b, a = signal.butter(order, cutoff_hz / nyquist, btype='highpass')
    return signal.lfilter(b, a, y)

In [52]:
def extractFeatures(y, sr):
    """Return the VGGish input examples of a waveform.

    The signal is first high-pass filtered with ``f_high`` and the result is
    converted with ``vggish_input.waveform_to_examples``.
    """
    filtered = f_high(y, sr)
    return vggish_input.waveform_to_examples(filtered, sr)

In [53]:
# Generator settings shared by the training and validation generators
params = {
    'batch_size': 32,
    'n_rows': 5,
    'n_columns': 96,
    'n_channels': 64,
}
# Same settings, with and without shuffling
params_train = {'shuffle': True, **params}
params_valid = {'shuffle': False, **params}

In [54]:
# Load data in RAM to speed up training process
# The previous cache is emptied first, then rebuilt by LoadRAM()
# (presumably with the VGGish extractFeatures defined just above — confirm).
data_mem.clear()
data_mem = LoadRAM()

IntProgress(value=0, max=14080)

In [58]:
# VGGish predictions on the hold-out set, keyed by dataframe index
meta_pred_VGGish = {}

for index, filename in valid_generator_VGGish.X['filename'].items():
    # Wrap the cached (5, 96, 64) features in a batch of one ...
    X = np.empty((1, 5, 96, 64))
    X[0] = np.array(data_mem[filename])
    # ... and stack them into a single (480, 64, 1) input
    X = X.reshape(1, 480, 64, 1)
    # Keep the single row of the batch prediction
    meta_pred_VGGish[index] = VGGish.predict_on_batch(X)[0]

#### Save point

In [59]:
# Checkpoint the hold-out predictions of the three sub-models (one file per model)
# so that the meta-model evaluation can be redone without predicting again.
joblib.dump(meta_pred_trill, WORKING_PATH + 'meta_pred_trill.jl')
joblib.dump(meta_pred_EfficientNetB0, WORKING_PATH + 'meta_pred_EfficientNetB0.jl')
joblib.dump(meta_pred_VGGish, WORKING_PATH + 'meta_pred_VGGish.jl')

['./working/stacking/meta_pred_VGGish.jl']

#### Evaluation

In [62]:
# construct meta dataset
# Hold-out features of the meta model: X_valid joined with the hold-out predictions of the three sub-models
meta_X_valid, meta_y_valid = create_meta_dataset(X_valid, meta_pred_trill, meta_pred_EfficientNetB0, meta_pred_VGGish)

In [63]:
# Predictions of the meta model for the hold-out set
pred_meta = meta_model.predict(meta_X_valid)

In [64]:
# Macro-averaged F1 of the meta model on the hold-out set (to compare with the
# f1macro values reported for the sub-models above)
print(f1_score(meta_y_valid, pred_meta, average='macro'))

0.7392820767883205
