# Import

In [1]:
# Misc
import os
import json
import warnings

# Data management
import numpy as np
import pandas as pd

# Sound treatments
import librosa
import soundfile as sf
import resampy

# Model
import tensorflow as tf

# Metrics
import tensorflow_addons as tfa

# Environment

In [2]:
# Silence all warnings for the rest of the kernel session: an 'ignore' filter
# with no further criteria matches every warning category and module.
warnings.filterwarnings('ignore')

In [3]:
# Kaggle locations, kept for reference (uncomment to use them instead of the
# local paths below).
#DATA_PATH = '/kaggle/input/birdclef-2022/'
#WORKING_PATH = '/kaggle/working/'
#MODEL_PATH = '/kaggle/input/finalvggish/'

# Local locations. Every path ends with '/' because the other cells build file
# names by plain string concatenation (e.g. DATA_PATH + 'train_metadata.csv').
DATA_PATH = './data/'
WORKING_PATH = './working/kernel/'
MODEL_PATH = './working/kernel/'

# Data load

In [4]:
# Read the training metadata table
train_meta = pd.read_csv(DATA_PATH + 'train_metadata.csv')

# Read the list of species that count towards the score
with open(DATA_PATH + 'scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)

# Keep the scored species that occur as a primary label, in order of first
# appearance in the metadata
is_scored = train_meta['primary_label'].isin(scored_birds)
labels = list(train_meta.loc[is_scored, 'primary_label'].unique())
labels

['akiapo',
 'aniani',
 'apapan',
 'barpet',
 'crehon',
 'elepai',
 'ercfra',
 'hawama',
 'hawcre',
 'hawgoo',
 'hawhaw',
 'hawpet1',
 'houfin',
 'iiwi',
 'jabwar',
 'maupar',
 'omao',
 'puaioh',
 'skylar',
 'warwhe1',
 'yefcan']

# Model

In [5]:
def create_cnn():
    """Build the classifier: a frozen embedding model topped with a dense head.

    The embedding model comes from ``vgk.get_embedding_model(hop_duration=0.25)``.
    Its output feeds a 128-unit ReLU layer followed by a sigmoid layer with one
    unit per entry of the module-level ``labels`` list.

    Returns:
        tf.keras.models.Model: maps the embedding model's input to the sigmoid
        outputs. The embedding model is marked non-trainable.
    """

    # Instantiate model
    # NOTE(review): `vgk` is not imported in any visible cell, so calling this
    # function as-is raises NameError. Presumably it is the VGGish Keras
    # wrapper used to train the model — confirm and add the import.
    base_model, _, _ = vgk.get_embedding_model(hop_duration=0.25)
    dense = tf.keras.layers.Dense(128, activation='relu')(base_model.output)
    outputs = tf.keras.layers.Dense(len(labels), activation='sigmoid')(dense)

    # Freeze the embedding layers so only the two dense layers can be trained
    base_model.trainable = False
    model = tf.keras.models.Model(inputs=base_model.input, outputs=outputs)

    return model

# Preprocessing

In [6]:
def extractFeatures(y, sr):
    """Convert a waveform into the examples fed to the model.

    Thin wrapper that delegates to ``waveform_to_examples``.

    Args:
        y: audio samples as a NumPy array.
        sr: sample rate of ``y`` in Hz.

    Returns:
        Whatever ``waveform_to_examples(y, sr)`` returns.
    """
    return waveform_to_examples(y, sr)

In [7]:
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into non-overlapping log-mel examples.

    Args:
        data: NumPy array of audio samples. Input with more than one dimension
            is averaged over axis 1 (this assumes a (samples, channels)
            layout — TODO confirm against callers).
        sample_rate: sample rate of ``data`` in Hz.

    Returns:
        The result of ``frame`` applied to the log-mel spectrogram, using a
        window and hop of 96 spectrogram frames each.
    """
    target_rate = 16000
    frame_hop_secs = 0.010
    example_secs = 0.96

    # Down-mix when the input has more than one dimension.
    mono = np.mean(data, axis=1) if len(data.shape) > 1 else data
    # The feature extraction below is configured for 16 kHz audio.
    if sample_rate != target_rate:
        mono = resampy.resample(mono, sample_rate, target_rate)

    # 64-band log-mel spectrogram: 25 ms windows every 10 ms.
    log_mel = log_mel_spectrogram(
        mono,
        audio_sample_rate=target_rate,
        log_offset=0.01,
        window_length_secs=0.025,
        hop_length_secs=frame_hop_secs,
        num_mel_bins=64,
        lower_edge_hertz=125,
        upper_edge_hertz=7500)

    # Group spectrogram frames into examples; window == hop, so no overlap.
    features_sample_rate = 1.0 / frame_hop_secs
    frames_per_example = int(round(example_secs * features_sample_rate))
    return frame(
        log_mel,
        window_length=frames_per_example,
        hop_length=frames_per_example)

In [8]:
def log_mel_spectrogram(data,
                        audio_sample_rate=8000,
                        log_offset=0.0,
                        window_length_secs=0.025,
                        hop_length_secs=0.010,
                        **kwargs):
    """Compute the log of the mel-scaled magnitude spectrogram of a signal.

    Args:
        data: 1-D NumPy array of audio samples.
        audio_sample_rate: sample rate of ``data`` in Hz.
        log_offset: value added to the mel magnitudes before taking the log.
        window_length_secs: analysis window duration in seconds.
        hop_length_secs: step between successive windows in seconds.
        **kwargs: forwarded to ``spectrogram_to_mel_matrix``.

    Returns:
        2-D array with one row per analysis frame and one column per mel band.
    """
    samples_per_window = int(round(audio_sample_rate * window_length_secs))
    samples_per_hop = int(round(audio_sample_rate * hop_length_secs))
    # Smallest power of two that is >= the window length.
    fft_length = 2 ** int(np.ceil(np.log(samples_per_window) / np.log(2.0)))

    magnitudes = stft_magnitude(
        data,
        fft_length=fft_length,
        hop_length=samples_per_hop,
        window_length=samples_per_window)
    mel_matrix = spectrogram_to_mel_matrix(
        num_spectrogram_bins=magnitudes.shape[1],
        audio_sample_rate=audio_sample_rate, **kwargs)

    return np.log(np.dot(magnitudes, mel_matrix) + log_offset)

In [9]:
def frame(data, window_length, hop_length):
    """Slice an array into (possibly overlapping) frames along its first axis.

    Args:
        data: NumPy array of shape (num_samples, ...).
        window_length: number of samples in each frame.
        hop_length: step, in samples, between the starts of successive frames.

    Returns:
        Array of shape (num_frames, window_length, ...). Trailing samples that
        do not fill a complete frame are dropped. When ``data`` is shorter
        than ``window_length`` an empty array with zero frames is returned.
        Otherwise the result is a strided view that shares memory with
        ``data``, so overlapping frames alias the same samples.
    """
    num_samples = data.shape[0]
    # Too short for even one frame. Without this guard the frame count below
    # can become negative, which makes as_strided raise ValueError.
    if num_samples < window_length:
        return np.empty((0, window_length) + data.shape[1:], dtype=data.dtype)
    num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
    shape = (num_frames, window_length) + data.shape[1:]
    # Stepping one frame forward skips hop_length items along axis 0.
    strides = (data.strides[0] * hop_length,) + data.strides
    return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)

In [10]:
def stft_magnitude(signal, fft_length,
                   hop_length=None,
                   window_length=None):
    """Magnitude of the short-time Fourier transform of a signal.

    Args:
        signal: 1-D NumPy array of samples.
        fft_length: FFT size; each windowed frame is transformed to this length.
        hop_length: step between successive frames, in samples.
        window_length: length of each frame, in samples.

    Returns:
        2-D array with one row per frame, holding the absolute values of the
        real-input FFT of each windowed frame.
    """
    segments = frame(signal, window_length, hop_length)
    # A periodic Hann taper (cosine period = window_length) is used, not the
    # symmetric one from np.hanning (period = window_length - 1).
    taper = periodic_hann(window_length)
    spectrum = np.fft.rfft(segments * taper, int(fft_length))
    return np.abs(spectrum)

In [11]:
def periodic_hann(window_length):
    """Return a periodic Hann window with ``window_length`` points.

    The raised cosine has a period of exactly ``window_length`` samples, so
    the first point is 0 and the window does not return to 0 at its last point.
    """
    angular_step = 2 * np.pi / window_length
    phase = angular_step * np.arange(window_length)
    return 0.5 - (0.5 * np.cos(phase))

In [12]:
def spectrogram_to_mel_matrix(num_mel_bins=20,
                              num_spectrogram_bins=129,
                              audio_sample_rate=8000,
                              lower_edge_hertz=125.0,
                              upper_edge_hertz=3800.0):
    """Build the matrix that maps linear-frequency spectrogram bins to mel bands.

    Args:
        num_mel_bins: number of triangular mel bands (columns of the result).
        num_spectrogram_bins: number of spectrogram bins (rows of the result),
            taken to be evenly spaced from 0 Hz to the Nyquist frequency.
        audio_sample_rate: sample rate in Hz; Nyquist is half of it.
        lower_edge_hertz: lower edge of the lowest band, in Hz.
        upper_edge_hertz: upper edge of the highest band, in Hz.

    Returns:
        Array of shape (num_spectrogram_bins, num_mel_bins); post-multiply a
        spectrogram (frames x bins) by it to obtain mel-band magnitudes.

    Raises:
        ValueError: if the band edges are negative, not increasing, or above
            the Nyquist frequency.
    """
    nyquist_hertz = audio_sample_rate / 2.

    # Validate the requested frequency range.
    if lower_edge_hertz < 0.0:
        raise ValueError("lower_edge_hertz %.1f must be >= 0" %
                         lower_edge_hertz)
    if lower_edge_hertz >= upper_edge_hertz:
        raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
                         (lower_edge_hertz, upper_edge_hertz))
    if upper_edge_hertz > nyquist_hertz:
        raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
                         (upper_edge_hertz, nyquist_hertz))

    # Mel value of every spectrogram bin, as a column for broadcasting.
    bins_mel = hertz_to_mel(
        np.linspace(0.0, nyquist_hertz, num_spectrogram_bins))[:, np.newaxis]

    # num_mel_bins + 2 equally spaced mel points: band i uses point i as its
    # lower edge, point i + 1 as its centre and point i + 2 as its upper edge.
    edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
                            hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
    lower_mel = edges_mel[:-2]
    center_mel = edges_mel[1:-1]
    upper_mel = edges_mel[2:]

    # Rising and falling sides of each triangle, linear in the *mel* domain.
    rising = (bins_mel - lower_mel) / (center_mel - lower_mel)
    falling = (upper_mel - bins_mel) / (upper_mel - center_mel)

    # The triangle is the lower of the two sides, clipped at zero.
    mel_weights_matrix = np.maximum(0.0, np.minimum(rising, falling))

    # HTK leaves out the DC bin, so force its weights to zero.
    mel_weights_matrix[0, :] = 0.0
    return mel_weights_matrix

In [13]:
def hertz_to_mel(frequencies_hertz):
    """Convert frequencies in Hz to the mel scale.

    Computes ``1127 * ln(1 + f / 700)``.

    Args:
        frequencies_hertz: scalar or NumPy array of frequencies in Hz.

    Returns:
        The corresponding mel values, with the same shape as the input.
    """
    scaled = frequencies_hertz / 700.0
    return 1127.0 * np.log(1.0 + scaled)

# Submission

## Load model

In [14]:
# Load the fine-tuned model saved as 'finetune_da_VGGish.h5' under MODEL_PATH.
# A macro-averaged F1 metric sized to the number of labels is supplied through
# custom_objects so Keras can resolve it while deserialising the model.
# NOTE(review): the custom_objects key 'metric_config' is an unusual name for
# a saved metric — confirm it matches the name stored in the .h5 file.
pred_model = tf.keras.models.load_model(MODEL_PATH + 'finetune_da_VGGish.h5',
                                        custom_objects={'metric_config': tfa.metrics.F1Score(
                                            name='f1macro',
                                            num_classes=len(labels),
                                            average='macro'
                                        )})

## Check model

## Process

In [15]:
# Directory holding the soundscapes to score. os.path.join avoids the doubled
# slash that DATA_PATH + '/test_soundscapes/' produced. The empty last
# component keeps a trailing separator, because file paths are later built by
# string concatenation (test_path + name + '.ogg').
test_path = os.path.join(DATA_PATH, 'test_soundscapes', '')

# Base names (extension removed) of the .ogg recordings, in sorted order.
# Only .ogg entries are kept and only the final extension is stripped, so
# stray files (e.g. hidden files) or names containing extra dots can no longer
# turn into paths that do not exist.
files = [os.path.splitext(f)[0]
         for f in sorted(os.listdir(test_path))
         if f.endswith('.ogg')]
print('Number of test soundscapes:', len(files))

Number of test soundscapes: 1


In [16]:
# Per-class weight, keyed by class index (0-20, one entry per label).
# testscore() looks up the weight of each class to choose the score threshold
# applied to it: the larger the weight, the lower the threshold.
# NOTE(review): presumably these are the class weights computed at training
# time to offset class imbalance — confirm against the training notebook.
Class_weights = {0: 3.1081275724908664, 1: 3.7656475754076606, 2: 1.0441328733521613, 3: 3.0691726047550407,
                 4: 5.264737430672896, 5: 2.8777563180552033, 6: 5.46209686483139, 7: 1.69247564674142,
                 8: 2.124710995914189, 9: 4.432477447650233, 10: 4.419574042814324, 11: 4.854309533222602,
                 12: 1, 13: 1.2511821233781157, 14: 1.5747391996382374, 15: 4.445549529217585, 16: 1.91982088490933,
                 17: 6.173593184059532, 18: 1, 19: 1.3683585469443174, 20: 2.205399907164332}

In [17]:
def testscore(prediction):   
    result = []

    for i in range(0, len(prediction)):
        if Class_weights[i] < 2:
            if prediction[i] >= 1e-1:
                result.append(i)
        elif Class_weights[i] < 3:
            if prediction[i] >= 1e-2:
                result.append(i)
        elif Class_weights[i] < 4:
            if prediction[i] >= 1e-3:
                result.append(i)
        elif Class_weights[i] < 5:
            if prediction[i] >= 1e-4:
                result.append(i)
        elif Class_weights[i] > 5:
            if prediction[i] >= 1e-5:
                result.append(i)
                
    return result

In [18]:
data = []

for f in files:
    file_path = test_path + f + '.ogg'

    # Decode the recording with librosa's default loading options
    audio, sr = librosa.load(file_path)

    # Number of samples in one 5-second segment
    block_min = 5 * sr
    samples_total = len(audio)

    # Walk through the recording one 5-second segment at a time;
    # segments are numbered from 1
    segment_starts = range(0, samples_total, block_min)
    for counter, samples_wrote in enumerate(segment_starts, start=1):
        # Slicing stops at the end of the signal, so only the last segment
        # can come out shorter than the others
        block = audio[samples_wrote: samples_wrote + block_min]

        # Zero-pad a short last segment up to the full segment length
        missing = block_min - block.shape[0]
        if missing > 0:
            block = np.hstack([block, np.array([0] * missing)])

        # Features extraction
        block = extractFeatures(block, sr)

        # Prediction: average each class score over the examples of the segment
        pred = pred_model.predict(block)
        prediction = [np.mean(class_scores) for class_scores in pred.transpose()]

        label_indexes = testscore(prediction)
        print(label_indexes)

        # One submission row per scored bird for this segment
        detected = {labels[label_index] for label_index in label_indexes}
        segment_end = counter * 5
        for b in scored_birds:
            data.append([f + '_' + b + '_' + str(segment_end), b in detected])

submission_df = pd.DataFrame(data, columns=['row_id', 'target'])
submission_df.head(21)

[2, 4, 7, 13, 14]
[2, 7, 14, 18]
[14]
[2, 4, 7, 10, 13, 14]
[2, 7, 13, 14]
[2, 4, 7, 13, 14]
[2, 4, 7, 13, 14]
[2, 14]
[2, 7, 13, 14]
[13, 14]
[2, 7, 13, 14]
[2, 7, 13, 14]


Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,True
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,True
5,soundscape_453028782_elepai_5,False
6,soundscape_453028782_ercfra_5,False
7,soundscape_453028782_hawama_5,True
8,soundscape_453028782_hawcre_5,False
9,soundscape_453028782_hawgoo_5,False


In [19]:
# Write the submission table to WORKING_PATH as 'submission.csv'; index=False
# keeps the DataFrame index out of the file, leaving only row_id and target.
submission_df.to_csv(WORKING_PATH + 'submission.csv', index=False)