In [170]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Model
import efficientnet.keras as efn 
import librosa
import librosa.display as display
import os
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
from sklearn.utils import class_weight
import warnings
from tqdm import tqdm

from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
from kapre.time_frequency import Spectrogram

from python_speech_features import mfcc
from mutagen.mp3 import MP3
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

from zipfile import ZipFile

# Waveform augmentation pipeline: Gaussian noise, pitch shift and time shift,
# each configured with probability p=0.5.
augmenter = Compose(
    transforms=[
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
        PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
        Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
    ]
)

# render matplotlib figures inline in the notebook
%matplotlib inline

# disabled clean-up commands for directories left by earlier runs
#!rm -r train_data
#!rm -r val_data
#!rm -r models
#!mkdir models

# suppress warnings (blanket filter: hides every Python warning from here on)
warnings.filterwarnings("ignore")

# root directory of the training audio
# NOTE(review): the sample cells further down spell this path out literally
# instead of referencing SOUND_DIR
SOUND_DIR = "data/birdsong-recognition/train_audio/"

In [171]:
# download and unzip model
!wget https://s3.jp-tok.cloud-object-storage.appdomain.cloud/w251-models/efficientNet_20.zip
    
with ZipFile('efficientNet_20.zip', 'r') as zipObj:
    zipObj.extractall()

--2020-08-01 22:37:51--  https://s3.jp-tok.cloud-object-storage.appdomain.cloud/w251-models/efficientNet_20.zip
Resolving s3.jp-tok.cloud-object-storage.appdomain.cloud (s3.jp-tok.cloud-object-storage.appdomain.cloud)... 162.133.118.49
Connecting to s3.jp-tok.cloud-object-storage.appdomain.cloud (s3.jp-tok.cloud-object-storage.appdomain.cloud)|162.133.118.49|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120919216 (115M) [application/zip]
Saving to: ‘efficientNet_20.zip’


2020-08-01 22:38:20 (4.00 MB/s) - ‘efficientNet_20.zip’ saved [120919216/120919216]



In [172]:
# Restore the trained Keras model from disk
# (presumably the directory extracted from efficientNet_20.zip - verify).
model_path = "efficientNet_20/"
model = tf.keras.models.load_model(filepath=model_path)

In [173]:
# check model architecture: prints each layer with its output shape and
# parameter count
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
efficientnet-b3 (Model)      (None, 7, 7, 1536)        10783528  
_________________________________________________________________
global_average_pooling2d (Gl (None, 1536)              0         
_________________________________________________________________
flatten (Flatten)            (None, 1536)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1536)              0         
_________________________________________________________________
softmax (Dense)              (None, 20)                30740     
Total params: 10,814,268
Trainable params: 10,726,972
Non-trainable params: 87,296
____________________________________________

In [174]:
# function to perform inference
def inference(model, signal, sr, classes, top=5):
    """Classify one audio clip and return its highest-scoring classes.

    The clip is converted to a decibel mel spectrogram, rendered to a PNG with
    the 'inferno' colormap (the colormap used in training), reloaded as a
    224x224 3-channel array divided by 255, and passed through ``model``.

    Args:
        model: callable model; it is invoked on an array of shape
            (1, 224, 224, 3) and its result must expose ``.numpy()``.
        signal: 1-D numpy array of audio samples.
        sr: sampling rate of ``signal`` in Hz (the notebook loads audio
            resampled to 16000 Hz).
        classes: sequence of class labels, indexed by model output position.
        top: number of predictions to return (default 5); 0 returns [].

    Returns:
        List of ``[label, score]`` pairs ordered by descending score.

    Side effects:
        Writes (and overwrites) ``tmp/inference.png`` relative to the current
        working directory, creating ``tmp/`` if needed.
    """
    # transform audio signal to mel spectrogram (224 mel bands, 1400 Hz .. sr/2)
    mel_layer = Melspectrogram(n_dft=1024, n_hop=256, input_shape=(1, signal.shape[0]),
                               padding='same', sr=sr, n_mels=224, fmin=1400, fmax=sr/2,
                               power_melgram=2.0, return_decibel_melgram=True,
                               trainable_fb=False, trainable_kernel=False)
    S = mel_layer(signal.reshape(1, 1, -1)).numpy()

    # keep axes 1 and 2 only
    # (assumes the layer output is (1, n_mels, frames, 1) - TODO confirm)
    S = S.reshape(S.shape[1], S.shape[2])

    # save tmp image with cmap = "inferno", which is the cmap used in training;
    # make sure the target directory exists so a fresh checkout does not fail
    tmp_png = "tmp/inference.png"
    os.makedirs("tmp", exist_ok=True)
    matplotlib.image.imsave(tmp_png, S, cmap='inferno')

    # reload tmp image and convert to numpy array; the context manager closes
    # the file handle. Image.LANCZOS is the filter formerly exposed as
    # Image.ANTIALIAS (that alias was removed in Pillow 10).
    with Image.open(tmp_png) as img:
        resized = img.resize((224, 224), Image.LANCZOS)
    # keep the first three channels (drops the alpha channel, if any)
    melspec = np.array(resized)[..., :3] / 255

    # model predictions; a single image is fed, so only row 0 is relevant
    preds = model(melspec.reshape(1, 224, 224, 3)).numpy()
    pred = preds[0]

    # indices sorted by descending score, truncated to `top`
    # (slicing after the reversal makes top=0 return [] instead of everything)
    top_indices = pred.argsort()[::-1][:top]
    return [[classes[i], pred[i]] for i in top_indices]

In [175]:
# Class labels for the 20-way classifier. inference() looks labels up by model
# output index, so the order here must match the order used in training.
BIRDS = [
    "amered",
    "annhum",
    "belkin1",
    "blugrb1",
    "brthum",
    "cedwax",
    "commer",
    "gockin",
    "gryfly",
    "horlar",
    "moudov",
    "olsfly",
    "pasfly",
    "semsan",
    "sposan",
    "vigswa",
    "wewpew",
    "whbnut",
    "wilsni1",
    "yelwar",
]

### Inference on sample audio (first 10s of audio)

In [176]:
# brthum sample: decode only the first 10 s, resampled to 16 kHz, then classify
file = "data/birdsong-recognition/train_audio/brthum/XC132906.mp3"
signal, sr = librosa.load(file, duration=10, sr=16000)
inference(model=model, signal=signal, sr=sr, classes=BIRDS)

[['brthum', 1.0],
 ['wewpew', 9.0189474e-11],
 ['gockin', 9.897439e-12],
 ['gryfly', 3.3648594e-12],
 ['amered', 1.5433536e-12]]

In [189]:
# whbnut sample: decode only the first 10 s, resampled to 16 kHz, then classify
file = "data/birdsong-recognition/train_audio/whbnut/XC252947.mp3"
signal, sr = librosa.load(file, duration=10, sr=16000)
inference(model=model, signal=signal, sr=sr, classes=BIRDS)

[['whbnut', 0.9993772],
 ['semsan', 0.00040202064],
 ['wilsni1', 8.66735e-05],
 ['moudov', 3.9834027e-05],
 ['blugrb1', 2.6263682e-05]]

In [178]:
# wilsni1 sample: decode only the first 10 s, resampled to 16 kHz, then classify
file = "data/birdsong-recognition/train_audio/wilsni1/XC147531.mp3"
signal, sr = librosa.load(file, duration=10, sr=16000)
inference(model=model, signal=signal, sr=sr, classes=BIRDS)

[['wilsni1', 0.9999999],
 ['semsan', 5.219136e-08],
 ['gryfly', 4.9135597e-08],
 ['vigswa', 5.4059752e-09],
 ['brthum', 2.617901e-09]]

### Inference on sample audio (random 10s of audio)

In [203]:
# brthum sample: load the whole recording at 16 kHz
file = "data/birdsong-recognition/train_audio/brthum/XC132913.mp3"
signal, sr = librosa.load(file, sr=16000)

# Random 10 s window (start drawn in whole seconds). The upper bound is
# clamped at 0 so a recording shorter than 10 s is used from its beginning;
# unclamped, the bound goes negative and a negative start slices relative to
# the end of the signal, giving a truncated window.
latest_start = max(len(signal) // sr - 10, 0)
start = int(np.random.uniform(0, latest_start))
signal = signal[sr*start:sr*(start+10)]
inference(model, signal, sr, BIRDS)

[['brthum', 1.0],
 ['gockin', 1.4711168e-09],
 ['annhum', 2.6356056e-10],
 ['yelwar', 2.317562e-10],
 ['wewpew', 2.0799469e-10]]

In [207]:
# whbnut sample: load the whole recording at 16 kHz
file = "data/birdsong-recognition/train_audio/whbnut/XC290146.mp3"
signal, sr = librosa.load(file, sr=16000)

# Random 10 s window (start drawn in whole seconds). The upper bound is
# clamped at 0 so a recording shorter than 10 s is used from its beginning;
# unclamped, the bound goes negative and a negative start slices relative to
# the end of the signal, giving a truncated window.
latest_start = max(len(signal) // sr - 10, 0)
start = int(np.random.uniform(0, latest_start))
signal = signal[sr*start:sr*(start+10)]
inference(model, signal, sr, BIRDS)

[['whbnut', 0.9293932],
 ['moudov', 0.029889416],
 ['belkin1', 0.02207578],
 ['blugrb1', 0.007992051],
 ['wilsni1', 0.0015644863]]

In [209]:
# wilsni1 sample: load the whole recording at 16 kHz
file = "data/birdsong-recognition/train_audio/wilsni1/XC186352.mp3"
signal, sr = librosa.load(file, sr=16000)

# Random 10 s window (start drawn in whole seconds). The upper bound is
# clamped at 0 so a recording shorter than 10 s is used from its beginning;
# unclamped, the bound goes negative and a negative start slices relative to
# the end of the signal, giving a truncated window.
latest_start = max(len(signal) // sr - 10, 0)
start = int(np.random.uniform(0, latest_start))
signal = signal[sr*start:sr*(start+10)]
inference(model, signal, sr, BIRDS)

[['wilsni1', 0.9842981],
 ['horlar', 0.015352972],
 ['semsan', 0.00021072502],
 ['gryfly', 4.8712995e-05],
 ['vigswa', 2.3974026e-05]]