In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Model
import efficientnet.keras as efn 
import librosa
import librosa.display as display
import os
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
from sklearn.utils import class_weight
import warnings
from tqdm import tqdm

from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
from kapre.time_frequency import Spectrogram

from python_speech_features import mfcc
from mutagen.mp3 import MP3
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

from zipfile import ZipFile

# Waveform augmentation pipeline. Each transform carries its own p=0.5.
# NOTE(review): `augmenter` is not referenced by any cell visible in this part
# of the notebook, and the imported TimeStretch is not part of the pipeline.
augmenter = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])

%matplotlib inline

#!rm -r train_data
#!rm -r val_data
#!rm -r models
#!mkdir models

# Suppress every Python warning for the rest of the session. This is a blanket
# filter, so deprecation notices from the imported libraries are hidden too.
warnings.filterwarnings("ignore")

# Root folder of the training recordings; the label list is built from its entries.
SOUND_DIR = "data/birdsong-recognition/train_audio/"

Using TensorFlow backend.


In [2]:
import timeit

# Scratch timing check: record a start time, then print the seconds elapsed
# since it on each of ten loop iterations.
# NOTE: `start` is a notebook global that later cells reuse as a clip offset.
start = timeit.default_timer()

for _ in range(10):
    stop = timeit.default_timer()
    print(stop-start)
    
# 6600

3.981199999891771e-05
0.00016662799999878075
0.0001968080000001038
0.00022919999999970742
0.00024041100000005144
0.00024997599999920794
0.00026257399999884967
0.0002716189999993901
0.0002804300000001092
0.00028918499999974756


In [3]:
# Class labels: one entry of SOUND_DIR per species, hidden entries skipped.
# os.listdir() returns names in arbitrary order, and BIRDS is indexed with the
# classifier's argmax, so the list is sorted to give a stable index -> label
# mapping (the same ordering the later BIRDS cell in this notebook uses).
BIRDS = sorted(b for b in os.listdir(SOUND_DIR) if not b.startswith("."))
print(len(BIRDS))

# Classifier input shape: the prediction helpers reshape images to (1, 224, 224, 3).
IM_SIZE = (224, 224, 3, )

264


In [4]:
# load model
# Image classifier: the prediction helpers call it on (1, 224, 224, 3) arrays.
model_path = "models/efficientNet/"
model = tf.keras.models.load_model(model_path)

# Second saved model: the prediction helpers call it on the raw waveform
# (reshaped to (1, 1, n_samples)) and treat its output as the spectrogram.
# Note that `model_path` is rebound here and ends up pointing at this model.
model_path = "models/melSpec/"
model_melSpec = tf.keras.models.load_model(model_path)



In [5]:
# check model architecture
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
stem_conv (Conv2D)              (None, 112, 112, 40) 1080        input_3[0][0]                    
__________________________________________________________________________________________________
stem_bn (BatchNormalization)    (None, 112, 112, 40) 160         stem_conv[0][0]                  
__________________________________________________________________________________________________
stem_activation (Activation)    (None, 112, 112, 40) 0           stem_bn[0][0]                    
____________________________________________________________________________________________

In [73]:
# function to perform inference
def inference(model, signal, sr, classes, top=5):
    """Return the `top` highest-scoring classes for one audio clip.

    The waveform is turned into a decibel mel spectrogram with kapre, rendered
    to a PNG with the "inferno" colour map (the map used in training), resized
    to 224x224 and passed to `model`.

    Args:
        model: callable classifier taking a (1, 224, 224, 3) array and
            returning an object with `.numpy()`.
        signal: 1-D NumPy waveform; reshaped to (1, 1, n_samples) below.
        sr: sample rate of `signal` in Hz (this notebook loads audio at 16 kHz).
        classes: sequence mapping a class index to its label.
        top: number of predictions to return.

    Returns:
        List of `[label, score]` pairs for the clip, highest score first.
    """
    # transform audio signal to mel Spectrogram
    S = Melspectrogram(n_dft=1024, n_hop=256, input_shape=(1, signal.shape[0]),
                       padding='same', sr=sr, n_mels=224, fmin=1400, fmax=sr/2,
                       power_melgram=2.0, return_decibel_melgram=True,
                       trainable_fb=False, trainable_kernel=False)(signal.reshape(1, 1, -1)).numpy()

    S = S.reshape(S.shape[1], S.shape[2])

    # save tmp image with cmap = "inferno", which is the cmap used in training;
    # create the folder first so a fresh checkout does not fail on the write
    os.makedirs("tmp", exist_ok=True)
    matplotlib.image.imsave("tmp/inference" + ".png", S, cmap='inferno')

    # reload tmp image and convert to numpy array. Image.ANTIALIAS was removed
    # in Pillow 10; Image.LANCZOS is the same filter under its current name.
    with Image.open('tmp/inference.png') as png:
        img = png.resize((224, 224), Image.LANCZOS)
    melspec = np.array(img)
    melspec = melspec[...,:3] / 255  # drop alpha, scale to [0, 1]

    # model predictions (a single clip, so a single row of scores)
    preds = model(melspec.reshape(1, 224, 224, 3)).numpy()
    pred = preds[0]

    # indices sorted by descending score, truncated to `top`
    # (slicing after the reversal also handles top=0 correctly)
    top_indices = pred.argsort()[::-1][:top]
    return [[classes[i], pred[i]] for i in top_indices]

In [5]:
sr = 16000

def make_prediction(model_melSpec, model, signal, birds, threshold=0.50):
    """Predict a single label for one audio clip, or "nocall".

    NOTE: a later cell of this notebook defines `make_prediction` again with a
    different threshold; whichever cell ran last is the one in effect.

    Args:
        model_melSpec: callable mapping a (1, 1, n_samples) waveform to a
            spectrogram (an object with `.numpy()`).
        model: callable classifier taking a (1, 224, 224, 3) array.
        signal: 1-D NumPy waveform.
        birds: sequence mapping a class index to its label.
        threshold: the label is returned only when the best score is strictly
            greater than this value.

    Returns:
        The label with the highest score, or "nocall" when that score is
        less than or equal to `threshold`.
    """
    S = model_melSpec(signal.reshape(1, 1, -1)).numpy()
    S = S.reshape(S.shape[1], S.shape[2])

    # render with the "inferno" colour map and read the image back
    matplotlib.image.imsave("inference" + ".png", S, cmap='inferno')

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. The `with` block closes the file handle.
    with Image.open('inference.png') as png:
        img = png.resize((224, 224), Image.LANCZOS)
    melspec = np.array(img)
    melspec = melspec[...,:3] / 255  # drop alpha, scale to [0, 1]

    # model predictions
    preds = model(melspec.reshape(1, 224, 224, 3)).numpy()

    if np.max(preds) <= threshold:
        return "nocall"
    return birds[np.argmax(preds)]

### Inference on sample audio (5 s window at a fixed offset)

In [6]:
# NOTE: `load_test_clip` is defined in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError (see the recorded output below).
file = "data/birdsong-recognition/train_audio/brthum/XC132906.mp3"
signal = load_test_clip(file, 0, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

NameError: name 'load_test_clip' is not defined

In [118]:
# Classify the window starting at 0 s of a `brthum` training recording.
# `file` and `signal` are notebook globals that later cells read.
file = "data/birdsong-recognition/train_audio/brthum/XC132906.mp3"
signal = load_test_clip(file, 0, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'brthum'

In [119]:
# Classify the window starting at 0 s of a `whbnut` training recording.
file = "data/birdsong-recognition/train_audio/whbnut/XC252947.mp3"
signal = load_test_clip(file, 0, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'whbnut'

In [244]:
# Classify the window starting at 2 s of a `barswa` recording from the local
# `sample/` folder.
file = "sample/barswa/XC575747.mp3"
signal = load_test_clip(file, 2, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'barswa'

### Inference on sample audio (5 s window at a random offset)

In [174]:
file = "data/birdsong-recognition/train_audio/plsvir/XC283090.mp3"
# Draw the offset from the length of *this* file. The earlier expression used
# len(signal) of the clip left over from the previous cell; for a 5 s clip
# that makes the upper bound 0, so `start` never varied.
clip_seconds = int(MP3(file).info.length)
start = int(np.random.uniform(0, max(clip_seconds - 5, 0)))
signal = load_test_clip(file, start, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'plsvir'

In [127]:
file = "data/birdsong-recognition/train_audio/whbnut/XC290146.mp3"
# Draw the offset from the length of *this* file. The earlier expression used
# len(signal) of the clip left over from the previous cell; for a 5 s clip
# that makes the upper bound 0, so `start` never varied.
clip_seconds = int(MP3(file).info.length)
start = int(np.random.uniform(0, max(clip_seconds - 5, 0)))
signal = load_test_clip(file, start, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'whbnut'

In [128]:
file = "data/birdsong-recognition/train_audio/wilsni1/XC186352.mp3"
# Draw the offset from the length of *this* file. The earlier expression used
# len(signal) of the clip left over from the previous cell; for a 5 s clip
# that makes the upper bound 0, so `start` never varied.
clip_seconds = int(MP3(file).info.length)
start = int(np.random.uniform(0, max(clip_seconds - 5, 0)))
signal = load_test_clip(file, start, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'wilsni1'

In [233]:
file = "sample/barswa/XC570904.mp3"
# Draw the offset from the length of *this* file. The earlier expression used
# len(signal) of the clip left over from the previous cell; for a 5 s clip
# that makes the upper bound 0, so `start` never varied.
clip_seconds = int(MP3(file).info.length)
start = int(np.random.uniform(0, max(clip_seconds - 5, 0)))
signal = load_test_clip(file, start, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'barswa'

In [245]:
# Scratch check of dict item assignment.
# NOTE: this rebinds the global that load_test_clip uses as its clip cache.
test_clip_dict = {}
test_clip_dict["clip1"] = 1


In [248]:
# Draw one label at random from BIRDS (unseeded, so the output changes per run).
np.random.choice(BIRDS)

'rebsap'

In [28]:
# Cache of the windows cut from the most recently loaded 120 s chunk,
# keyed by "<path>_<window start in seconds>".
test_clip_dict = {}

def load_test_clip(path, start_time, duration=5):
    """Return `duration` seconds of audio from `path`, starting at `start_time`.

    On a cache miss, 120 s of audio beginning at `start_time` are decoded in
    one go (zero-padded if the file ends sooner), cut into consecutive
    `duration`-second windows, and those windows replace the previous contents
    of the global `test_clip_dict`. Stepping through a file in `duration`-second
    increments therefore decodes it only once per 120 s.

    Args:
        path: audio file to read.
        start_time: offset of the wanted window, in seconds.
        duration: window length in seconds; must be in (0, 120].

    Returns:
        float32 NumPy array of `sr * duration` samples (reads the global `sr`).

    Raises:
        ValueError: if `duration` is not in (0, 120].
    """
    global test_clip_dict

    chunk_seconds = 120
    if not 0 < duration <= chunk_seconds:
        raise ValueError("duration must be in (0, %d] seconds" % chunk_seconds)

    window = int(sr * duration)
    key = path + "_" + str(start_time)

    # A hit only counts if the cached window has the requested length;
    # otherwise it was cut for a different `duration` and must be rebuilt.
    cached = test_clip_dict.get(key)
    if cached is not None and len(cached) == window:
        return cached

    clip = librosa.load(path, sr=sr, offset=start_time, duration=chunk_seconds)[0]

    # Zero-pad to the full chunk so every window has the same length.
    missing = sr * chunk_seconds - len(clip)
    if missing > 0:
        clip = np.pad(clip, (0, missing), mode="constant").astype(np.float32)

    # Replace (not extend) the cache: only one chunk is kept in memory.
    test_clip_dict = {
        path + "_" + str(start_time + i * duration): clip[i * window:(i + 1) * window]
        for i in range(int(chunk_seconds // duration))
    }
    # Store the requested window under the exact lookup key as well, so the
    # return below cannot miss when `start_time + 0 * duration` prints differently.
    test_clip_dict[key] = clip[:window]
    return test_clip_dict[key]

def load_test_clip_site3(path):
    """Load a whole recording at `sr` Hz, zero-padded to at least five seconds.

    Reads the global `sr`. Recordings of five seconds or more are returned
    exactly as librosa decoded them; shorter ones come back as a float32 array
    of `sr * 5` samples with zeros appended.
    """
    samples = librosa.load(path, sr=sr)[0]

    shortfall = sr * 5 - len(samples)
    if shortfall > 0:
        samples = np.concatenate([samples, np.zeros(shortfall)]).astype(np.float32)

    return samples

def make_prediction(model_melSpec, model, signal, birds, top=5, threshold=0.80):
    """Predict a single label for one audio clip, or "nocall".

    Args:
        model_melSpec: callable mapping a (1, 1, n_samples) waveform to a
            spectrogram (an object with `.numpy()`).
        model: callable classifier taking a (1, 224, 224, 3) array.
        signal: 1-D NumPy waveform.
        birds: sequence mapping a class index to its label.
        top: unused; kept so existing calls that pass it keep working.
        threshold: minimum best score required to return a label.

    Returns:
        The label with the highest score when that score is at least
        `threshold`, otherwise "nocall".
    """
    S = model_melSpec(signal.reshape(1, 1, -1)).numpy()
    S = S.reshape(S.shape[1], S.shape[2])

    # render with the "inferno" colour map and read the image back
    matplotlib.image.imsave("inference" + ".png", S, cmap='inferno')

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. The `with` block closes the file handle.
    with Image.open('inference.png') as png:
        img = png.resize((224, 224), Image.LANCZOS)
    melspec = np.array(img)
    melspec = melspec[...,:3] / 255  # drop alpha, scale to [0, 1]

    # model predictions
    preds = model(melspec.reshape(1, 224, 224, 3)).numpy()

    if np.max(preds) >= threshold:
        return birds[np.argmax(preds)]
    return "nocall"

In [29]:
# Sampling rate in Hz; load_test_clip reads it as a global.
sr = 16000
file = "data/birdsong-recognition/train_audio/amecro/XC109768.mp3"
#start= int(np.random.uniform(0, len(signal) // sr - 5))
# Classify the window starting at 0 s of this `amecro` recording.
signal = load_test_clip(file, 0, 5)
make_prediction(model_melSpec, model, signal, BIRDS)

'nocall'

In [27]:
sr = 16000
file = "data/birdsong-recognition/example_test_audio/BLKFR-10-CPL_20190611_093000.pt540.mp3"
signal = load_test_clip_site3(file)

# Classify every complete 5 s window of the recording, in order.
# A trailing partial window (fewer than sr * 5 samples) is not scored.
window = sr * 5
multi_pred = [
    make_prediction(model_melSpec, model, signal[offset:offset + window], BIRDS)
    for offset in range(0, (len(signal) // window) * window, window)
]

In [28]:
# Collapse the per-window predictions to the distinct labels, without "nocall",
# in alphabetical order.
multi_pred = sorted(set(multi_pred) - {"nocall"})
multi_pred

['bawwar nocall',
 'bkbwar nocall',
 'bkcchi nocall',
 'bkpwar',
 'bkpwar camwar',
 'bkpwar nocall',
 'camwar',
 'camwar bkpwar',
 'camwar nocall',
 'canwar nocall',
 'casfin nocall',
 'cedwax nocall',
 'comgol',
 'eucdov nocall',
 'evegro',
 'evegro nocall',
 'fiespa',
 'fiespa nocall',
 'foxspa nocall',
 'gockin',
 'gockin nocall',
 'grhowl nocall',
 'gryfly nocall',
 'hamfly nocall',
 'hamfly yebfly',
 'lesnig nocall',
 'mouchi',
 'norpar nocall',
 'purfin nocall',
 'rufhum',
 'rufhum nocall',
 'swahaw nocall',
 'westan nocall',
 'whtspa nocall',
 'yerwar nocall']

In [26]:
# Join the labels in multi_pred into one space-separated string.
" ".join(multi_pred)

'bkpwar camwar cedwax comgol evegro fiespa gockin mouchi rufhum westan'

In [43]:
from sys import getsizeof

# Memory estimate for 12 minutes of audio at 16 kHz, in GiB.
# Allocate the array directly instead of first building an 11.5-million-element
# Python list. dtype=int is the dtype np.array() infers for a list of Python
# ints, so `nbytes` is unchanged. The float32 clips this notebook's loaders
# produce when they pad use 4 bytes per sample.
n_samples = 16000 * 60 * 12
b = np.zeros(n_samples, dtype=int)
b.nbytes / 1024 / 1024 / 1024

0.0858306884765625

In [11]:
# Rebuild the label list from the entries of the training-image directory,
# sorted so that list index == class index. Hidden entries such as
# .DS_Store are skipped (as the earlier BIRDS cell does); one stray hidden
# file would otherwise shift every label that sorts after it.
BIRDS = sorted(
    b for b in os.listdir("MelSpectrogram/train_data_50/") if not b.startswith(".")
)

In [12]:
# Number of class labels in BIRDS.
len(BIRDS)

264

In [39]:
file = "data/birdsong-recognition/example_test_audio/BLKFR-10-CPL_20190611_093000.pt540.mp3"
# Decode the whole recording resampled to 16 kHz.
# NOTE: this rebinds the global `sr` that the loader helpers read.
s, sr = librosa.load(file, offset=0, sr=16000)

3.385100001196406e-05
0.00013453800002594107
0.0001425350000090475
0.00014899200002105317
0.00015517500000328255
0.00016109300000266558
0.00019453400000202237
0.0002097740000124304
0.000221269000007851
0.0002479930000163222
0.00025968600002101994
0.0002670950000265293
0.00027324000001272
0.00042437400000494563
0.0004348350000213941
0.00044157500002484085
0.00044775900002491653
0.0004539650000197071
0.00046186500000544584
0.0007014510000260543
0.0007108090000258471
0.0007172710000133975
0.0007233010000220474
0.0007292000000234111
0.0007350560000247697
0.0007631710000168823
0.0007730960000174036
0.0007807300000024497
0.000792518000025666
0.0008063370000002124
0.000815711000001329
0.0008481460000098195
0.0008677140000088457
0.0008750110000050881
0.0008966180000129498
0.0009168310000120528
0.0009242970000116202
0.0009377770000185137
0.000947755000026973
0.000973379000015484
0.0009829070000080264
0.000991829000014377
0.0011004330000048412
0.0011132280000083483
0.0011229390000266903
0.001132

0.2514614940000115
0.25148533400002293
0.2514959090000275
0.2515053960000273
0.25151480800002446
0.2515236600000037
0.2515324910000061
0.2515415950000204
0.25155032800000754
0.2515590610000231
0.25156793300001823
0.25157693000002723
0.2515861190000237
0.2515950920000023
0.2516037800000106
0.2516125290000275
0.2517340300000228
0.2517418070000019
0.25175042600000097
0.2517591660000278
0.25176776500001097
0.2517765340000153
0.251785032000015
0.25179358600001933
0.2518022790000032
0.2518109220000042
0.2518195600000013
0.2518280710000056
0.25183654900001784
0.2518450730000268
0.2518536020000113
0.251862162000009
0.25187062200001265
0.2518792820000044
0.251887910000022
0.2518964370000276
0.25190513900000155
0.2519137430000171
0.2519223480000221
0.2519309660000033
0.2519395890000169
0.2519484570000259
0.25195745300001704
0.2519661640000095
0.25197493699999995
0.2519841370000222
0.2519927830000199
0.25200143800000774
0.25201006200001075
0.25201879500002633
0.25202748100002736
0.252036318000023

0.42860122000001866
0.4781761210000184
0.4781909580000274
0.47820302800002423
0.47821131599999944
0.4782203240000058
0.4782291590000227
0.4782396250000147
0.4782514340000148
0.4782618240000147
0.4782706400000052
0.4782794750000221
0.478294644000016
0.4783037120000131
0.47831187900001737
0.478319818000017
0.47832782300000076
0.47833936200001403
0.47835106800002336
0.4783586370000137
0.4783772130000159
0.4783841320000022
0.4783899380000207
0.47839576300000886
0.47840154100001087
0.4784074000000089
0.4784134010000116
0.4784190960000103
0.47842486600001166
0.478439786000024
0.47844864700002177
0.4784624470000267
0.4784886700000186
0.47850262700001167
0.4785129430000268
0.478521862000008
0.4785306600000183
0.4785395830000141
0.4785482060000277
0.47855684400002474
0.4785657850000007
0.47857450500001164
0.4785832950000213
0.47859202500001174
0.4786006680000128
0.4786099260000185
0.47861867400001756
0.4786273410000206
0.4786361040000031
0.47864467300001934
0.4786534979999999
0.4786622710000188

0.6995365630000094
0.6995568210000158
0.6995643040000061
0.6995946760000038
0.6996045840000136
0.699610851000017
0.699616801000019
0.6996226680000177
0.6996284260000039
0.6996341730000211
0.6996494980000136
0.6996576350000225
0.6996700130000022
0.6996904580000205
0.699707324000002
0.6997141810000187
0.6997200810000095
0.6997259140000267
0.6997317520000195
0.6997374030000003
0.6997502650000058
0.6997587420000002
0.6997926220000181
0.6998057230000256
0.6998136760000193
0.6998223440000118
0.6998299239999994
0.6998370180000109
0.6998430670000175
0.6998502700000131
0.6998743420000153
0.6998833310000236
0.6998894950000079
0.6998991960000183
0.6999082590000114
0.6999156270000242
0.6999288950000278
0.6999377440000103
0.699945348
0.6999588130000234
0.6999665550000032
0.6999751070000002
0.6999829840000018
0.6999959080000053
0.7000019250000094
0.7000078540000061
0.7000136010000233
0.7000208830000076
0.7000497420000045
0.7000724060000039
0.7000920240000141
0.7001017000000047
0.7001160620000064
0.7

0.8694547730000011
0.9162108020000233
0.9162228000000141
0.916231348000025
0.9162417610000091
0.916250596000026
0.9162597770000218
0.9162683390000268
0.9162835540000174
0.9162951960000214
0.9163038840000013
0.9163128370000209
0.9163212290000047
0.9163302970000018
0.9163400179999996
0.9163487860000146
0.9163572850000037
0.9163658260000034
0.9163742740000203
0.9163827170000047
0.916391152000017
0.9163994780000166
0.9164083480000045
0.9164166280000074
0.916425060000023
0.9164313100000072
0.9164385130000028
0.9164469970000084
0.9164554880000253
0.9164638160000038
0.9164724070000148
0.9164810270000032
0.9164872790000231
0.9164945980000141
0.9165032480000264
0.9165115380000088
0.9165200760000118
0.9168024700000217
0.9168103530000167
0.9168190590000052
0.9168317040000034
0.9168398890000162
0.9168483550000133
0.9168601070000193
0.9168684410000196
0.9168807640000125
0.9168939650000141
0.9171614160000274
0.9171693930000231
0.9171781970000268
0.9171900900000196
0.917199050000022
0.917207435000023

In [6]:
def make_prediction_from_MelSpec(model, path, birds):
    """Predict the label for a spectrogram image that is already on disk.

    Unlike `make_prediction`, no confidence threshold is applied: the label
    with the highest score is always returned.

    Args:
        model: callable classifier taking a (1, 224, 224, 3) array and
            returning an object with `.numpy()`.
        path: image file to classify.
        birds: sequence mapping a class index to its label.

    Returns:
        The entry of `birds` at the index of the highest score.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name. The `with` block closes the file handle,
    # which matters when this runs over every image in a directory tree.
    with Image.open(path) as png:
        img = png.resize((224, 224), Image.LANCZOS)
    melspec = np.array(img)
    melspec = melspec[...,:3] / 255  # drop alpha, scale to [0, 1]

    # model predictions
    preds = model(melspec.reshape(1, 224, 224, 3)).numpy()

    return birds[np.argmax(preds)]

In [16]:
# model error study
sr = 16000

# Predicted labels for every validation image, grouped by true species
# (the sub-folder name).
val_root = "MelSpectrogram/val_data/"
y_dict = {}

for bird in tqdm(sorted(os.listdir(val_root))):
    # skip hidden entries
    if bird.startswith("."):
        continue

    bird_dir = os.path.join(val_root, bird)
    y_dict[bird] = [
        make_prediction_from_MelSpec(model, os.path.join(bird_dir, f), BIRDS)
        for f in os.listdir(bird_dir)
        if not f.startswith(".")
    ]

100%|██████████| 264/264 [27:05<00:00,  6.16s/it]


In [17]:
from sklearn.metrics import accuracy_score

# Spot check on one species: the true label of every prediction stored under
# "aldfly" is "aldfly", so accuracy is the share of predictions equal to it.
aldfly_preds = y_dict["aldfly"]
print(accuracy_score(["aldfly"] * len(aldfly_preds), aldfly_preds))

0.796875


In [18]:
# Per-species accuracy: for each key of y_dict, compare its predictions with
# a list that repeats the key as the true label.
acc_dict = {
    species: accuracy_score([species] * len(predicted), predicted)
    for species, predicted in y_dict.items()
}

In [19]:
# Display the species -> accuracy mapping.
acc_dict

{'aldfly': 0.796875,
 'ameavo': 0.6190476190476191,
 'amebit': 0.6060606060606061,
 'amecro': 0.9230769230769231,
 'amegfi': 0.5,
 'amekes': 0.8,
 'amepip': 0.8,
 'amered': 0.7538461538461538,
 'amerob': 0.7272727272727273,
 'amewig': 0.8333333333333334,
 'amewoo': 0.88,
 'amtspa': 0.6938775510204082,
 'annhum': 0.82,
 'astfly': 0.8404255319148937,
 'baisan': 0.7647058823529411,
 'baleag': 0.7142857142857143,
 'balori': 0.5454545454545454,
 'banswa': 0.7849462365591398,
 'barswa': 0.8461538461538461,
 'bawwar': 0.7954545454545454,
 'belkin1': 0.5925925925925926,
 'belspa2': 0.6851851851851852,
 'bewwre': 0.875,
 'bkbcuc': 0.5416666666666666,
 'bkbmag1': 0.8378378378378378,
 'bkbwar': 0.6666666666666666,
 'bkcchi': 0.7714285714285715,
 'bkchum': 0.7692307692307693,
 'bkhgro': 0.7857142857142857,
 'bkpwar': 0.4146341463414634,
 'bktspa': 0.84375,
 'blkpho': 0.8448275862068966,
 'blugrb1': 0.76,
 'blujay': 0.8260869565217391,
 'bnhcow': 0.7162162162162162,
 'boboli': 0.6986301369863014,
 

In [20]:
import pickle

# Persist the per-species accuracies. The `with` block closes the file even
# if pickle.dump raises part-way through.
with open("error_analysis_b1.pkl", "wb") as a_file:
    pickle.dump(acc_dict, a_file)

In [21]:
# Reload the saved accuracies; the `with` block closes the handle, which the
# bare open() never did.
# Only unpickle files you wrote yourself: pickle.load can execute arbitrary code.
with open("error_analysis_b1.pkl", "rb") as a_file:
    output = pickle.load(a_file)
output

{'aldfly': 0.796875,
 'ameavo': 0.6190476190476191,
 'amebit': 0.6060606060606061,
 'amecro': 0.9230769230769231,
 'amegfi': 0.5,
 'amekes': 0.8,
 'amepip': 0.8,
 'amered': 0.7538461538461538,
 'amerob': 0.7272727272727273,
 'amewig': 0.8333333333333334,
 'amewoo': 0.88,
 'amtspa': 0.6938775510204082,
 'annhum': 0.82,
 'astfly': 0.8404255319148937,
 'baisan': 0.7647058823529411,
 'baleag': 0.7142857142857143,
 'balori': 0.5454545454545454,
 'banswa': 0.7849462365591398,
 'barswa': 0.8461538461538461,
 'bawwar': 0.7954545454545454,
 'belkin1': 0.5925925925925926,
 'belspa2': 0.6851851851851852,
 'bewwre': 0.875,
 'bkbcuc': 0.5416666666666666,
 'bkbmag1': 0.8378378378378378,
 'bkbwar': 0.6666666666666666,
 'bkcchi': 0.7714285714285715,
 'bkchum': 0.7692307692307693,
 'bkhgro': 0.7857142857142857,
 'bkpwar': 0.4146341463414634,
 'bktspa': 0.84375,
 'blkpho': 0.8448275862068966,
 'blugrb1': 0.76,
 'blujay': 0.8260869565217391,
 'bnhcow': 0.7162162162162162,
 'boboli': 0.6986301369863014,
 

In [24]:
# Lower-quartile accuracy across all species.
accuracies = list(output.values())
q = np.quantile(accuracies, 0.25)
# Number of species whose accuracy is strictly below that quartile.
len([species for species, acc in output.items() if acc < q])

66