In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import LearningRateScheduler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kapre.time_frequency import Melspectrogram
import warnings
import os
from tqdm import tqdm
import librosa

# Evaluate a graph-mode TF tensor and hand its value back to Python.
def toNumpy(tf_tensor):
    """Evaluate ``tf_tensor`` inside the current Keras backend session.

    Args:
        tf_tensor: object exposing ``eval(session=...)``, e.g. a graph-mode
            TensorFlow tensor.

    Returns:
        Whatever ``tf_tensor.eval`` yields for that session (a NumPy array
        for an ordinary tensor).
    """
    return tf_tensor.eval(session=tf.keras.backend.get_session())

# suppress warnings
# NOTE: this installs a blanket "ignore" filter, so every Python warning issued
# through the `warnings` module is hidden for the rest of the session.
warnings.filterwarnings("ignore")

### Data preprocessing

In [21]:
audio_path = "data/birdsong-recognition/train_audio/"

# os.listdir() returns entries in arbitrary order and also lists hidden files
# (e.g. .DS_Store). Keep only real sub-directories and sort them so the same
# five species are selected on every run and on every machine.
BIRDS = sorted(
    entry for entry in os.listdir(audio_path)
    if not entry.startswith(".") and os.path.isdir(os.path.join(audio_path, entry))
)[0:5]    # start with 5 birds

In [22]:
# Mel-spectrogram front end.
# ***** number of mel bands set to 13 instead of 224, trying to lower dimension of data *****
#
# The sample rate given to the layer has to be the rate the clips are resampled
# to when they are loaded (librosa.load(..., sr=16000) in the loading cell).
# With a different value the mel filterbank is laid out for the wrong frequency
# axis, fmax = sr/2 lands above the real Nyquist frequency and input_shape does
# not describe the 10-second clips that are actually fed in.
sr = 16000
layerMelSpectrogram = Melspectrogram(n_dft=1024, 
                       n_hop=256,
                       input_shape=(1, sr*10),    # 1 channel, 10 s of samples
                       padding='same', sr=sr, n_mels=13, fmin=1400, fmax=sr/2,
                       power_melgram=2.0, return_decibel_melgram=True,
                       trainable_fb=False, trainable_kernel=False)

In [23]:
# load training data & create MelSpectrogram
# x_train collects one flattened mel-spectrogram per clip, y_train the matching
# species name (the directory the clip was read from).
x_train = []
y_train = []

for bird in tqdm(BIRDS):
    bird_dir = os.path.join(audio_path, bird)
    # skip hidden files; sort so the row order of x_train / y_train is reproducible
    files = sorted(f for f in os.listdir(bird_dir) if not f.startswith("."))
    
    for file in files:
        # NOTE: this rebinds the notebook-level `sr` to the loading rate (16000)
        signal, sr = librosa.load(os.path.join(bird_dir, file), duration=10, sr=16000)
        
        # add 0 padding so every clip is exactly 10 s long; np.pad avoids building
        # a Python list with one element per audio sample
        n_missing = max(0, sr*10 - len(signal))
        signal = np.pad(np.asarray(signal, dtype=np.float32), (0, n_missing), mode="constant")
        
        # the layer is called on a (batch=1, channel=1, samples) array
        S = layerMelSpectrogram(signal.reshape(1, 1, -1))
        S = toNumpy(S)
        S = S.reshape(-1)
        
        # keep the row as a NumPy array instead of a list of Python floats
        x_train.append(S)
        y_train.append(bird)

100%|██████████| 5/5 [07:06<00:00, 85.20s/it]


In [24]:
# check x_train shape
# Stack the per-clip feature rows collected above into a single array, then
# display its shape as the cell output.
x_train = np.array(x_train)
x_train.shape

(382, 8125)

In [25]:
# standardize training data with scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics first, then apply them to the same matrix
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_train_std.shape

(382, 8125)

### Variational Autoencoder

In [42]:
# size of the latent code
n_z = 16

# encoder: one wide ReLU layer feeding two linear heads of n_z units each,
# one for the mean and one for the log-variance of the latent Gaussian
inputs = Input(shape=(x_train.shape[1],))
h_q = Dense(5000, activation='relu')(inputs)
mu = Dense(n_z, activation='linear')(h_q)
log_sigma_sq = Dense(n_z, activation='linear')(h_q)

# both heads side by side: [mu | log_sigma_sq]
encoder_out = concatenate([mu, log_sigma_sq])

In [43]:
# sampling
def sample_z(mu, log_sigma_sq):
    """Draw z = mu + sigma * eps with eps ~ N(0, I) (reparameterisation trick).

    Args:
        mu: tensor holding the mean of the latent Gaussian.
        log_sigma_sq: tensor of the same shape holding log(sigma^2).

    Returns:
        A tensor shaped like ``mu`` containing one random sample per entry.
    """
    eps = tf.random.normal(shape=tf.shape(mu))
    # sigma = exp(0.5 * log sigma^2). Halving inside the exponent is the same
    # value as sqrt(exp(log_sigma_sq)) but does not overflow to inf until the
    # log-variance is twice as large.
    return mu + tf.exp(0.5 * log_sigma_sq) * eps

# symbolic latent sample built from the two encoder heads; fed to the decoder below
z = sample_z(mu, log_sigma_sq)

In [44]:
# decoder
# The two layers are created as standalone objects (not applied inline) so the
# same layer instances can be applied to a second input tensor later on.
decoder_hidden = Dense(5000, activation='relu')
# sigmoid confines every reconstructed feature to the open interval (0, 1)
decoder_out = Dense(x_train.shape[1], activation='sigmoid')

# wire the decoder onto the sampled latent z
h_p = decoder_hidden(z)
outputs = decoder_out(h_p)

In [45]:
# Overall VAE model, for reconstruction and training
vae = Model(inputs, outputs)

# Encoder model, to encode input into the parameters of the latent Gaussian.
# Its output is encoder_out, i.e. mu and log_sigma_sq concatenated: the first n_z
# columns are the mean (the center point of the Gaussian), the last n_z the log-variance.
encoder = Model(inputs, encoder_out)

# Generator model, generate new data given latent variable z
# It applies the same decoder_hidden / decoder_out layer objects used in the VAE above.
d_in = Input(shape=(n_z,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)

In [46]:
def vae_loss(y_true, y_pred):
    """ loss = reconstruction loss + KL divergence for each data in minibatch

    Args:
        y_true: target batch; the cross-entropy below treats each entry as a
            Bernoulli target, so values are expected to lie in [0, 1].
        y_pred: reconstructed batch, same shape as ``y_true``.

    Returns:
        Scalar tensor: the mean over the batch of (cross-entropy summed over
        axis 1 + KL divergence summed over axis 1).

    Note:
        The KL term reads the notebook-level tensors ``mu`` and
        ``log_sigma_sq``, so this loss is tied to the model graph built from
        them.
    """
    # Clip predictions away from exactly 0 and 1 so both log terms stay finite.
    # Adding 1e-8 inside the log is not enough: in float32 the constant
    # 1e-8 + 1 rounds to 1, so a saturated prediction (y_pred == 1) gave
    # log(0) = -inf and then 0 * -inf = NaN.
    eps = 1e-7
    y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)

    recon_loss = -tf.reduce_sum(y_true * tf.math.log(y_pred) + 
                                (1-y_true) * tf.math.log(1 - y_pred), 1)

    # closed-form KL( N(mu, sigma^2) || N(0, 1) ), summed over the latent axis
    kl_loss = 0.5 * tf.reduce_sum(tf.exp(log_sigma_sq) + mu**2 - 1. - log_sigma_sq, 1)
    
    return tf.reduce_mean(recon_loss + kl_loss)

In [47]:
# compile with the custom loss (reconstruction term + KL term) and print the layer table
vae.compile(optimizer='adam', loss=vae_loss)
vae.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 8125)]       0                                            
__________________________________________________________________________________________________
dense_13 (Dense)                (None, 5000)         40630000    input_6[0][0]                    
__________________________________________________________________________________________________
dense_14 (Dense)                (None, 16)           80016       dense_13[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_Shape_2 (TensorFlow [(2,)]               0           dense_14[0][0]                   
____________________________________________________________________________________________

In [48]:
# The decoder ends in a sigmoid and vae_loss is a Bernoulli cross-entropy, so the
# reconstruction targets have to lie in [0, 1]. The decibel-scaled features in
# x_train are not confined to that range (and the zero-mean x_train_std is not
# either), so min-max scale the data before fitting.
x_min, x_max = x_train.min(), x_train.max()
x_range = (x_max - x_min) if x_max > x_min else 1.0    # guard against constant data
x_train_unit = (x_train - x_min) / x_range

# NOTE: re-running this cell continues training from the current weights.
# (vae.reset_states() was dropped: it only clears the state of stateful
# recurrent layers, and this model has none.)
vae.fit(x_train_unit, x_train_unit, 
        batch_size=32, 
        epochs=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 382 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fab919a0590>