In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import argparse
import h5py
import keras
import os
import glob
import random
import librosa
from l3embedding.audio import pcm2float
from resampy import resample
import numpy as np
from l3embedding.model import load_model
from keras.models import Model
from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling1D, MaxPooling2D, Flatten, Activation, Lambda
import tensorflow as tf
import keras.regularizers as regularizers
from kapre.time_frequency import Melspectrogram

Using TensorFlow backend.


In [17]:
def construct_cnn_L3_melspec2_spec_model(n_mels=256, n_hop = 242, n_dft = 2048,
                                         fmax=None, asr = 48000, halved_convs=False, audio_window_dur = 1):
    """Build the convolutional audio sub-network that consumes a mel-spectrogram.

    Layout: an input batch-normalisation, then four blocks of two 3x3
    'same'-padded convolutions each. Every convolution except the last one is
    followed by batch-normalisation and ReLU. Blocks 1 and 2 end with a 2x2
    max-pool; block 3 is pooled only if its second-to-last dimension is at
    least 2; block 4 is not pooled and its final convolution is named
    'audio_embedding_layer'.

    Args:
        n_mels: height of the input tensor (number of mel bands).
        n_hop: hop length in samples, used to derive the number of input frames.
        n_dft: accepted for signature compatibility; not used here.
        fmax: accepted for signature compatibility; not used here.
        asr: audio sampling rate, used to derive the number of input frames.
        halved_convs: when truthy, every convolution gets half as many filters.
        audio_window_dur: multiplied by ``asr`` to get the window length in
            samples (presumably a duration in seconds).

    Returns:
        Tuple ``(model, input_tensor, output_tensor)`` where ``model`` is the
        Keras ``Model`` named 'audio_model'.
    """
    weight_decay = 1e-5

    def conv(tensor, n_filters, kernel_size, **extra):
        # 'same'-padded, he_normal-initialised, L2-regularised convolution
        layer = Conv2D(n_filters, kernel_size, padding='same',
                       kernel_initializer='he_normal',
                       kernel_regularizer=regularizers.l2(weight_decay),
                       **extra)
        return layer(tensor)

    def bn_relu(tensor):
        # batch-normalisation followed by ReLU
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # Filter counts for blocks 1..4, optionally halved
    filter_counts = [np.uint8(64), np.uint8(128), np.uint16(256), np.uint16(512)]
    if halved_convs:
        filter_counts = [count // 2 for count in filter_counts]

    small_kernel = (np.uint8(3), np.uint8(3))
    pool_window = (np.uint8(2), np.uint8(2))

    n_frames = 1 + int((asr * audio_window_dur) / float(n_hop))
    x_a = Input(shape=(n_mels, n_frames, 1), dtype=np.float32)
    y_a = BatchNormalization()(x_a)

    # CONV BLOCKS 1 and 2: two convolutions, then an unconditional 2x2 pool
    for n_filters in filter_counts[:2]:
        y_a = bn_relu(conv(y_a, n_filters, small_kernel))
        y_a = bn_relu(conv(y_a, n_filters, small_kernel))
        y_a = MaxPooling2D(pool_size=pool_window, strides=2)(y_a)

    # CONV BLOCK 3: pooled only while there is still something to pool
    y_a = bn_relu(conv(y_a, filter_counts[2], small_kernel))
    y_a = bn_relu(conv(y_a, filter_counts[2], small_kernel))
    if y_a.shape[-2] >= 2:
        y_a = MaxPooling2D(pool_size=pool_window, strides=2)(y_a)

    # CONV BLOCK 4: the last convolution is the embedding layer (no BN/ReLU after it)
    y_a = bn_relu(conv(y_a, filter_counts[3], (3, 3)))
    y_a = conv(y_a, filter_counts[3], (3, 3), name='audio_embedding_layer')

    m = Model(inputs=x_a, outputs=y_a)
    m.name = 'audio_model'

    return m, x_a, y_a

In [3]:
def shuffle_files(iterable):
    """Return an iterator over the items of ``iterable`` in random order.

    The items are copied into a new list first, so the argument itself is
    never modified. Ordering is driven by the global ``random`` module state.
    """
    shuffled = [*iterable]
    random.shuffle(shuffled)
    return iter(shuffled)

def amplitude_to_db(S, amin=1e-10, dynamic_range=80.0):
    """Convert an amplitude spectrogram to decibels relative to its own peak.

    The magnitude of ``S`` is squared to obtain power, floored at ``amin``,
    converted with ``10 * log10``, shifted so the maximum is 0 dB and finally
    clipped from below at ``-dynamic_range``.

    Args:
        S: array-like (or scalar) of real or complex amplitudes.
        amin: lower bound applied to the power before taking the logarithm.
        dynamic_range: values more than this many dB below the peak are
            clipped to ``-dynamic_range``.

    Returns:
        Array with the same shape as ``S`` holding values in
        ``[-dynamic_range, 0]``. An empty input yields an empty array.
    """
    magnitude = np.abs(np.asarray(S))
    # Integer/boolean magnitudes would silently wrap around when squared in
    # their own dtype, so promote them to floating point first.
    if not np.issubdtype(magnitude.dtype, np.floating):
        magnitude = magnitude.astype(np.float64)
    # Not done in place: np.abs returns a scalar for 0-d input, which cannot
    # be used as an ``out=`` target.
    power = np.square(magnitude)

    log_spec = 10.0 * np.log10(np.maximum(amin, power))
    # Nothing to normalise (and .max() would raise) on an empty spectrogram
    if log_spec.size == 0:
        return log_spec
    log_spec = log_spec - log_spec.max()

    return np.maximum(log_spec, -dynamic_range)

def get_melspectrogram(frame, n_fft=2048, mel_hop_length=242, samp_rate=48000, n_mels=256, fmax=None):
    """Compute a dB-scaled mel spectrogram for one audio frame.

    The frame goes through a Hann-windowed, centred STFT with constant
    padding; the STFT magnitude is projected onto ``n_mels`` HTK-style mel
    bands and the result is passed through ``amplitude_to_db``.

    Args:
        frame: audio samples handed to ``librosa.core.stft``.
        n_fft: FFT size.
        mel_hop_length: STFT hop length in samples.
        samp_rate: sampling rate used for the mel filterbank.
        n_mels: number of mel bands.
        fmax: upper frequency passed to the mel filterbank (``None`` leaves
            the choice to librosa).

    Returns:
        The array produced by ``amplitude_to_db`` for the mel spectrogram.
    """
    stft_magnitude = np.abs(
        librosa.core.stft(
            frame,
            n_fft=n_fft,
            hop_length=mel_hop_length,
            window='hann',
            center=True,
            pad_mode='constant',
        )
    )
    mel_spec = librosa.feature.melspectrogram(
        sr=samp_rate,
        S=stft_magnitude,
        n_fft=n_fft,
        n_mels=n_mels,
        fmax=fmax,
        power=1.0,
        htk=True,
    )
    return amplitude_to_db(np.array(mel_spec))

In [18]:
def quant_data_generator(data_dir, batch_size=512, samp_rate=48000, n_fft=2048, \
                         n_mels=256, mel_hop_length=242, hop_size=0.1, fmax=None,\
                         random_state=None, start_batch_idx=None):
    """Return one batch of mel-spectrograms built from the HDF5 files in ``data_dir``.

    Despite its name this is a plain function, not a generator: it walks the
    files of ``data_dir`` in shuffled order, accumulates rows of each file's
    ``'audio'`` dataset until ``batch_size`` rows are collected (a batch may
    span several files) and *returns* the first batch it is allowed to build.

    Args:
        data_dir: directory whose entries are all opened as HDF5 files.
        batch_size: number of audio rows per batch.
        samp_rate: target sampling rate. The stored audio is treated as
            48 kHz and is resampled when ``samp_rate`` differs.
        n_fft, n_mels, mel_hop_length, fmax: forwarded to ``get_melspectrogram``.
        hop_size: accepted for signature compatibility; not used here.
        random_state: seed for the ``random`` module; ``None`` leaves the
            global random state untouched.
        start_batch_idx: if given, batches with a smaller index are counted
            but neither loaded nor returned.

    Returns:
        Array of shape ``(batch_size, H, W, 1)`` where ``(H, W)`` is the shape
        returned by ``get_melspectrogram``, or ``None`` when the data runs out
        before a full batch could be returned.
    """
    # `is not None` so that a seed of 0 is honoured as well
    if random_state is not None:
        random.seed(random_state)

    batch = None
    curr_batch_size = 0
    batch_idx = 0

    for fname in shuffle_files(os.listdir(data_dir)):
        print(fname)
        data_batch_path = os.path.join(data_dir, fname)
        blob_start_idx = 0

        # The context manager closes the file on every exit path, including
        # the early `return` below that leaves a file only partially read.
        with h5py.File(data_batch_path, 'r') as data_blob:
            blob_size = len(data_blob['audio'])

            while blob_start_idx < blob_size:
                # Take only as many rows as are still missing from the batch
                blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size)

                # If we are starting from a particular batch, skip loading all
                # of the prior batches
                if start_batch_idx is None or batch_idx >= start_batch_idx:
                    chunk = data_blob['audio'][blob_start_idx:blob_end_idx]
                    if batch is None:
                        batch = chunk
                    else:
                        batch = np.concatenate([batch, chunk])

                curr_batch_size += blob_end_idx - blob_start_idx
                blob_start_idx = blob_end_idx

                if curr_batch_size == batch_size:
                    # If we are starting from a particular batch, skip
                    # returning all of the prior batches
                    if start_batch_idx is None or batch_idx >= start_batch_idx:
                        # Convert audio to float (and resample if required)
                        if samp_rate == 48000:
                            batch = pcm2float(batch, dtype='float32')
                        else:
                            batch = resample(pcm2float(batch, dtype='float32'), sr_orig=48000,
                                             sr_new=samp_rate)

                        X = [get_melspectrogram(batch[i].flatten(), n_fft=n_fft, mel_hop_length=mel_hop_length,
                                                samp_rate=samp_rate, n_mels=n_mels, fmax=fmax)
                             for i in range(batch_size)]

                        # Append a trailing channel axis, e.g. (64, 256, 191, 1)
                        return np.array(X)[:, :, :, np.newaxis]

                    batch_idx += 1
                    curr_batch_size = 0
                    batch = None

In [36]:
def post_training_quantization(keras_model_path, out_path=None, quant_mode='default',\
                               input_shape=None, calibration_steps=5, samp_rate=8000, n_fft=1024,\
                               n_mels=64, mel_hop_length=160,
                               calibrate_data_dir='/beegfs/work/AudioSetSamples/music_train'):
    """Convert a saved Keras model to a quantized TFLite model and write it to disk.

    Args:
        keras_model_path: path of the Keras model file to convert.
        out_path: destination of the ``.tflite`` file. Required.
        quant_mode: ``'default'`` enables ``tf.lite.Optimize.DEFAULT`` with
            int8 input/output types and a representative dataset;
            ``'size'`` enables ``post_training_quantize`` with
            ``tf.lite.Optimize.OPTIMIZE_FOR_SIZE``.
        input_shape: accepted for signature compatibility; not used here.
        calibration_steps: number of samples drawn for calibration
            (``'default'`` mode only).
        samp_rate, n_fft, n_mels, mel_hop_length: forwarded to
            ``quant_data_generator`` when building calibration samples.
        calibrate_data_dir: directory of HDF5 files used for calibration.

    Raises:
        ValueError: if ``quant_mode`` is not recognised, if ``out_path`` is
            missing, or if no calibration sample could be produced.
    """
    # Validate arguments up front rather than after the model load / conversion
    if quant_mode not in ('default', 'size'):
        raise ValueError('Unrecognized Quantization mode!')
    if out_path is None:
        raise ValueError('out_path must be provided to write the converted TFLite model')

    def representative_dataset_gen():
        print('Calibrating.........')
        for _ in range(calibration_steps):
            # Each call reshuffles the directory and returns a single sample
            x = quant_data_generator(calibrate_data_dir, batch_size=1,
                                     samp_rate=samp_rate, n_fft=n_fft,
                                     n_mels=n_mels, mel_hop_length=mel_hop_length)
            if x is None:
                raise ValueError('No calibration data could be read from {}'.format(calibrate_data_dir))
            yield [np.array(x).astype(np.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_model_path)

    if quant_mode == 'default':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.inference_input_type = tf.int8
        converter.inference_output_type = tf.int8
        converter.representative_dataset = representative_dataset_gen
    else:  # quant_mode == 'size'
        converter.post_training_quantize = True
        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]

    tflite_model = converter.convert()
    with open(out_path, "wb") as f:
        f.write(tflite_model)

In [20]:
# Build an untrained audio sub-network for every selected input
# representation and save it as a Keras .h5 file.
test_models_dir = '/scratch/sk7898/test_models'
if not os.path.isdir(test_models_dir):
    os.makedirs(test_models_dir)

# name -> (samp_rate, n_mels, n_hop, n_dft, halved_convs)
model_configs = {
    '16000_64_320_1024_half': (16000, 64, 320, 1024, True),
    '16000_64_320_1024_same': (16000, 64, 320, 1024, False),
    '8000_64_160_1024_same': (8000, 64, 160, 1024, False),
    '8000_64_160_1024_half': (8000, 64, 160, 1024, True),
    '16000_64_160_1024_same': (16000, 64, 160, 1024, False),
}
# Any name that is not in the table falls back to this configuration
default_config = (16000, 64, 160, 1024, False)

# Only this representation is built; use list(model_configs) to build them all
model_names = ['8000_64_160_1024_half']

for name in model_names:
    samp_rate, n_mels, n_hop, n_dft, halved_convs = model_configs.get(name, default_config)

    # Start from a fresh graph in inference mode for every model
    keras.backend.clear_session()
    keras.backend.set_learning_phase(0)

    # The file name carries a '_half' suffix only for halved models
    input_repr = '{}_{}_{}_{}'.format(samp_rate, n_mels, n_hop, n_dft)
    if halved_convs:
        input_repr += '_half'

    model_output_path = os.path.join(test_models_dir, 'test_l3_audio_{}.h5'.format(input_repr))
    audio_spec_embed_model, x_a, _ = construct_cnn_L3_melspec2_spec_model(
        n_mels=n_mels, n_hop=n_hop, n_dft=n_dft,
        halved_convs=halved_convs, asr=samp_rate)
    print('Model Representation: ', name)
    print('Input Shape: ', x_a.shape)
    audio_spec_embed_model.summary()
    audio_spec_embed_model.save(model_output_path)

Model Representation:  8000_64_160_1024_half
Input Shape:  (?, 64, 51, 1)
Model: "audio_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 51, 1)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64, 51, 1)         4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 51, 32)        320       
_________________________________________________________________
batch_normalization_2 (Batch (None, 64, 51, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 64, 51, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 51, 32)        9248      
_______________________________________________

In [37]:
# Quantize every saved Keras model and write the result next to it as
# 'quant_<model name>.tflite'.
input_shapes = [(1, 64, 51, 1)]
test_models_dir = '/scratch/sk7898/test_models'
keras_models = ['test_l3_audio_8000_64_160_1024_half.h5']

for input_shape, model in zip(input_shapes, keras_models):
    # splitext removes only the extension. str.strip('.h5') would instead
    # remove every leading/trailing '.', 'h' or '5' character, which corrupts
    # names that start with 'h' or end in '5'.
    input_repr = os.path.splitext(model)[0]
    keras_model_path = os.path.join(test_models_dir, model)
    quant_output_path = os.path.join(test_models_dir, 'quant_{}.tflite'.format(input_repr))

    calibration_steps = 5
    quant_mode = 'default'

    post_training_quantization(keras_model_path, out_path=quant_output_path, quant_mode=quant_mode, \
                               input_shape=input_shape, calibration_steps=calibration_steps)

INFO:tensorflow:Froze 48 variables.
INFO:tensorflow:Converted 48 variables to const ops.
Calibrating.........
20180180_6_66.h5
20180270_0_119.h5
20187003_3_53.h5
20180276_6_161.h5
20187008_0_12.h5


In [38]:
# Load the quantized model for inspection. This relies on `quant_output_path`
# still holding the value assigned in the last iteration of the quantization
# loop in the previous cell.
interpreter = tf.lite.Interpreter(model_path=str(quant_output_path))
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Drop the leading dimension of the first input/output tensor shapes and keep
# the tensor indices. NOTE: this rebinds `input_shape`, which the previous
# cell used as its loop variable.
input_shape = input_details[0]['shape'][1:]
output_shape = output_details[0]['shape'][1:]
input_index = input_details[0]['index']
output_index = output_details[0]['index']

interpreter.allocate_tensors()

# Show the first input/output tensor descriptions (name, shape, dtype and
# quantization parameters, as seen in the captured output).
print("== Input details ==")
print(interpreter.get_input_details()[0])
print("type:", input_details[0]['dtype'])
print("\n== Output details ==")
print(interpreter.get_output_details()[0])

== Input details ==
{'name': 'input_1', 'index': 20, 'shape': array([ 1, 64, 51,  1], dtype=int32), 'dtype': <class 'numpy.int8'>, 'quantization': (0.3137255012989044, 127)}
type: <class 'numpy.int8'>

== Output details ==
{'name': 'audio_embedding_layer/BiasAdd', 'index': 7, 'shape': array([  1,   8,   6, 256], dtype=int32), 'dtype': <class 'numpy.int8'>, 'quantization': (1.5971095561981201, 4)}
