In [6]:
import argparse
import keras
import os
import glob
import numpy as np
from l3embedding.model import load_model
from keras.models import Model
from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling1D, MaxPooling2D, Flatten, Activation, Lambda
import tensorflow as tf
import keras.regularizers as regularizers
from kapre.time_frequency import Melspectrogram

In [26]:
def construct_cnn_L3_melspec2_spec_model(n_mels=256, n_hop=242, n_dft=2048,
                                         fmax=None, asr=48000, halved_convs=False, audio_window_dur=1):
    """Build the convolutional audio sub-network that consumes a spectrogram.

    The network takes an input of shape ``(n_mels, n_frames, 1)`` where
    ``n_frames = 1 + int(asr * audio_window_dur / n_hop)``, applies an input
    batch normalization and then four convolutional blocks with 64, 128, 256
    and 512 filters (each halved when ``halved_convs`` is true). Every
    convolution uses a 3x3 kernel, ``'same'`` padding, ``he_normal``
    initialization and L2 weight decay of 1e-5. Blocks 1 and 2 end with a
    2x2 max pooling of stride 2; block 3 is pooled only if the second-to-last
    axis of its output is at least 2 wide. The last convolution is named
    ``'audio_embedding_layer'`` and is not followed by batch normalization
    or an activation.

    Parameters
    ----------
    n_mels : int
        Size of the first input axis.
    n_hop : int
        Hop size used to derive the number of frames.
    n_dft : int
        Accepted for interface compatibility; not used by this function.
    fmax : optional
        Accepted for interface compatibility; not used by this function.
    asr : int
        Presumably the audio sampling rate (samples per second) -- TODO confirm.
    halved_convs : bool
        If true, every block uses half the number of filters.
    audio_window_dur : number
        Multiplied with ``asr`` to get the window length in samples
        (presumably a duration in seconds -- TODO confirm).

    Returns
    -------
    tuple
        ``(model, x_a, y_a)``: the Keras ``Model`` named ``'audio_model'``,
        its input tensor and its output tensor.
    """
    weight_decay = 1e-5
    kernel_size = (3, 3)
    pool_size = (2, 2)
    filter_divisor = 2 if halved_convs else 1

    def conv2d(n_filters, **layer_kwargs):
        # Every convolution shares kernel size, padding, initializer and
        # regularization; a fresh regularizer instance is created per layer.
        return Conv2D(n_filters, kernel_size, padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **layer_kwargs)

    def conv_bn_relu(tensor, n_filters):
        # Convolution -> batch normalization -> ReLU, created in this order so
        # that Keras' auto-generated layer names match the unrolled version.
        tensor = conv2d(n_filters)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    n_frames = 1 + int((asr * audio_window_dur) / float(n_hop))
    x_a = Input(shape=(n_mels, n_frames, 1), dtype='float32')
    y_a = BatchNormalization()(x_a)

    # CONV BLOCK 1
    n_filter_a_1 = 64 // filter_divisor
    y_a = conv_bn_relu(y_a, n_filter_a_1)
    y_a = conv_bn_relu(y_a, n_filter_a_1)
    y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 2
    n_filter_a_2 = 128 // filter_divisor
    y_a = conv_bn_relu(y_a, n_filter_a_2)
    y_a = conv_bn_relu(y_a, n_filter_a_2)
    y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 3
    n_filter_a_3 = 256 // filter_divisor
    y_a = conv_bn_relu(y_a, n_filter_a_3)
    y_a = conv_bn_relu(y_a, n_filter_a_3)

    # Skip the last pooling when the second-to-last axis is already too narrow.
    if y_a.shape[-2] >= 2:
        y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 4
    n_filter_a_4 = 512 // filter_divisor
    y_a = conv_bn_relu(y_a, n_filter_a_4)
    # The embedding layer is a bare convolution: no batch norm, no activation.
    y_a = conv2d(n_filter_a_4, name='audio_embedding_layer')(y_a)

    # Name the model through the constructor; assigning `m.name` afterwards is
    # rejected by Keras versions where `name` is a read-only property.
    m = Model(inputs=x_a, outputs=y_a, name='audio_model')

    return m, x_a, y_a

In [23]:
def post_training_quantization(keras_model_path, out_path=None, quant_mode='default',
                               input_shape=None, calibration_steps=1024):
    """Convert a saved Keras model to a quantized TFLite flatbuffer on disk.

    Parameters
    ----------
    keras_model_path : str
        Path of the Keras model file passed to
        ``tf.lite.TFLiteConverter.from_keras_model_file``.
    out_path : str
        Destination file for the converted model. Required; the ``None``
        default is kept only for interface compatibility.
    quant_mode : {'default', 'size'}
        ``'default'`` uses ``tf.lite.Optimize.DEFAULT`` with default range
        stats ``(0, 1)``; ``'size'`` enables ``post_training_quantize`` and
        uses ``tf.lite.Optimize.OPTIMIZE_FOR_SIZE``.
    input_shape : tuple of int, optional
        Shape of a single calibration sample. Only read by the representative
        dataset generator, which is currently not attached to the converter.
    calibration_steps : int
        Number of samples the representative dataset generator yields.

    Raises
    ------
    ValueError
        If ``out_path`` is missing or ``quant_mode`` is not recognized. Both
        are checked before the model is loaded so that a bad call fails fast
        instead of after the conversion has run.
    """
    if out_path is None:
        raise ValueError('out_path must be provided to store the converted model!')
    if quant_mode not in ('default', 'size'):
        raise ValueError('Unrecognized Quantization mode!')

    def representative_dataset_gen():
        # Placeholder calibration data: constant all-ones tensors of shape
        # `input_shape` (np.ones, not np.ones_like, which would mimic the
        # shape tuple itself rather than build a tensor of that shape).
        print('Calibrating.........')
        for _ in range(calibration_steps):
            yield [np.ones(input_shape, dtype=np.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_model_path)

    if quant_mode == 'default':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.default_ranges_stats = (0, 1)
        # Calibration is intentionally left disabled; to enable it, assign
        # `converter.representative_dataset = representative_dataset_gen`.
    else:  # quant_mode == 'size'
        converter.post_training_quantize = True
        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]

    tflite_model = converter.convert()
    with open(out_path, "wb") as f:
        f.write(tflite_model)

In [29]:
# Directory that receives one untrained Keras model file per configuration.
test_models_dir = '/scratch/sk7898/test_models'
if not os.path.isdir(test_models_dir):
    os.makedirs(test_models_dir)

# name -> (samp_rate, n_mels, n_hop, n_dft, halved_convs)
model_configs = {
    '16000_64_320_1024_half': (16000, 64, 320, 1024, True),
    '16000_64_320_1024_same': (16000, 64, 320, 1024, False),
    '8000_64_160_1024_same': (8000, 64, 160, 1024, False),
    '8000_64_160_1024_half': (8000, 64, 160, 1024, True),
}
# Any name without an explicit entry falls back to this configuration.
fallback_config = (16000, 64, 160, 1024, False)

model_names = ['16000_64_320_1024_half',
               '16000_64_320_1024_same',
               '8000_64_160_1024_same',
               '8000_64_160_1024_half',
               '16000_64_160_1024_same']

for name in model_names:
    samp_rate, n_mels, n_hop, n_dft, halved_convs = model_configs.get(name, fallback_config)

    # Start each model from a clean graph, in inference mode.
    keras.backend.clear_session()
    keras.backend.set_learning_phase(0)

    # File names carry a '_half' suffix for halved models and no suffix otherwise.
    input_repr = '_'.join(map(str, (samp_rate, n_mels, n_hop, n_dft)))
    if halved_convs:
        input_repr += '_half'

    model_output_path = os.path.join(test_models_dir, 'test_l3_audio_{}.h5'.format(input_repr))
    audio_spec_embed_model, x_a, _ = construct_cnn_L3_melspec2_spec_model(
        n_mels=n_mels, n_hop=n_hop, n_dft=n_dft,
        halved_convs=halved_convs, asr=samp_rate)

    print('Model Representation: ', name)
    print('Input Shape: ', x_a.shape)
    audio_spec_embed_model.summary()
    audio_spec_embed_model.save(model_output_path)

Model Representation:  16000_64_320_1024_half
Input Shape:  (?, 64, 51, 1)
Model: "audio_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 51, 1)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64, 51, 1)         4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 51, 32)        320       
_________________________________________________________________
batch_normalization_2 (Batch (None, 64, 51, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 64, 51, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 51, 32)        9248      
______________________________________________

Model Representation:  8000_64_160_1024_half
Input Shape:  (?, 64, 51, 1)
Model: "audio_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 51, 1)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64, 51, 1)         4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 51, 32)        320       
_________________________________________________________________
batch_normalization_2 (Batch (None, 64, 51, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 64, 51, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 51, 32)        9248      
_______________________________________________

In [30]:
# Input shape of each model below, in the same order as `keras_models`.
input_shapes = [(1, 64, 51, 1), (1, 64, 51, 1), (1, 64, 101, 1)]
test_models_dir = '/scratch/sk7898/test_models'
keras_models = ['test_l3_audio_8000_64_160_1024.h5', 'test_l3_audio_8000_64_160_1024_half.h5', 'test_l3_audio_16000_64_160_1024.h5']

# Conversion settings shared by every model.
calibration_steps = 1
quant_mode = 'default'

for input_shape, model in zip(input_shapes, keras_models):
    # Drop the file extension. str.strip('.h5') would instead remove any of the
    # characters '.', 'h', '5' from both ends and can eat into the base name.
    input_repr = os.path.splitext(model)[0]
    keras_model_path = os.path.join(test_models_dir, model)
    quant_output_path = os.path.join(test_models_dir, 'quant_{}.tflite'.format(input_repr))

    post_training_quantization(keras_model_path, out_path=quant_output_path, quant_mode=quant_mode,
                               input_shape=input_shape, calibration_steps=calibration_steps)

INFO:tensorflow:Froze 48 variables.
INFO:tensorflow:Converted 48 variables to const ops.
INFO:tensorflow:Froze 48 variables.
INFO:tensorflow:Converted 48 variables to const ops.
INFO:tensorflow:Froze 48 variables.
INFO:tensorflow:Converted 48 variables to const ops.
