In [1]:
import warnings
# Suppress every FutureWarning so deprecation chatter does not clutter the
# cell outputs. NOTE(review): presumably aimed at the warnings emitted while
# importing the libraries in the next cell - confirm before removing.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import os
import random

import numpy as np
import librosa
import h5py
import tensorflow as tf
import keras
from keras import regularizers
from keras.layers import Activation, BatchNormalization, Conv2D, Input, MaxPooling2D
from keras.models import Model
from keras.optimizers import Adam
from resampy import resample
import pescador
from skimage import img_as_float
from tqdm import tqdm

from l3embedding.audio import pcm2float

Using TensorFlow backend.


In [3]:
def shuffle_files(iterable):
    """Return an iterator over the items of ``iterable`` in random order.

    The items are first copied into a fresh list, so the caller's object is
    never reordered. The permutation is drawn by ``random.shuffle`` and
    therefore follows the state of the module-level ``random`` generator.
    """
    shuffled = [*iterable]
    random.shuffle(shuffled)
    return iter(shuffled)

In [4]:
def _conv_bn_relu(x, n_filters, weight_decay):
    """Apply a 3x3 'same' Conv2D -> BatchNormalization -> ReLU stack to ``x``."""
    x = Conv2D(n_filters, (3, 3), padding='same',
               kernel_initializer='he_normal',
               kernel_regularizer=regularizers.l2(weight_decay))(x)
    x = BatchNormalization()(x)
    return Activation('relu')(x)


def construct_cnn_L3_melspec2_spec_model(n_mels=256, n_hop = 242, n_dft = 2048,
                                         fmax=None, asr = 48000, halved_convs=False, audio_window_dur = 1):
    """Build the audio CNN that maps a mel-spectrogram to an embedding map.

    The network is an input BatchNormalization followed by four blocks of two
    3x3 convolutions (64/128/256/512 filters). Blocks 1-2 end with a 2x2 max
    pool, block 3 pools only when the second-to-last axis still has at least
    two entries, and block 4 ends on the un-activated convolution named
    ``'audio_embedding_layer'``.

    Parameters
    ----------
    n_mels : int
        Size of the first input axis.
    n_hop : int
        Hop length in samples; together with ``asr`` and ``audio_window_dur``
        it fixes the second input axis:
        ``1 + int(asr * audio_window_dur / n_hop)``.
    n_dft, fmax :
        Accepted for signature compatibility; not used by this function.
    asr : int
        Audio sampling rate in Hz.
    halved_convs : bool
        If true, every block uses half as many filters.
    audio_window_dur : number
        Window duration (multiplies ``asr`` to give the sample count).

    Returns
    -------
    (m, x_a, y_a)
        The Keras model (named ``'audio_model'``), its input tensor and its
        output tensor.
    """
    weight_decay = 1e-5

    n_frames = 1 + int((asr * audio_window_dur) / float(n_hop))
    x_a = Input(shape=(n_mels, n_frames, 1), dtype=np.float32)
    y_a = BatchNormalization()(x_a)

    # Plain Python ints: numpy unsigned scalars are easy to overflow and buy
    # nothing as layer hyper-parameters.
    widths = [64, 128, 256, 512]
    if halved_convs:
        widths = [w // 2 for w in widths]
    pool_size = (2, 2)

    # CONV BLOCKS 1 and 2: conv, conv, pool
    for n_filters in widths[:2]:
        y_a = _conv_bn_relu(y_a, n_filters, weight_decay)
        y_a = _conv_bn_relu(y_a, n_filters, weight_decay)
        y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 3: pooling is skipped once the axis is down to one entry
    y_a = _conv_bn_relu(y_a, widths[2], weight_decay)
    y_a = _conv_bn_relu(y_a, widths[2], weight_decay)
    if y_a.shape[-2] >= 2:
        y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 4: the final convolution has no BatchNorm / ReLU after it
    y_a = _conv_bn_relu(y_a, widths[3], weight_decay)
    y_a = Conv2D(widths[3], (3, 3),
                 kernel_initializer='he_normal',
                 name='audio_embedding_layer', padding='same',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)

    # Name the model at construction time; assigning ``m.name`` afterwards is
    # not permitted by every Keras version.
    m = Model(inputs=x_a, outputs=y_a, name='audio_model')

    return m, x_a, y_a

In [5]:
def amplitude_to_db(S, amin=1e-10, dynamic_range=80.0):
    """Convert an amplitude spectrogram to decibels relative to its own peak.

    ``|S|`` is squared, floored at ``amin`` (the floor is applied to the
    squared values), converted with ``10 * log10``, shifted so that the
    largest entry is 0 dB and finally clipped from below at
    ``-dynamic_range``.

    Parameters
    ----------
    S : np.ndarray
        Input spectrogram; it is not modified.
    amin : float
        Lower bound applied to the squared magnitudes before the logarithm.
    dynamic_range : float
        Span in dB kept below the peak.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``S`` with values in
        ``[-dynamic_range, 0]``.
    """
    spec = np.abs(S)
    # Square in place: from here on ``spec`` holds power, not magnitude.
    np.square(spec, out=spec)

    db = 10.0 * np.log10(np.maximum(amin, spec))
    peak = db.max()
    return np.maximum(db - peak, -dynamic_range)

In [6]:
def get_melspectrogram(frame, n_fft=2048, mel_hop_length=242, samp_rate=48000, n_mels=256, fmax=None):
    """Compute the dB-scaled mel spectrogram of one audio frame.

    Steps: magnitude STFT (Hann window, centered, constant padding) ->
    ``librosa.feature.melspectrogram`` fed with that magnitude as ``S``
    (``htk=True``, ``power=1.0``) -> ``amplitude_to_db``.

    Parameters
    ----------
    frame : np.ndarray
        Audio samples passed straight to ``librosa.core.stft``.
    n_fft : int
        FFT size.
    mel_hop_length : int
        STFT hop length in samples.
    samp_rate : int
        Sampling rate handed to the mel filter bank.
    n_mels : int
        Number of mel bands.
    fmax : float or None
        Upper frequency of the mel filter bank, forwarded to librosa.

    Returns
    -------
    np.ndarray
        The output of ``amplitude_to_db`` for the mel spectrogram.
    """
    stft_mag = np.abs(
        librosa.core.stft(frame, n_fft=n_fft, hop_length=mel_hop_length,
                          window='hann', center=True, pad_mode='constant')
    )
    mel_mag = librosa.feature.melspectrogram(sr=samp_rate, S=stft_mag, n_fft=n_fft,
                                             n_mels=n_mels, fmax=fmax, power=1.0, htk=True)
    return amplitude_to_db(np.array(mel_mag))

In [7]:
def quant_data_generator(data_dir, batch_size=512, samp_rate=48000, n_fft=2048,
                         n_mels=256, mel_hop_length=242, hop_size=0.1, fmax=None,
                         random_state=None, start_batch_idx=None):
    """Return ONE batch of dB mel spectrograms read from HDF5 files.

    Despite its name this is a plain function, not a generator: it walks the
    files of ``data_dir`` in shuffled order, accumulates rows of each file's
    ``'audio'`` dataset until ``batch_size`` rows are collected (a batch may
    span several files), converts them and returns.

    Parameters
    ----------
    data_dir : str
        Directory whose entries are all opened with ``h5py.File``.
    batch_size : int
        Number of audio rows per batch.
    samp_rate : int
        Target sampling rate. Audio is resampled from 48000 Hz unless this is
        exactly 48000.
    n_fft, n_mels, mel_hop_length, fmax :
        Forwarded to ``get_melspectrogram``.
    hop_size : float
        Unused; kept so existing callers keep working.
    random_state : int or None
        Seed for the ``random`` module (controls the file order). ``None``
        leaves the generator state untouched.
    start_batch_idx : int or None
        If given, that many full batches are skipped (counted but neither
        loaded nor converted) before one is returned.

    Returns
    -------
    np.ndarray or None
        Array ``np.array(X)[:, :, :, np.newaxis]`` where ``X`` holds one
        spectrogram per row, or ``None`` when the directory does not contain
        enough rows to fill the requested batch.
    """
    # ``is not None`` so that a seed of 0 is honoured as well.
    if random_state is not None:
        random.seed(random_state)

    batch = None
    curr_batch_size = 0
    batch_idx = 0

    for fname in shuffle_files(os.listdir(data_dir)):
        print(fname)
        data_batch_path = os.path.join(data_dir, fname)
        blob_start_idx = 0

        # The context manager closes the file on every exit path, including
        # the early ``return`` below.
        with h5py.File(data_batch_path, 'r') as data_blob:
            audio = data_blob['audio']
            blob_size = len(audio)

            while blob_start_idx < blob_size:
                blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size)

                # If we are starting from a particular batch, skip loading
                # and converting all of the prior batches.
                wanted = start_batch_idx is None or batch_idx >= start_batch_idx
                if wanted:
                    chunk = audio[blob_start_idx:blob_end_idx]
                    batch = chunk if batch is None else np.concatenate([batch, chunk])

                curr_batch_size += blob_end_idx - blob_start_idx
                blob_start_idx = blob_end_idx

                if curr_batch_size != batch_size:
                    continue

                if wanted:
                    # Convert audio to float, resampling when needed
                    batch = pcm2float(batch, dtype='float32')
                    if samp_rate != 48000:
                        batch = resample(batch, sr_orig=48000, sr_new=samp_rate)

                    X = [get_melspectrogram(batch[i].flatten(), n_fft=n_fft,
                                            mel_hop_length=mel_hop_length,
                                            samp_rate=samp_rate, n_mels=n_mels, fmax=fmax)
                         for i in range(batch_size)]

                    # Add a trailing channel axis
                    return np.array(X)[:, :, :, np.newaxis]

                batch_idx += 1
                curr_batch_size = 0
                batch = None

    # Ran out of data before a (non-skipped) batch could be completed.
    return None

In [8]:
def single_epoch_test_data_generator(data_dir, epoch_size, **kwargs):
    """Yield ``epoch_size`` batches, each from a fresh ``quant_data_generator`` call.

    ``data_dir`` and all keyword arguments are forwarded unchanged on every
    call. Nothing is computed until the generator is iterated.
    """
    yield from (quant_data_generator(data_dir, **kwargs) for _ in range(epoch_size))

In [9]:
def keras_for_tflite(model_path, output_path):
    """Rebuild the audio model, append a max-pooling head and save it.

    The Keras model stored at ``model_path`` is loaded, its weights are copied
    into a freshly constructed audio CNN, and a ``MaxPooling2D`` layer whose
    pool size equals the spatial size of the loaded model's
    ``'audio_embedding_layer'`` output is appended. The resulting model is
    written to ``output_path``.

    Parameters
    ----------
    model_path : str
        Path of the saved Keras model to load.
    output_path : str
        Where the re-assembled model is saved.

    Returns
    -------
    keras.models.Model
        The model that was saved.
    """
    keras.backend.clear_session()
    keras.backend.set_learning_phase(0)

    l3model = keras.models.load_model(model_path)

    # The builder returns (model, input_tensor, output_tensor); only the model
    # is needed here.
    n_model, _, _ = construct_cnn_L3_melspec2_spec_model(n_mels=256, n_hop=242, n_dft=2048,
                                                         fmax=None, asr=48000, halved_convs=False,
                                                         audio_window_dur=1)
    # get_weights must be CALLED; passing the bound method is an error.
    n_model.set_weights(l3model.get_weights())

    embed_layer = l3model.get_layer('audio_embedding_layer')
    # Entries 1:3 of the output shape are used as the pooling window.
    pool_size = tuple(embed_layer.get_output_shape_at(0)[1:3])

    y_a = keras.layers.MaxPooling2D(pool_size=pool_size, padding='same')(n_model.output)
    model = keras.models.Model(inputs=n_model.input, outputs=y_a)
    model.save(output_path)

    return model

In [10]:
def quantize_keras_to_tflite(tflite_model_file, keras_model_path, quant_mode='default',
                             calibrate_data_dir=None, num_calibration_steps=1024):
    """Convert a saved Keras model to a post-training-quantized TFLite file.

    Parameters
    ----------
    tflite_model_file : str
        Output path. The NAME OF ITS PARENT DIRECTORY must follow the pattern
        ``<a>_<b>_<c>_<samp_rate>_<n_mels>_<mel_hop_length>_..._<n_fft>``;
        the calibration generator parses its audio parameters from it.
    keras_model_path : str
        Keras ``.h5`` model handed to
        ``tf.lite.TFLiteConverter.from_keras_model_file``.
    quant_mode : {'default', 'size'}
        ``'default'`` sets ``Optimize.DEFAULT`` and supplies a representative
        dataset for calibration; ``'size'`` sets ``post_training_quantize``
        and ``Optimize.OPTIMIZE_FOR_SIZE`` without calibration data.
    calibrate_data_dir : str or None
        Directory passed to ``quant_data_generator``; required for
        ``'default'``.
    num_calibration_steps : int
        Number of single-example batches yielded for calibration.

    Raises
    ------
    ValueError
        For an unknown ``quant_mode`` or a missing ``calibrate_data_dir`` in
        ``'default'`` mode. Both are checked before the model is loaded.
    """
    # Fail fast, before the (potentially slow) model load.
    if quant_mode not in ('default', 'size'):
        raise ValueError('Unrecognized Quantization mode!')
    if quant_mode == 'default' and calibrate_data_dir is None:
        raise ValueError('Quantized activation calibration needs data directory!')

    def representative_dataset_gen():
        # Parse only the LAST directory component; splitting the whole path
        # breaks as soon as any ancestor directory contains an underscore.
        l3_model = os.path.basename(os.path.dirname(tflite_model_file))
        splits = l3_model.split('_')
        samp_rate = int(splits[3])
        n_mels = int(splits[4])
        mel_hop_length = int(splits[5])
        n_fft = int(splits[-1])

        print('Calibrating.........')
        for _ in range(num_calibration_steps):
            x = quant_data_generator(calibrate_data_dir, batch_size=1,
                                     samp_rate=samp_rate, n_fft=n_fft,
                                     n_mels=n_mels, mel_hop_length=mel_hop_length)
            yield [np.array(x).astype(np.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_model_path)

    if quant_mode == 'default':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.default_ranges_stats = (0, 1)
        converter.representative_dataset = representative_dataset_gen
    else:  # 'size'
        converter.post_training_quantize = True
        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]

    tflite_model = converter.convert()
    with open(tflite_model_file, "wb") as f:
        f.write(tflite_model)

In [11]:
def get_embeddings_batch_from_tflite(data_gen, tflite_model_file, epoch_size, batch_size, emb_len=512):
    """Run a TFLite model over every batch of ``data_gen`` and collect the outputs.

    Parameters
    ----------
    data_gen : iterable
        Yields input batches; each is cast to ``float32`` and must match the
        interpreter input once resized to ``batch_size``.
    tflite_model_file : str or path-like
        TFLite model to load.
    epoch_size : int
        Number of batches expected from ``data_gen`` (first axis of the
        result).
    batch_size : int
        Examples per batch; the interpreter input is resized to it.
    emb_len : int
        Length of one embedding (last axis of the result).

    Returns
    -------
    np.ndarray
        Array of shape ``(epoch_size, batch_size, emb_len)``; slot ``i`` holds
        the model output for the ``i``-th batch, reshaped to
        ``(output.shape[0], output.shape[-1])``.
    """
    predictions = np.zeros(shape=(epoch_size, batch_size, emb_len))

    interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    input_shape = input_details[0]['shape'][1:]
    input_index = input_details[0]['index']
    output_index = output_details[0]['index']

    # Only the INPUT tensor is resized. The previous code also "resized" the
    # output tensor to the input shape, which is meaningless: the output shape
    # is derived from the input when tensors are allocated.
    interpreter.resize_tensor_input(input_index, ((batch_size, ) + tuple(input_shape)))
    interpreter.allocate_tensors()

    print("== Input details ==")
    print(interpreter.get_input_details()[0])
    print("type:", input_details[0]['dtype'])
    print("\n== Output details ==")
    print(interpreter.get_output_details()[0])

    # predictions per batch
    for idx, batch_x in enumerate(data_gen):
        x = np.array(batch_x).astype(np.float32)
        interpreter.set_tensor(input_index, x)
        interpreter.invoke()
        output = interpreter.get_tensor(output_index)
        # Drop the axes between the batch axis and the embedding axis.
        flattened_output = np.reshape(output, (output.shape[0], output.shape[-1]))
        predictions[idx, :, :] = flattened_output

    return predictions

In [12]:
def gen_embedding(model_path, tflite_model_file, data_dir, quant_mode='default',
                  emb_len=512, batch_size=64, epoch_size=1024):
    """Compute embeddings for ``epoch_size`` batches with a TFLite model.

    The audio front-end parameters are parsed from the file name of
    ``model_path``, which must look like
    ``<a>_<b>_<c>_<samp_rate>_<n_mels>_<mel_hop_length>_..._<n_fft>.<ext>``.

    Parameters
    ----------
    model_path : str
        Path whose base name encodes the audio parameters (the file itself is
        not opened here).
    tflite_model_file : str
        TFLite model used for inference.
    data_dir : str
        Directory with the input data, forwarded to the batch generator.
    quant_mode : str
        Unused; kept so existing callers keep working.
    emb_len, batch_size, epoch_size : int
        Forwarded to ``get_embeddings_batch_from_tflite``.

    Returns
    -------
    The value returned by ``get_embeddings_batch_from_tflite``.
    """
    print('Getting embedding out of Quantized tflite model')

    # splitext removes exactly one extension. ``str.strip('.h5')`` strips any
    # run of the characters '.', 'h', '5' and would turn "..._1025.h5" into
    # "..._102".
    model_name = os.path.splitext(os.path.basename(model_path))[0]
    splits = model_name.split('_')
    samp_rate = int(splits[3])
    n_mels = int(splits[4])
    mel_hop_length = int(splits[5])
    n_fft = int(splits[-1])

    data_gen = single_epoch_test_data_generator(data_dir, epoch_size,
                                                batch_size=batch_size, samp_rate=samp_rate,
                                                n_fft=n_fft, n_mels=n_mels,
                                                mel_hop_length=mel_hop_length)

    return get_embeddings_batch_from_tflite(data_gen, tflite_model_file,
                                            epoch_size, batch_size, emb_len=emb_len)

In [13]:
def post_training_quantization(model_path, calibrate_data_dir, quant_mode='default',
                               calibration_steps=1024,
                               output_root='/scratch/sk7898/quantization/'):
    """Quantize the exported Keras model of ``model_path`` into a TFLite file.

    Works inside ``<output_root>/<model name>/`` (created if missing), where
    ``<model name>`` is the base name of ``model_path`` without its extension:

    * reads  ``for_quantization.h5``  - the Keras model to convert. It is NOT
      generated here (the ``keras_for_tflite`` export step is not invoked),
      so it must already exist.
    * writes ``quantized_model_copy_<quant_mode>.tflite``.

    Parameters
    ----------
    model_path : str
        Path of the original model; only its base name is used.
    calibrate_data_dir : str
        Directory with calibration data, forwarded to
        ``quantize_keras_to_tflite``.
    quant_mode : str
        Forwarded to ``quantize_keras_to_tflite``, which accepts ``'default'``
        (calibrated with a representative dataset) and ``'size'``
        (no calibration data).
    calibration_steps : int
        Number of calibration batches.
    output_root : str
        Directory under which the per-model working directory lives.
    """
    # splitext removes exactly one extension; ``str.strip('.h5')`` would also
    # eat trailing '5' / 'h' / '.' characters of the model name itself.
    model_name = os.path.splitext(os.path.basename(model_path))[0]
    dir_prefix = os.path.join(output_root, model_name)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(dir_prefix, exist_ok=True)

    # 1. Keras model prepared for quantization (max-pooling head, no flatten).
    keras_model_path = os.path.join(dir_prefix, 'for_quantization.h5')

    # 2. Convert the Keras model to TFLite, quantize it according to
    #    ``quant_mode`` and save the result.
    print('Quantizing keras model and saving as tflite')
    tflite_model_file = os.path.join(dir_prefix, 'quantized_model_copy_' + quant_mode + '.tflite')

    quantize_keras_to_tflite(tflite_model_file, keras_model_path, quant_mode=quant_mode,
                             calibrate_data_dir=calibrate_data_dir,
                             num_calibration_steps=calibration_steps)

In [14]:
# Quantization run: 'default' mode, calibrated on 10 batches drawn from the
# AudioSet music training samples.
quant_mode = 'default'
calibration_steps = 10
calibrate_data_dir = '/beegfs/work/AudioSetSamples/music_train'
model_path = '/scratch/sk7898/l3pruning/embedding/fixed/reduced_input/l3_audio_original_48000_256_242_2048.h5'

post_training_quantization(
    model_path,
    calibrate_data_dir,
    quant_mode=quant_mode,
    calibration_steps=calibration_steps,
)

Quantizing keras model and saving as tflite
Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 48 variables.
INFO:tensorflow:Converted 48 variables to const ops.
Calibrating.........
20185094_2_5.h5




20187019_3_42.h5
20180202_4_118.h5
20180145_3_35.h5
20183056_0_11.h5
20180258_4_165.h5
20187045_1_32.h5
20180158_0_107.h5
20180190_0_164.h5
20180177_3_77.h5


In [None]:
# Embedding extraction with the quantized model: 64 batches of 64 examples.
# NOTE(review): post_training_quantization in this notebook writes
# 'quantized_model_copy_<mode>.tflite', while the file read here has no
# 'copy_' in its name - confirm this is the intended model file.
data_dir = '/beegfs/work/AudioSetSamples/music_train'
model_path = '/scratch/sk7898/l3pruning/embedding/fixed/reduced_input/l3_audio_original_48000_256_242_2048.h5'
tflite_model_file = '/scratch/sk7898/quantization/l3_audio_original_48000_256_242_2048/quantized_model_default.tflite'

embeddings = gen_embedding(
    model_path,
    tflite_model_file,
    data_dir,
    quant_mode='default',
    batch_size=64,
    epoch_size=64,
)

Getting embedding out of Quantized tflite model
== Input details ==
{'name': 'input_13', 'index': 33, 'shape': array([ 64, 256, 199,   1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0)}
type: <class 'numpy.float32'>

== Output details ==
{'name': 'max_pooling2d_1/MaxPool', 'index': 34, 'shape': array([ 64,   1,   1, 512], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0)}
20183047_3_31.h5


