In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
%cd /scratch/sk7898/l3embedding
import os
import numpy as np
import random
import librosa
import h5py
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, Flatten, Activation, Lambda
import keras.regularizers as regularizers
from keras.optimizers import Adam
from l3embedding.audio import pcm2float
from resampy import resample
import pescador
from skimage import img_as_float

/scratch/sk7898/l3embedding


Using TensorFlow backend.


In [3]:
def shuffle_files(iterable):
    """Return an iterator over the items of `iterable` in shuffled order.

    The input is first materialised into a list, which is then shuffled in
    place with the module-level `random` generator (so `random.seed` controls
    the resulting order).
    """
    items = [*iterable]
    random.shuffle(items)
    return iter(items)

def amplitude_to_db(S, amin=1e-10, dynamic_range=80.0):
    """Convert an amplitude spectrogram to decibels relative to its own peak.

    Parameters
    ----------
    S : array-like
        Amplitude values (may be complex; the absolute value is taken).
    amin : float
        Floor applied to the squared magnitude before taking the logarithm.
    dynamic_range : float
        Values more than this many dB below the peak are clipped.

    Returns
    -------
    log_spec : np.ndarray
        `10 * log10(|S|**2)`, shifted so the maximum is 0 and clipped from
        below at `-dynamic_range`.
    """
    # The logarithm is taken on the squared magnitude (power), not on |S|.
    power = np.square(np.abs(S))
    log_power = 10.0 * np.log10(np.maximum(amin, power))
    # Reference everything to the loudest bin, then limit the range.
    return np.maximum(log_power - log_power.max(), -dynamic_range)

def get_melspectrogram(frame, n_fft=2048, mel_hop_length=242, samp_rate=48000, n_mels=256, fmax=None):
    """Compute a log-scaled mel spectrogram for one audio frame.

    Parameters
    ----------
    frame : np.ndarray
        Audio samples passed straight to `librosa.core.stft`.
    n_fft : int
        FFT size.
    mel_hop_length : int
        STFT hop length in samples.
    samp_rate : int
        Sampling rate handed to the mel filterbank.
    n_mels : int
        Number of mel bands.
    fmax : float or None
        Upper frequency bound forwarded to librosa.

    Returns
    -------
    np.ndarray
        Output of `amplitude_to_db` applied to the mel spectrogram.
    """
    # Magnitude STFT with a Hann window, centred frames and zero padding.
    stft_magnitude = np.abs(
        librosa.core.stft(frame, n_fft=n_fft, hop_length=mel_hop_length,
                          window='hann', center=True, pad_mode='constant')
    )
    # Project onto the mel filterbank (HTK mel scale).
    mel_magnitude = librosa.feature.melspectrogram(
        sr=samp_rate, S=stft_magnitude, n_fft=n_fft, n_mels=n_mels,
        fmax=fmax, power=1.0, htk=True
    )
    return amplitude_to_db(np.array(mel_magnitude))

In [4]:
def quant_data_generator(data_dir, batch_size=512, samp_rate=48000, n_fft=2048, \
                         n_mels=256, mel_hop_length=242, hop_size=0.1, fmax=None,\
                         random_state=None, start_batch_idx=None):
    """Build one batch of log-mel spectrograms from the HDF5 files in `data_dir`.

    Despite its name this is a plain function, not a generator: it visits the
    files of `data_dir` in shuffled order, accumulates `batch_size` examples
    from their 'audio' datasets (a batch may span several files) and *returns*
    the first complete batch whose index is at least `start_batch_idx`.

    Parameters
    ----------
    data_dir : str
        Directory whose entries are opened with `h5py.File`; each must hold an
        'audio' dataset.
    batch_size : int
        Number of examples per batch.
    samp_rate : int
        Target sampling rate. Stored audio is treated as 48 kHz and resampled
        when `samp_rate` differs.
    n_fft, n_mels, mel_hop_length, fmax
        Forwarded to `get_melspectrogram`.
    hop_size : float
        Unused; kept for interface compatibility.
    random_state : int or None
        Seed for the module-level `random` generator that drives the file
        shuffle. `None` leaves the generator state untouched.
    start_batch_idx : int or None
        When given, batches with a smaller index are skipped without reading
        their audio.

    Returns
    -------
    np.ndarray or None
        Array of shape (batch_size, <spectrogram dims>, 1), or None when the
        directory does not contain enough examples to fill a batch.
    """
    # Compare against None so that a seed of 0 is honoured as well.
    if random_state is not None:
        random.seed(random_state)

    batch = None
    curr_batch_size = 0
    batch_idx = 0

    for fname in shuffle_files(os.listdir(data_dir)):
        print(fname)
        data_batch_path = os.path.join(data_dir, fname)
        blob_start_idx = 0

        # The context manager guarantees the file is closed even though the
        # function usually returns from the middle of a file.
        with h5py.File(data_batch_path, 'r') as data_blob:
            audio = data_blob['audio']
            blob_size = len(audio)

            while blob_start_idx < blob_size:
                # Take only as many examples as are still missing in the batch.
                blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size)

                # If we are starting from a particular batch, skip reading all
                # of the prior batches
                if start_batch_idx is None or batch_idx >= start_batch_idx:
                    chunk = audio[blob_start_idx:blob_end_idx]
                    batch = chunk if batch is None else np.concatenate([batch, chunk])

                curr_batch_size += blob_end_idx - blob_start_idx
                blob_start_idx = blob_end_idx

                if curr_batch_size == batch_size:
                    # If we are starting from a particular batch, skip
                    # returning all of the prior batches
                    if start_batch_idx is None or batch_idx >= start_batch_idx:
                        # Convert audio to float (and resample if required)
                        batch = pcm2float(batch, dtype='float32')
                        if samp_rate != 48000:
                            batch = resample(batch, sr_orig=48000, sr_new=samp_rate)

                        X = [get_melspectrogram(batch[i].flatten(), n_fft=n_fft,
                                                mel_hop_length=mel_hop_length,
                                                samp_rate=samp_rate, n_mels=n_mels, fmax=fmax)
                             for i in range(batch_size)]

                        # Append a trailing channel axis for the CNN input.
                        return np.array(X)[:, :, :, np.newaxis]

                    batch_idx += 1
                    curr_batch_size = 0
                    batch = None

    # Not enough data to assemble a (further) full batch.
    return None

def single_epoch_test_data_generator(data_dir, epoch_size, **kwargs):
    """Yield `epoch_size` batches, each obtained from a fresh
    `quant_data_generator(data_dir, **kwargs)` call.
    """
    yield from (quant_data_generator(data_dir, **kwargs) for _ in range(epoch_size))

In [5]:
def get_weight_stats(model):
    """Print the smallest and largest weight value of every layer that has weights.

    Parameters
    ----------
    model : object
        Anything exposing `layers`, where each layer has a `get_weights()`
        method returning a list of arrays.
    """
    for layer in model.layers:
        # A layer's weight arrays (e.g. kernel and bias) generally have
        # different shapes, so they cannot be stacked into one ndarray;
        # reduce each array separately and combine the results.
        arrays = [np.asarray(w) for w in layer.get_weights()]
        arrays = [a for a in arrays if a.size > 0]
        if not arrays:
            continue
        lowest = min(a.min() for a in arrays)
        highest = max(a.max() for a in arrays)
        print('Min: {} Max: {}'.format(lowest, highest))

In [6]:
# Used only if there is no Batch Normalization after the Input layer
def construct_cnn_L3_melspec2_spec_model(n_mels=256, n_hop = 242, n_dft = 2048,
                                         fmax=None, asr = 48000, halved_convs=False, audio_window_dur = 1):
    """
    Constructs a model that replicates the audio subnetwork used in Look,
    Listen and Learn, taking a precomputed mel spectrogram as input.

    Relja Arandjelovic and Andrew Zisserman (2017). Look, Listen and Learn.
    CoRR, abs/1705.08168.

    The network is four blocks of 3x3 'same' convolutions. Blocks 1-3 are
    (Conv -> BatchNorm -> ReLU) x 2 followed by 2x2 max pooling; block 4 is
    Conv -> BatchNorm -> ReLU followed by a bare convolution named
    'audio_embedding_layer'. `n_dft` and `fmax` are accepted but not used.

    Parameters
    ----------
    n_mels : int
        Height of the input spectrogram.
    n_hop : int
        Hop length used to derive the number of input frames.
    asr : int
        Audio sampling rate used to derive the number of input frames.
    halved_convs : bool
        If True, every block uses half the number of filters.
    audio_window_dur : int
        Duration multiplied with `asr` to derive the number of input frames.

    Returns
    -------
    model:  L3 CNN model
            (Type: keras.models.Model)
    inputs: Model inputs
            (Type: list[keras.layers.Input])
    outputs: Model outputs
            (Type: keras.layers.Layer)
    """
    weight_decay = 1e-5
    kernel_size = (3, 3)
    pool_size = (2, 2)

    def conv(tensor, n_filters, **extra):
        # 3x3 'same' convolution with He init and its own L2 regularizer.
        return Conv2D(n_filters, kernel_size, padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **extra)(tensor)

    def conv_bn_relu(tensor, n_filters):
        tensor = conv(tensor, n_filters)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # Filters per block, optionally halved.
    block_filters = [64, 128, 256, 512]
    if halved_convs:
        block_filters = [n // 2 for n in block_filters]

    n_frames = 1 + int((asr * audio_window_dur) / float(n_hop))
    x_a = Input(shape=(n_mels, n_frames, 1), dtype='float32')

    # CONV BLOCKS 1-3: two conv/BN/ReLU stages, then 2x2 max pooling
    y_a = x_a
    for n_filters in block_filters[:-1]:
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = MaxPooling2D(pool_size=pool_size, strides=2)(y_a)

    # CONV BLOCK 4: the final convolution is the embedding layer and has no
    # BatchNorm / activation / pooling after it
    y_a = conv_bn_relu(y_a, block_filters[-1])
    y_a = conv(y_a, block_filters[-1], name='audio_embedding_layer')

    m = Model(inputs=x_a, outputs=y_a)
    m.name = 'audio_model'

    return m, x_a, y_a

In [7]:
def keras_for_tflite(model_path, n_mels=256, n_hop=242, n_dft=2048,
                     asr=48000, fmax=None, halved_convs=False):
    """Load a saved model and copy its weights into a freshly built
    `construct_cnn_L3_melspec2_spec_model` network.

    The Keras session is cleared and the learning phase set to 0 first.
    Weights are copied by position: source layer 0 goes to target layer 0,
    source layer 1 is skipped, and every source layer `idx >= 2` goes to
    target layer `idx - 1`.

    NOTE(review): the skipped layer is presumably the BatchNormalization that
    follows the input in the saved model -- confirm. Source layers beyond the
    end of the new model are not guarded against and would index out of range.

    Returns
    -------
    n_model : keras.models.Model
        The newly constructed model carrying the copied weights.
    """
    keras.backend.clear_session()
    keras.backend.set_learning_phase(0)

    l3model = tf.keras.models.load_model(model_path)

    n_model, _, _ = construct_cnn_L3_melspec2_spec_model(
        n_mels=n_mels, n_hop=n_hop, n_dft=n_dft, asr=asr, fmax=fmax,
        halved_convs=halved_convs, audio_window_dur=1)

    for idx, layer in enumerate(l3model.layers):
        if idx == 1:
            # No counterpart in the new model.
            continue
        # Everything after the skipped layer shifts down by one position.
        target_idx = idx if idx == 0 else idx - 1
        n_model.layers[target_idx].set_weights(l3model.get_layer(layer.name).get_weights())

    return n_model

In [8]:
def quantize_keras_to_tflite(tflite_model_file, keras_model, quant_mode='default',\
                             n_mels=256, n_hop=242, n_dft=2048, asr=48000, halved_convs=False,\
                             quant_type='int8', calibrate_data_dir=None, calibration_steps=1024):
    """Convert `keras_model` to a quantized TFLite flatbuffer and write it to disk.

    Parameters
    ----------
    tflite_model_file : str
        Destination path of the converted model.
    keras_model
        Model handed to `tf.lite.TFLiteConverter.from_keras_model`.
    quant_mode : {'default', 'latency', 'size'}
        'default' and 'latency' attach a representative dataset for
        calibration and therefore need `calibrate_data_dir`; 'size' does not.
    n_mels, n_hop, n_dft, asr
        Spectrogram parameters used when generating calibration samples.
    halved_convs : bool
        Unused here.
    quant_type : str
        'int8' restricts the calibrated modes to the int8 builtin op set; any
        other value requests float16 there. In 'size' mode only 'float16' has
        an effect.
    calibrate_data_dir : str or None
        Directory passed to `quant_data_generator` for calibration.
    calibration_steps : int
        Number of single-example calibration batches.

    Raises
    ------
    ValueError
        If `quant_mode` is not recognised, or a calibrated mode is requested
        without `calibrate_data_dir`.
    """

    def representative_dataset_gen():
        # One freshly drawn single-example batch per calibration step.
        print('Calibrating.........')
        for _ in range(calibration_steps):
            sample = quant_data_generator(calibrate_data_dir, batch_size=1,
                                          samp_rate=asr, n_fft=n_dft, n_mels=n_mels,
                                          mel_hop_length=n_hop)
            yield [np.array(sample).astype(np.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
    converter.experimental_enable_mlir_converter = False

    calibrated = quant_mode in ('default', 'latency')
    if not calibrated and quant_mode != 'size':
        raise ValueError('Unrecognized Quantization mode!')

    if calibrated:
        if calibrate_data_dir is None:
            raise ValueError('Quantized activation calibration needs data directory!')

        converter.optimizations = [
            tf.lite.Optimize.DEFAULT if quant_mode == 'default'
            else tf.lite.Optimize.OPTIMIZE_FOR_LATENCY
        ]

        if quant_type == 'int8':
            converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        else:
            converter.target_spec.supported_types = [tf.float16]

        converter.representative_dataset = representative_dataset_gen
    else:
        # 'size' mode
        if quant_type == 'float16':
            converter.target_spec.supported_types = [tf.float16]
        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]

    tflite_model = converter.convert()
    with open(tflite_model_file, "wb") as f:
        f.write(tflite_model)

In [9]:
def post_training_quantization(model_path, calibrate_data_dir, quant_mode='default',
                               n_mels=256, n_hop=242, n_dft=2048, asr=48000, halved_convs=False,
                               quant_type='int8', calibration_steps=1024,
                               output_root='/scratch/sk7898/quantization/'):
    """Load a saved Keras model, quantize it to TFLite and return the output path.

    The converted model is written to
    `<output_root>/<model file name without '.h5'>/test_quantized_<quant_mode>_<quant_type>.tflite`;
    the directory is created if necessary.

    Parameters
    ----------
    model_path : str
        Path of the saved model, loaded with `tf.keras.models.load_model`.
    calibrate_data_dir : str
        Directory with calibration data (see `quantize_keras_to_tflite`).
    quant_mode, quant_type, n_mels, n_hop, n_dft, asr, halved_convs, calibration_steps
        Forwarded to `quantize_keras_to_tflite`.
    output_root : str
        Directory under which the per-model output directory is created.

    Returns
    -------
    tflite_model_file : str
        Path of the written TFLite model.
    """
    # Remove the '.h5' *suffix* only. str.strip('.h5') would remove any run of
    # the characters '.', 'h' and '5' from both ends of the name
    # (e.g. 'model_v5.h5' -> 'model_v').
    model_name = os.path.basename(model_path)
    if model_name.endswith('.h5'):
        model_name = model_name[:-len('.h5')]
    dir_prefix = os.path.join(output_root, model_name)

    os.makedirs(dir_prefix, exist_ok=True)

    # The saved model is quantized as-is.
    keras_model = tf.keras.models.load_model(model_path)

    print('Quantizing keras model and saving as tflite')
    tflite_model_file = os.path.join(dir_prefix,
                                     'test_quantized_' + quant_mode + '_' + quant_type + '.tflite')

    quantize_keras_to_tflite(tflite_model_file, keras_model, quant_mode=quant_mode,
                             quant_type=quant_type, asr=asr,
                             n_mels=n_mels, n_hop=n_hop, n_dft=n_dft, halved_convs=halved_convs,
                             calibrate_data_dir=calibrate_data_dir, calibration_steps=calibration_steps)

    return tflite_model_file

In [10]:
# Other models this cell has been pointed at:
#model_path = '/scratch/sk7898/l3pruning/embedding/fixed/reduced_input/l3_audio_original_48000_256_242_2048.h5'
#model_path = '/scratch/dr2915/l3pruning/embedding/fixed/reduced_input/l3_audio_20191108201753_8000_64_160_1024_half.h5'
model_path = '/scratch/sk7898/models/reduced_input/embedding/environmental/audio_models/l3_audio_20200304152812_8000_64_160_1024_half.h5'
calibrate_data_dir = '/beegfs/work/AudioSetSamples_environmental/environmental_train'
calibration_steps = 5

# Quantization settings; quant_mode options: {'size', 'default', 'latency'}
quant_mode = 'size'
quant_type = 'int8'

# Spectrogram front-end parameters (these mirror the numbers in the model file name)
asr = 8000
n_mels = 64
n_hop = 160
n_dft = 1024
# Models with 'half' in their path use halved convolution filter counts
halved_convs = 'half' in model_path

quant_output_path = post_training_quantization(
    model_path,
    calibrate_data_dir,
    quant_mode=quant_mode,
    n_mels=n_mels,
    n_hop=n_hop,
    n_dft=n_dft,
    asr=asr,
    halved_convs=halved_convs,
    quant_type=quant_type,
    calibration_steps=calibration_steps,
)

Quantizing keras model and saving as tflite


**Input/Output of tflite model (Interpreter)**

In [11]:
# Load the quantized model written by the previous cell and inspect its I/O tensors.
interpreter = tf.lite.Interpreter(model_path=str(quant_output_path))
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Per-example shapes: drop the leading (batch) dimension.
input_shape = input_details[0]['shape'][1:]
output_shape = output_details[0]['shape'][1:]
# Tensor indices used to address the input/output tensors of the interpreter.
input_index = input_details[0]['index']
output_index = output_details[0]['index']

interpreter.allocate_tensors()

# Details are queried again after allocation for the printout.
print("== Input details ==")
print(interpreter.get_input_details()[0])
print("type:", input_details[0]['dtype'])
print("\n== Output details ==")
print(interpreter.get_output_details()[0])

== Input details ==
{'name': 'input_3', 'index': 35, 'shape': array([ 1, 64, 51,  1], dtype=int32), 'shape_signature': array([ 1, 64, 51,  1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
type: <class 'numpy.float32'>

== Output details ==
{'name': 'Identity', 'index': 36, 'shape': array([  1, 256], dtype=int32), 'shape_signature': array([  1, 256], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


**Generate Embedding from the tflite model**

In [None]:
def gen_embedding(model_path, tflite_model_file, data_dir, quant_mode='default',\
                  emb_len=512, batch_size=64, epoch_size=1024):
    """Run a TFLite model over generated spectrogram batches and return the embeddings.

    The spectrogram parameters are parsed from the model file name, which is
    expected to look like
    `l3_audio_<id>_<samp_rate>_<n_mels>_<hop>_<n_fft>[_half].h5`.

    Parameters
    ----------
    model_path : str
        Path of the original model; only its file name is used.
    tflite_model_file : str
        Path of the TFLite model to run.
    data_dir : str
        Directory passed to `single_epoch_test_data_generator`.
    quant_mode : str
        Unused; kept for interface compatibility.
    emb_len : int
        Length of one embedding vector.
    batch_size : int
        Examples per batch.
    epoch_size : int
        Number of batches.

    Returns
    -------
    np.ndarray
        Result of `get_embeddings_batch_from_tflite`.

    Raises
    ------
    ValueError
        If the parameters cannot be parsed from the file name.
    """
    print('Getting embedding out of Quantized tflite model')

    # Remove the '.h5' suffix only; str.strip('.h5') strips a character set
    # from both ends and can eat trailing digits such as a final '5'.
    model_name = os.path.basename(model_path)
    if model_name.endswith('.h5'):
        model_name = model_name[:-len('.h5')]
    splits = model_name.split('_')

    try:
        samp_rate = int(splits[3])
        n_mels = int(splits[4])
        mel_hop_length = int(splits[5])
        # Fixed position rather than splits[-1]: names ending in '_half'
        # have the FFT size as the second-to-last field.
        n_fft = int(splits[6])
    except (IndexError, ValueError) as err:
        raise ValueError(
            'Cannot parse spectrogram parameters from model name: {}'.format(model_name)
        ) from err

    data_gen = single_epoch_test_data_generator(data_dir, epoch_size,
                                                batch_size=batch_size, samp_rate=samp_rate,
                                                n_fft=n_fft, n_mels=n_mels,
                                                mel_hop_length=mel_hop_length)

    return get_embeddings_batch_from_tflite(data_gen, tflite_model_file,
                                            epoch_size, batch_size, emb_len=emb_len)

In [None]:
def get_embeddings_batch_from_tflite(data_gen, tflite_model_file, epoch_size, batch_size, emb_len=512):
    """Run every batch of `data_gen` through a TFLite model and collect the outputs.

    Parameters
    ----------
    data_gen : iterable
        Yields input batches; each is converted to float32 before being fed
        to the interpreter. It must not yield more than `epoch_size` batches.
    tflite_model_file : str
        Path of the TFLite model.
    epoch_size : int
        Number of batches to store.
    batch_size : int
        Examples per batch; the interpreter input is resized accordingly.
    emb_len : int
        Length of one embedding vector.

    Returns
    -------
    predictions : np.ndarray
        Array of shape (epoch_size, batch_size, emb_len).
    """
    predictions = np.zeros(shape=(epoch_size, batch_size, emb_len))
    interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # Per-example input shape: drop the leading (batch) dimension.
    input_shape = input_details[0]['shape'][1:]
    input_index = input_details[0]['index']
    output_index = output_details[0]['index']

    # Only the *input* tensor is resized. The output tensor must not be
    # resized by hand (and certainly not to the input shape); its shape is
    # derived from the resized input when tensors are allocated.
    interpreter.resize_tensor_input(input_index, ((batch_size, ) + tuple(input_shape)))
    interpreter.allocate_tensors()

    print("== Input details ==")
    print(interpreter.get_input_details()[0])
    print("type:", input_details[0]['dtype'])
    print("\n== Output details ==")
    print(interpreter.get_output_details()[0])

    # predictions per batch
    for idx, batch_x in enumerate(data_gen):
        x = np.array(batch_x).astype(np.float32)
        interpreter.set_tensor(input_index, x)
        interpreter.invoke()
        output = interpreter.get_tensor(output_index)
        # Collapse any intermediate axes to (batch, embedding).
        flattened_output = np.reshape(output, (output.shape[0], output.shape[-1]))
        predictions[idx, :, :] = flattened_output

    return predictions

In [None]:
# Full-size 48 kHz model and a previously quantized TFLite version of it
model_path = '/scratch/sk7898/l3pruning/embedding/fixed/reduced_input/l3_audio_original_48000_256_242_2048.h5'
tflite_model_file = '/scratch/sk7898/quantization/l3_audio_original_48000_256_242_2048/quantized_model_default.tflite'
data_dir = '/beegfs/work/AudioSetSamples/music_train'

# 64 batches of 64 examples each
embeddings = gen_embedding(
    model_path,
    tflite_model_file,
    data_dir,
    quant_mode='default',
    batch_size=64,
    epoch_size=64,
)