In [30]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
%cd /scratch/sk7898/l3embedding
import os
import numpy as np
import random
import librosa
import h5py
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, Flatten, Activation, Lambda
import keras.regularizers as regularizers
from keras.optimizers import Adam
from l3embedding.audio import pcm2float
from l3embedding.model import MODELS, load_model
from resampy import resample
import pescador
from skimage import img_as_float
from keras import backend as K

/scratch/sk7898/l3embedding


In [38]:
def data_generator(data_dir, batch_size=512, random_state=20180123, samp_rate=48000,
                   start_batch_idx=None, keys=None):
    """Yield fixed-size batches assembled from the HDF5 files in ``data_dir``.

    Files are visited in the order produced by
    ``shuffle_files(os.listdir(data_dir))`` after seeding the global ``random``
    module with ``random_state``. Samples are read sequentially from each file
    and accumulated across file boundaries until exactly ``batch_size`` samples
    have been collected. A trailing partial batch left over after the last
    file is never yielded.

    Args:
        data_dir: Directory whose entries are opened with ``h5py.File``.
            Every file must contain a ``'label'`` dataset, whose length is
            used as the number of samples in that file.
        batch_size: Number of samples per yielded batch.
        random_state: Seed passed to ``random.seed``.
        samp_rate: Target audio sample rate. When it differs from 48000 the
            audio is resampled with ``resampy.resample`` using
            ``sr_orig=48000``.
        start_batch_idx: If not None, batches whose index is lower than this
            value are counted but neither read from disk nor yielded.
        keys: Dataset names to copy from each file. Defaults to
            ``['audio', 'video', 'label']``. The post-processing below indexes
            ``batch['video']`` and ``batch['audio']``, so both must be present.

    Yields:
        dict: maps each key to the concatenated samples of one batch.
        ``'video'`` is ``2 * img_as_float(video).astype('float32') - 1`` and
        ``'audio'`` is the output of ``pcm2float(..., dtype='float32')``
        (optionally resampled).
    """
    # NOTE(review): `shuffle_files` is not defined or imported in the visible
    # part of this notebook; it is presumably the l3embedding helper that
    # returns the file names in shuffled order -- confirm it is in scope.
    random.seed(random_state)

    batch = None
    curr_batch_size = 0
    batch_idx = 0

    # Limit keys to avoid producing batches with all of the metadata fields
    if not keys:
        keys = ['audio', 'video', 'label']

    for fname in shuffle_files(os.listdir(data_dir)):
        print(fname)
        batch_path = os.path.join(data_dir, fname)
        blob_start_idx = 0

        # The context manager guarantees the file is closed even when the
        # consumer stops iterating in the middle of a file (generator close /
        # garbage collection), when the file is empty, or when an error occurs.
        with h5py.File(batch_path, 'r') as blob:
            blob_size = len(blob['label'])

            while blob_start_idx < blob_size:
                # Take only as many samples as are still missing in the batch
                blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size)

                # If we are starting from a particular batch, skip computing all of
                # the prior batches
                if start_batch_idx is None or batch_idx >= start_batch_idx:
                    if batch is None:
                        batch = {k: blob[k][blob_start_idx:blob_end_idx]
                                 for k in keys}
                    else:
                        for k in keys:
                            batch[k] = np.concatenate([batch[k],
                                                       blob[k][blob_start_idx:blob_end_idx]])

                curr_batch_size += blob_end_idx - blob_start_idx
                blob_start_idx = blob_end_idx

                if curr_batch_size == batch_size:
                    # If we are starting from a particular batch, skip yielding all
                    # of the prior batches
                    if start_batch_idx is None or batch_idx >= start_batch_idx:
                        # Preprocess video so samples are in [-1,1]
                        batch['video'] = 2 * img_as_float(batch['video']).astype('float32') - 1

                        # Convert audio to float, resampling only when a
                        # sample rate other than 48000 was requested
                        if samp_rate == 48000:
                            batch['audio'] = pcm2float(batch['audio'], dtype='float32')
                        else:
                            batch['audio'] = resample(pcm2float(batch['audio'], dtype='float32'),
                                                      sr_orig=48000, sr_new=samp_rate)
                        yield batch

                    batch_idx += 1
                    curr_batch_size = 0
                    batch = None


def single_epoch_data_generator(data_dir, epoch_size, **kwargs):
    """Endlessly replay the first ``epoch_size`` batches of ``data_generator``.

    A fresh ``data_generator(data_dir, **kwargs)`` is created for every pass.
    Each pass ends either after ``epoch_size`` batches have been yielded or
    when the underlying generator runs out, whichever happens first; then the
    next pass starts over. This generator itself never terminates.

    Args:
        data_dir: Forwarded to ``data_generator``.
        epoch_size: Number of batches that make up one pass.
        **kwargs: Extra keyword arguments forwarded to ``data_generator``.

    Yields:
        Whatever ``data_generator`` yields, unchanged.
    """
    while True:
        emitted = 0
        for item in data_generator(data_dir, **kwargs):
            yield item
            emitted += 1
            # Once all batches of this epoch were produced, restart from scratch
            if emitted == epoch_size:
                break

In [39]:
def quantize_keras_to_tflite(tflite_model_file, keras_model, keras_model_path, quant_mode='default',
                             quantized_input=True, n_mels=256, n_hop=242, n_dft=2048, asr=48000,
                             halved_convs=False, calibrate_data_dir=None, num_calibration_steps=1024, tf_version=2):
    """Convert a Keras model to a post-training-quantized TFLite flatbuffer.

    Args:
        tflite_model_file: Path the converted model bytes are written to.
        keras_model: In-memory Keras model; used when ``tf_version == 2``
            (``TFLiteConverter.from_keras_model``).
        keras_model_path: Path to a saved Keras model; used for any other
            ``tf_version`` (``TFLiteConverter.from_keras_model_file``).
        quant_mode: ``'default'`` uses ``tf.lite.Optimize.DEFAULT`` together
            with a representative dataset for calibration; ``'size'`` uses
            ``tf.lite.Optimize.OPTIMIZE_FOR_SIZE``.
        quantized_input: In ``'default'`` mode, when True the converter's
            ``inference_output_type`` is set to ``tf.int8``. The matching
            ``inference_input_type`` assignment is commented out, so despite
            the name only the output type is changed.
        n_mels, n_hop, n_dft, asr, halved_convs: Accepted for call
            compatibility; not used by this function.
        calibrate_data_dir: Directory handed to ``quant_data_generator`` to
            produce calibration samples. Required in ``'default'`` mode.
        num_calibration_steps: Maximum number of calibration samples fed to
            the converter.
        tf_version: Selects which converter factory is used (see above).

    Raises:
        ValueError: If ``quant_mode`` is not recognized, or if it is
            ``'default'`` and ``calibrate_data_dir`` is None.
    """

    def representative_dataset_gen():
        """Yield up to ``num_calibration_steps`` ``[video, audio]`` input lists."""
        print('Calibrating.........')
        # NOTE(review): `quant_data_generator` is not defined in the visible
        # part of this notebook (only `data_generator` is) -- confirm it is
        # in scope and yields dicts with 'video', 'audio' and 'label'.
        #
        # Build the sample stream once. Re-creating it on every step would
        # restart the underlying generator each time, and yielding the
        # stream object itself (instead of its items) hands the converter a
        # generator rather than the list of input arrays it expects.
        sample_stream = pescador.maps.keras_tuples(
            quant_data_generator(calibrate_data_dir, batch_size=1),
            ['video', 'audio'],
            'label')
        # zip(range(...), ...) stops without pulling one extra sample
        for _, (model_inputs, _labels) in zip(range(num_calibration_steps), sample_stream):
            yield model_inputs

    if tf_version == 2:
        converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
    else:
        converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_model_path)

    if quant_mode == 'default':
        if calibrate_data_dir is None:
            raise ValueError('Quantized activation calibration needs data directory!')

        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        if quantized_input:
            #converter.inference_input_type = tf.int8
            converter.inference_output_type = tf.int8
        #converter.default_ranges_stats = (0, 1)
        converter.representative_dataset = representative_dataset_gen

    elif quant_mode == 'size':
        converter.post_training_quantize = True
        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
    else:
        raise ValueError('Unrecognized Quantization mode!')

    tflite_model = converter.convert()
    with open(tflite_model_file, "wb") as f:
        f.write(tflite_model)
    print('Tflite model saved in:', tflite_model_file)

In [40]:
def post_training_quantization(model_path, calibrate_data_dir, quant_mode='default', quantized_input=False,
                               n_mels=256, n_hop=242, n_dft=2048, asr=48000, halved_convs=False,
                               flatten=False, calibration_steps=1024):
    """Load an L3 Keras model and write a quantized TFLite version of it.

    The output file is
    ``/scratch/sk7898/quantization/<model name>/full_quantized_<quant_mode>[_int8Ip].tflite``
    where ``<model name>`` is the base name of ``model_path`` without a
    trailing ``.h5`` and ``_int8Ip`` is appended when ``quantized_input`` is
    True. The directory is created if needed.

    Args:
        model_path: Path of the saved model, passed to ``load_model`` and on
            to ``quantize_keras_to_tflite`` as ``keras_model_path``.
        calibrate_data_dir: Directory with calibration data, forwarded to
            ``quantize_keras_to_tflite``.
        quant_mode: Quantization mode forwarded to ``quantize_keras_to_tflite``.
        quantized_input: Forwarded to ``quantize_keras_to_tflite``; also
            selects the ``_int8Ip`` file-name suffix.
        n_mels, n_hop, n_dft, asr, halved_convs: Forwarded to ``load_model``
            and ``quantize_keras_to_tflite``.
        flatten: Accepted for call compatibility; not used by this function.
        calibration_steps: Forwarded as ``num_calibration_steps``.
    """
    # Remove only a literal '.h5' suffix. str.strip('.h5') would strip any of
    # the characters '.', 'h', '5' from BOTH ends of the name.
    model_name = os.path.basename(model_path)
    if model_name.endswith('.h5'):
        model_name = model_name[:-len('.h5')]
    dir_prefix = '/scratch/sk7898/quantization/' + model_name

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(dir_prefix, exist_ok=True)

    keras_model, _inputs, _outputs = load_model(model_path, 'cnn_L3_melspec2', return_io=True, src_num_gpus=1,
                                                n_mels=n_mels, n_hop=n_hop, n_dft=n_dft, asr=asr,
                                                halved_convs=halved_convs)
    #print(keras_model.summary())

    print('Quantizing keras model and saving as tflite')
    input_type = '_int8Ip' if quantized_input else ''
    tflite_model_file = os.path.join(dir_prefix, 'full_quantized_' + quant_mode + input_type + '.tflite')

    # Pass BOTH the loaded model and its path: the callee's signature is
    # (tflite_model_file, keras_model, keras_model_path, ...). Passing only
    # the path bound it to `keras_model` and left `keras_model_path` missing.
    quantize_keras_to_tflite(tflite_model_file, keras_model, model_path, quant_mode=quant_mode,
                             quantized_input=quantized_input, asr=asr,
                             n_mels=n_mels, n_hop=n_hop, n_dft=n_dft, halved_convs=halved_convs,
                             calibrate_data_dir=calibrate_data_dir, num_calibration_steps=calibration_steps)

In [41]:
# --- Inputs -----------------------------------------------------------------
# Saved Keras checkpoint to quantize and the directory with calibration data.
calibrate_data_dir = '/beegfs/work/AudioSetSamples_environmental/environmental_train'
model_path = '/scratch/sk7898/l3embedding/models/cnn_l3_melspec2_recent/model_best_valid_accuracy.h5'

# --- Quantization options ---------------------------------------------------
quant_mode = 'default'
quantized_input = True
flatten = True
calibration_steps = 32

# --- Audio front-end / model options ----------------------------------------
asr = 48000
n_dft = 2048
n_hop = 242
n_mels = 256
halved_convs = False

# The conversion call is kept disabled in this cell:
# post_training_quantization(model_path, calibrate_data_dir, quant_mode=quant_mode, quantized_input=quantized_input,\
#                            n_mels=n_mels, n_hop=n_hop, n_dft=n_dft, asr=asr, halved_convs=halved_convs,\
#                            flatten=flatten, calibration_steps=calibration_steps)

tracking <tf.Variable 'melspectrogram_3/real_kernels:0' shape=(2048, 1, 1, 1025) dtype=float32> dft_real_kernels
tracking <tf.Variable 'melspectrogram_3/imag_kernels:0' shape=(2048, 1, 1, 1025) dtype=float32> dft_imag_kernels
tracking <tf.Variable 'melspectrogram_3/Variable:0' shape=(1025, 256) dtype=float32> freq2mel


ValueError: axes don't match array

In [34]:
# Open the saved checkpoint read-only. The handle is intentionally left open
# because later cells keep reading from `blob`.
blob = h5py.File(model_path, 'r')
#print(blob.keys()) ['audio_model', 'concatenate_96', 'dense_127', 'dense_128', 'input_127', 'input_128', 'vision_model']

# Build a fresh (untrained) model with the same architecture for comparison.
m, inputs, outputs = MODELS['cnn_L3_melspec2'](n_mels=256, n_hop=242, n_dft=2048, asr=48000, num_gpus=1)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
m.summary()

tracking <tf.Variable 'melspectrogram_2/real_kernels:0' shape=(2048, 1, 1, 1025) dtype=float32> dft_real_kernels
tracking <tf.Variable 'melspectrogram_2/imag_kernels:0' shape=(2048, 1, 1, 1025) dtype=float32> dft_imag_kernels
tracking <tf.Variable 'melspectrogram_2/Variable:0' shape=(1025, 256) dtype=float32> freq2mel




Model: "cnn_L3_kapredbinputbn"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1, 48000)     0                                            
__________________________________________________________________________________________________
vision_model (Model)            (None, 512)          4693068     input_3[0][0]                    
__________________________________________________________________________________________________
audio_model (Model)             (None, 512)          9152708     input_4[0][0]                    
______________________________________________________________________________

In [44]:
# For every layer group stored under 'vision_model' in the checkpoint, load
# its datasets as NumPy arrays and report how many there are.
# The recorded cell output (a count followed by '-----' per layer) shows the
# cell was run with these prints; without them the loop computes the arrays
# and throws them away.
for layer in blob['vision_model']:
    weight_values = [np.asarray(blob['vision_model'][layer][grp]) for grp in blob['vision_model'][layer]]
    print(len(weight_values))
    print('-----')

4
-----
4
-----
4
-----
4
-----
4
-----
4
-----
4
-----
4
-----
4
-----
2
-----
2
-----
2
-----
2
-----
2
-----
2
-----
2
-----
2
-----


In [33]:
# Print the name of every layer of the freshly built vision sub-model that
# owns at least one weight, in alphabetical order of layer name.
for layer in sorted(m.get_layer('vision_model').layers, key=lambda x: x.name):
    symbolic_weights = layer.weights
    weight_values = K.batch_get_value(symbolic_weights)
    if len(weight_values) > 0:
        # The bare `print` that used to follow was a Python 2 leftover: in
        # Python 3 it is a no-op expression, so it has been removed.
        print(layer.name)

batch_normalization_1
batch_normalization_2
batch_normalization_3
batch_normalization_4
batch_normalization_5
batch_normalization_6
batch_normalization_7
batch_normalization_8
batch_normalization_9
conv2d_1
conv2d_2
conv2d_3
conv2d_4
conv2d_5
conv2d_6
conv2d_7
vision_embedding_layer


In [21]:
def load_attributes_from_hdf5_group(group, name):
    """Load the attribute list ``name`` from an HDF5 group.

    HDF5 cannot store attribute data larger than HDF5_OBJECT_HEADER_LIMIT
    bytes, so a long list may have been split into numbered chunks
    (``name0``, ``name1``, ...). If ``name`` itself is present it is read
    directly; otherwise the chunks are read in order until the first missing
    index and concatenated.

    Arguments:
        group: Object exposing a mapping-like ``attrs`` (e.g. an HDF5 group).
        name: Name of the attribute to load.

    Returns:
        list: The attribute entries. Bytes entries are decoded as UTF-8;
        entries that are already text are returned unchanged. An empty list
        is returned when neither ``name`` nor ``name0`` exists.
    """
    def _as_text(value):
        # Depending on the h5py version, string attributes come back either
        # as bytes or already as str; only bytes have to be decoded.
        return value.decode('utf8') if isinstance(value, bytes) else value

    if name in group.attrs:
        return [_as_text(n) for n in group.attrs[name]]

    data = []
    chunk_id = 0
    while '%s%d' % (name, chunk_id) in group.attrs:
        data.extend(_as_text(n) for n in group.attrs['%s%d' % (name, chunk_id)])
        chunk_id += 1
    return data

def convert_nested_model(weights):
    """Convert the weights of the layers nested inside a container layer.

    ``weights`` is split into a trainable part (the first
    ``len(layer.trainable_weights)`` entries) and a non-trainable part (the
    rest). For every sub-layer that has weights, its share of both parts is
    passed through ``preprocess_weights_for_loading`` and the results are
    collected again as "all trainable, then all non-trainable".

    NOTE(review): ``layer``, ``original_keras_version``, ``original_backend``
    and ``preprocess_weights_for_loading`` are not parameters; they are looked
    up as module-level names when the function runs, so they must exist in
    the notebook namespace before calling it.

    Arguments:
        weights: List of weights values (Numpy arrays).

    Returns:
        A list of weights values (Numpy arrays).
    """
    n_container_trainable = len(layer.trainable_weights)
    all_trainable = weights[:n_container_trainable]
    all_frozen = weights[n_container_trainable:]

    converted_trainable, converted_frozen = [], []
    # Read positions into the two pools; they advance past each consumed share
    trainable_pos = 0
    frozen_pos = 0

    for sublayer in layer.layers:
        n_trainable = len(sublayer.trainable_weights)
        n_frozen = len(sublayer.non_trainable_weights)
        if not sublayer.weights:
            continue

        share = (all_trainable[trainable_pos:trainable_pos + n_trainable] +
                 all_frozen[frozen_pos:frozen_pos + n_frozen])
        converted = preprocess_weights_for_loading(layer=sublayer,
                                                   weights=share,
                                                   original_keras_version=original_keras_version,
                                                   original_backend=original_backend)

        # The converted list keeps the layout: trainable first, then the rest
        converted_trainable.extend(converted[:n_trainable])
        converted_frozen.extend(converted[n_trainable:])

        trainable_pos += n_trainable
        frozen_pos += n_frozen

    return converted_trainable + converted_frozen

# Run the nested-weight conversion for every container layer of `m`.
# The loop variable must stay named `layer`: convert_nested_model() reads it
# as a module-level name.
# NOTE(review): `weights` is not assigned anywhere in the visible notebook
# before this cell, and each iteration feeds the previous result back in --
# confirm where the initial value is supposed to come from.
for layer in m.layers:
    if layer.__class__.__name__ not in ['Model', 'Sequential']:
        continue
    print(layer.name)
    weights = convert_nested_model(weights)

vision_model
audio_model


In [31]:
# Read the chunked attribute list ('layer_names0', 'layer_names1', ...)
# directly from the root of the checkpoint.
group = blob
data = []
names = 'layer_names'
chunk_id = 0
# The loop previously formatted the undefined/stale name `name` instead of
# the `names` variable assigned above.
while '%s%d' % (names, chunk_id) in group.attrs:
    # Decode only bytes entries; newer h5py versions may already return str.
    data.extend([n.decode('utf8') if isinstance(n, bytes) else n
                 for n in group.attrs['%s%d' % (names, chunk_id)]])
    chunk_id += 1