In [4]:
# Define test files
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import gzip
import random
import vggish_input
import vggish_postprocess
import vggish_params
import vggish_slim
import tensorflow as tf
import torchvggish
import torch 


def example_generator(base_path):
    """
    Emits random examples from partition of audioset
    """
    eval_files = os.listdir(base_path)
    random.shuffle(eval_files)
    while True:
        eg_file = base_path + eval_files.pop(0)
        with gzip.open(eg_file, 'rb') as wav_file:
            yield vggish_input.wavfile_to_examples(wav_file)
            
eval_path = "../../data/audioset_eval/"
data_generator = example_generator(eval_path)
pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
examples_batch = next(data_generator)

In [5]:
# Create an embedding with tensorflow vggish
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [tf_embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    tf_postprocessed_batch = pproc.postprocess(tf_embedding_batch)
    print(tf_postprocessed_batch)

INFO:tensorflow:Restoring parameters from vggish_model.ckpt
[[186  29 203 ... 164   0 255]
 [187  31 180 ... 128   0 255]
 [192  40 209 ... 111   0 255]
 ...
 [188  36 203 ...   0   0 255]
 [180  22 173 ...   0   0 255]
 [182  27 190 ...   0  47 255]]


In [8]:
#Create an embedding with pytorch vggish
pt_examples_batch = examples_batch[:,None,:,:]
pt_examples_batch = torch.from_numpy(pt_examples_batch).float()

!pwd
net = torchvggish.VGGish()
net.load_state_dict(torch.load("vggish-model.pth"))
pt_embedding_batch = net.forward(pt_examples_batch)
pt_embedding_batch = pt_embedding_batch.data.numpy()

pt_postprocessed_batch = pproc.postprocess(pt_embedding_batch)
print(pt_postprocessed_batch)
pt_postprocessed_batch == tf_postprocessed_batch

/Users/harrisontaylor/Workspace/Research/audio-experiments/torchvggish
[[244   0 158 ... 204 255 255]
 [237   0 160 ... 219 255 255]
 [233   0 157 ...  78 255 255]
 ...
 [233   0 161 ... 244 255 255]
 [234   0 155 ... 255 255 255]
 [240   0 155 ... 255 255 255]]


array([[False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       ...,
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True]])

In [5]:
# Shit. Okay, time to debug...


In [7]:
# Keras Version :-) 

"""VGGish model for Keras. A VGG-like model for audio classification
# Reference
- [CNN Architectures for Large-Scale Audio Classification](ICASSP 2017)
"""



from keras.models import Model
from keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.engine.topology import get_source_inputs
from keras import backend as K


# weight path
WEIGHTS_PATH = './keras_weights.h5'

def KerasVGGish(load_weights=True, weights='audioset',
           input_tensor=None, input_shape=None,
           out_dim=None, pooling='avg'):
    '''
    An implementation of the VGGish architecture.
    :param load_weights: if load weights
    :param weights: loads weights pre-trained on a preliminary version of YouTube-8M.
    :param input_tensor: input_layer
    :param input_shape: input data shape
    :param out_dim: output dimension
    :param include_top:whether to include the 3 fully-connected layers at the top of the network.
    :param pooling: pooling type over the non-top network, 'avg' or 'max'
    :return: A Keras model instance.
    '''

    if weights not in {'audioset', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `audioset` '
                         '(pre-training on audioset).')

    if out_dim is None:
        out_dim = vggish_params.EMBEDDING_SIZE

    # input shape
    if input_shape is None:
        input_shape = (vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS, 1)

    if input_tensor is None:
        aud_input = Input(shape=input_shape, name='input_1')
    else:
        if not K.is_keras_tensor(input_tensor):
            aud_input = Input(tensor=input_tensor, shape=input_shape, name='input_1')
        else:
            aud_input = input_tensor


    # Block 1
    x = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.0')(aud_input)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='features.2')(x)

    # Block 2
    x = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='features.5')(x)

    # Block 3
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.6')(x)
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.8')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='features.10')(x)

    # Block 4
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.11')(x)
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='features.13')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='features.15')(x)
    
    x = Flatten(name='flatten_')(x)
    x = Dense(4096, activation='relu', name='embeddings.0')(x)
    x = Dense(4096, activation='relu', name='embeddings.2')(x)
    x = Dense(out_dim, activation='relu', name='embeddings.4')(x)


    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = aud_input
    # Create model.
    model = Model(inputs, x, name='VGGish')


    # load weights
    model.load_weights(WEIGHTS_PATH)

    return model

Using TensorFlow backend.


In [8]:
# keras_net = KerasVGGish()
# k_embedding_batch = keras_net.predict(examples_batch[:,:,:,None])
# k_postprocessed_batch = pproc.postprocess(k_embedding_batch)
# k_postprocessed_batch == tf_postprocessed_batch

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [2]:
#Try out the nn-transfer library

from torchvggish import VGGish
import torch

pytorch_network = VGGish()
print(pytorch_network)
pytorch_network.save("torch_net.pth")


ImportError: dlopen(/Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.6/site-packages/torch/_C.cpython-36m-darwin.so, 9): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.6/site-packages/torch/lib/libshm.dylib
  Reason: image not found

In [10]:
keras_network = KerasVGGish()

In [11]:
transfer.keras_to_pytorch(keras_network, pytorch_network)

Layer names in PyTorch state_dict ['features.0', 'features.3', 'features.6', 'features.8', 'features.11', 'features.13', 'embeddings.0', 'embeddings.2', 'embeddings.4']
Layer names in Keras HDF5 ['embeddings.0', 'embeddings.2', 'embeddings.4', 'features.0', 'features.10', 'features.11', 'features.13', 'features.15', 'features.2', 'features.3', 'features.5', 'features.6', 'features.8', 'flatten_', 'input_1']


In [12]:
k_embedding_batch = keras_network.predict(examples_batch[:,:,:,None])
k_postprocessed_batch = pproc.postprocess(k_embedding_batch)
k_postprocessed_batch


array([[162,  19, 169, ...,   0,  44, 255],
       [160,  16, 163, ...,   0, 236, 255],
       [161,  20, 166, ...,   0, 116, 255],
       ...,
       [156,  13, 163, ...,   0, 176, 255],
       [152,  13, 156, ...,  20,  30, 255],
       [149,   0, 158, ...,   0,  61, 255]], dtype=uint8)

In [13]:
pt_examples_batch = examples_batch[:,None,:,:]
pt_examples_batch = torch.from_numpy(pt_examples_batch).float()


pt_embedding_batch = pytorch_network.forward(pt_examples_batch)
pt_embedding_batch = pt_embedding_batch.data.numpy()

pt_postprocessed_batch = pproc.postprocess(pt_embedding_batch)
pt_postprocessed_batch

array([[244,   0, 154, ..., 255, 255, 255],
       [241,   0, 152, ..., 255, 255, 255],
       [247,   0, 147, ..., 255, 255, 255],
       ...,
       [251,   0, 146, ..., 255, 255, 255],
       [242,   0, 151, ..., 255, 255, 255],
       [230,   0, 159, ..., 101, 255, 255]], dtype=uint8)

In [18]:
print(keras_network.summary())
print(pytorch_network)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 96, 64, 1)         0         
_________________________________________________________________
features.0 (Conv2D)          (None, 96, 64, 64)        640       
_________________________________________________________________
features.2 (MaxPooling2D)    (None, 48, 32, 64)        0         
_________________________________________________________________
features.3 (Conv2D)          (None, 48, 32, 128)       73856     
_________________________________________________________________
features.5 (MaxPooling2D)    (None, 24, 16, 128)       0         
_________________________________________________________________
features.6 (Conv2D)          (None, 24, 16, 256)       295168    
_________________________________________________________________
features.8 (Conv2D)          (None, 24, 16, 256)       590080    
__________

tensor([[0.0000e+00, 0.0000e+00, 7.7917e-03, 5.8261e-03, 0.0000e+00, 1.4602e-02,
         1.2129e-02, 0.0000e+00, 9.5376e-03, 0.0000e+00, 0.0000e+00, 5.3788e-03,
         0.0000e+00, 8.2478e-03, 6.0739e-03, 0.0000e+00, 1.6649e-02, 3.0965e-04,
         1.5750e-02, 0.0000e+00, 4.4081e-03, 5.2477e-03, 0.0000e+00, 1.3824e-02,
         6.4700e-03, 8.3422e-03, 0.0000e+00, 0.0000e+00, 5.4149e-03, 0.0000e+00,
         4.4764e-03, 3.5309e-03, 9.0604e-03, 0.0000e+00, 8.9312e-03, 0.0000e+00,
         0.0000e+00, 1.1059e-02, 0.0000e+00, 9.1706e-03, 1.3428e-02, 4.2193e-03,
         1.1189e-02, 0.0000e+00, 4.8923e-04, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         2.4982e-03, 8.8991e-03, 0.0000e+00, 2.7875e-03, 1.5925e-02, 5.8490e-04,
         4.3007e-03, 3.8881e-03, 9.0912e-04, 0.0000e+00, 0.0000e+00, 8.1742e-03,
         0.0000e+00, 1.2945e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.8607e-03,
         1.2168e-02, 0.0000e+00, 0.0000e+00, 1.3794e-02, 1.0272e-02, 1.3082e-02,
         3.2369e-05, 8.8510e

In [23]:
data = torch.rand(1,1,64,96).float()
data_keras = data.numpy().swapaxes(1,3)

pt_result = pytorch_network.forward(data)
ks_result = keras_network.predict(data_keras)
ks_result == pt_result.data.numpy() # false

array([[ True, False, False, False, False, False, False,  True, False,
         True,  True, False,  True, False, False, False, False, False,
        False,  True, False, False, False, False, False, False,  True,
         True, False,  True, False, False, False, False, False, False,
         True, False,  True, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False,
        False, False, False,  True, False, False,  True, False,  True,
         True, False, False, False, False,  True, False, False, False,
        False, False, False, False,  True, False, False, False, False,
        False, False,  True, False, False, False,  True, False, False,
        False, False, False, False, False, False,  True, False, False,
        False,  True, False, False, False, False, False, False,  True,
        False,  True,  True, False, False,  True,  True,  True, False,
        False,  True, False, False, False, False, False,  True,  True,
      

In [24]:
with open("vggish_keras.json", "w") as f:
    f.write(keras_network.to_json())
    

In [25]:
import onnxmltools
onnx_model = onnxmltools.convert_keras(keras_network)

Instructions for updating:
Use tf.compat.v1.graph_util.remove_training_nodes
using tensorflow=1.13.1, onnx=1.5.0, opset=10, tfonnx=1.5.0/82f805
