In [1]:
## Convert Tensorflow weights to PyTorch weights and save model
# Download VGGish 
# Download the audioset directory using subversion
# !apt-get -qq install subversion
#!svn checkout https://github.com/tensorflow/models/trunk/research/audioset

# Download audioset requirements
#!pip install numpy scipy
#!pip install resampy tensorflow six soundfile

# grab the VGGish model checkpoints & PCA params
#!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
#!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz

# Test install
#!mv audioset/* .
# from vggish_smoke_test import *

In [2]:
import tensorflow as tf
import vggish_slim

vggish_dict = {}
# load the model and get info 
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=True)
    vggish_slim.load_vggish_slim_checkpoint(sess,"vggish_model.ckpt")
    
    tvars = tf.trainable_variables()
    tvars_vals = sess.run(tvars)

    for var, val in zip(tvars, tvars_vals):
#         print(var.name, val)  # Prints the name of the variable alongside its value.
        print("%s" % (var.name))
        print("\t" + str(var.shape))
        vggish_dict[var.name] = val
    print("values written to vggish_dict")


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
vggish/conv1/weights:0
	(3, 3, 1, 64)
vggish/conv1/biases:0
	(64,)
vggish/conv2/weights:0
	(3, 3, 64, 128)
vggish/conv2/biases:0
	(128,)
vggish/conv3/conv3_1/weights:0
	(3, 3, 128, 256)
vggish/conv3/conv3_1/biases:0
	(256,)
vggish/conv3/conv3_2/weights:0
	(3, 3, 256, 256)
vggish/conv3/conv3_2/biases:0
	(256,)
vggish/conv4/conv4_1/weights:0
	(3, 3, 256, 512)
vggish/conv4/conv4_1/biases:0
	(512,)
vggish/conv4/conv4_2/weights:0
	(3, 3, 512, 512)
vggish/conv4/conv4_2/biases:0
	(512,)

In [11]:
import torch
import torch.nn as nn
import numpy as np
import torchvggish
# Define torch model for vggish
# From vggish_slim:
# The VGG stack of alternating convolutions and max-pools.
#     net = slim.conv2d(net, 64, scope='conv1')
#     net = slim.max_pool2d(net, scope='pool1')
#     net = slim.conv2d(net, 128, scope='conv2')
#     net = slim.max_pool2d(net, scope='pool2')
#     net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
#     net = slim.max_pool2d(net, scope='pool3')
#     net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
#     net = slim.max_pool2d(net, scope='pool4')
#     # Flatten before entering fully-connected layers
#     net = slim.flatten(net)
#     net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
#     # The embedding layer.
#     net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')

vggish_list = list(vggish_dict.values())
def param_generator(conv=False, bias=False):
    param = vggish_list.pop(0)
    if conv:
        transposed = np.transpose(param, (3, 2, 0, 1))
    else:
        transposed = np.transpose(param) # bias gets ignored this way
    to_torch = torch.from_numpy(transposed)
    result = torch.nn.Parameter(to_torch)
    yield result

class VGGish(nn.Module):
    
    def __init__(self):
        super(VGGish, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(  1,  64, kernel_size=3, padding=1), # [batch_size, 64, 64, 96]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),         # [batch_size, 64, 32, 48]  
            nn.Conv2d( 64, 128, kernel_size=3, padding=1), # [batch_size, 128, 32, 48]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),         # [batch_size, 128, 16, 24]
            nn.Conv2d(128, 256, kernel_size=3, padding=1), # [batch_size, 256, 16, 24]
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1), # [batch_size, 256, 16, 24]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),         # [batch_size, 256, 8, 12]
            nn.Conv2d(256, 512, kernel_size=3, padding=1), # [batch_size, 512, 8, 12]
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1), # [batch_size, 512, 8, 12]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)          # [batch_size, 512, 4, 6] 
        )
        self.embeddings = nn.Sequential(
            nn.Linear(512*4*6, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 128),
            nn.ReLU(inplace=True))
        # extract weights from `vggish_list`
        for seq in (self.features, self.embeddings):
            for layer in seq:
                if type(layer).__name__ != "MaxPool2d" and type(layer).__name__ != "ReLU":
                    if type(layer).__name__ == "Conv2d":
                        layer.weight = next(param_generator(conv=True))
                    else:
                        layer.weight = next(param_generator())
                    layer.bias = next(param_generator())
            
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0),-1)
        x = self.embeddings(x)
        return x

net = VGGish()
net.eval()

# Save model to disk
torch.save(net, "./vggish-model.pth")

# Save weights
model = torchvggish.VGGish()
model.load_state_dict(net.state_dict(), strict=False)
torch.save(model.state_dict(), "./vggish-weights.pth")

  "type " + obj.__name__ + ". It won't be checked "


In [13]:
# find the first 8 digits to rename vggish.pth to 
!shasum -a 256 ./vggish-weights.pth

824920086b9ba86f29fcc97607d44a0a409f001e950b4e2ab5e43a5169ea9415  ./vggish-weights.pth


In [42]:
# comparison
import os
import random
import gzip
import vggish_input
import vggish_postprocess
import vggish_params
from tqdm import tqdm_notebook
import scipy as sp

distances = []
for i in tqdm_notebook(range(5)):

    def example_generator(base_path):
        """
        Emits random examples from partition of audioset
        """
        eval_files = os.listdir(base_path)
        random.shuffle(eval_files)
        while True:
            eg_file = base_path + eval_files.pop(0)
            with gzip.open(eg_file, 'rb') as wav_file:
                yield vggish_input.wavfile_to_examples(wav_file)

    eval_path = "../../data/audioset_eval/"
    data_generator = example_generator(eval_path)
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
    examples_batch = next(data_generator)

    # Create an embedding with tensorflow vggish
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [tf_embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        tf_postprocessed_batch = pproc.postprocess(tf_embedding_batch)
    #     print(tf_postprocessed_batch)


    #Create an embedding with pytorch vggish
    pt_examples_batch = examples_batch[:,None,:,:]

    # print(examples_batch.shape)
    # pt_examples_batch = np.transpose(pt_examples_batch, (0, 3, 1, 2))
    pt_examples_batch = torch.from_numpy(pt_examples_batch).float()
    pt_embedding_batch = net.forward(pt_examples_batch)
    pt_embedding_batch = pt_embedding_batch.data.numpy()
    pt_postprocessed_batch = pproc.postprocess(pt_embedding_batch)

    # print("PyTorch: {}".format(pt_postprocessed_batch[0]))
    # print("TensorF: {}".format(tf_postprocessed_batch[0]))
    for i in range(len(pt_postprocessed_batch)):
        distances.append(sp.spatial.distance.cosine(pt_postprocessed_batch[i], 
                                                    tf_postprocessed_batch[i]))
print("Mean Distance of embeddings (0=identical, -1=orthogonal): {}".format(np.mean(distances)))
print("Min distance of embeddings: {}".format(np.min(distances)))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

INFO:tensorflow:Restoring parameters from vggish_model.ckpt
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
INFO:tensorflow:Restoring parameters from vggish_model.ckpt

Mean Distance of embeddings (0=identical, -1=orthogonal): -0.41687705060396363
Min distance of embeddings: -0.9207252786073556
