In [1]:
import numpy as np
from scipy.io import wavfile
import six
import tensorflow as tf

import audioset.vggish_input as vggish_input
import audioset.vggish_params as vggish_params
import audioset.vggish_postprocess as vggish_postprocess
import audioset.vggish_slim as vggish_slim

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
vggish_params

<module 'audioset.vggish_params' from '/Users/nsteins/Documents/LaughDetector/audioset/vggish_params.py'>

In [3]:
flags = tf.app.flags

flags.DEFINE_string(
    'wav_file', None,
    'Path to a wav file. Should contain signed 16-bit PCM samples. '
    'If none is provided, a synthetic sound is used.')

flags.DEFINE_string(
    'checkpoint', 'audioset/vggish_model.ckpt',
    'Path to the VGGish checkpoint file.')

flags.DEFINE_string(
    'pca_params', 'audioset/vggish_pca_params.npz',
    'Path to the VGGish PCA parameters file.')

flags.DEFINE_string(
    'tfrecord_file', None,
    'Path to a TFRecord file where embeddings will be written.')

FLAGS = flags.FLAGS


In [4]:
if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
examples_batch = vggish_input.wavfile_to_examples(wav_file)
print(examples_batch)

[[[-4.47771674 -4.29076017 -4.1532819  ... -3.98410919 -3.93030654
   -3.76891238]
  [-4.487119   -4.28663482 -4.14313869 ... -3.98851528 -3.9346542
   -3.78824514]
  [-4.460583   -4.29542372 -4.14568167 ... -3.97235414 -3.9172474
   -3.79094514]
  ...
  [-4.45912554 -4.29392363 -4.14827892 ... -3.9534104  -3.94283579
   -3.78986623]
  [-4.45680333 -4.29347445 -4.15260834 ... -3.96181455 -3.92302184
   -3.7812338 ]
  [-4.46679676 -4.28966581 -4.15162289 ... -3.95698292 -3.94577437
   -3.77947337]]

 [[-4.46342334 -4.29210809 -4.1494392  ... -3.96689471 -3.94103314
   -3.78345147]
  [-4.46518298 -4.29021104 -4.14963754 ... -3.9627394  -3.92718488
   -3.78323181]
  [-4.46232002 -4.29156065 -4.15015574 ... -3.96022267 -3.94609914
   -3.79095732]
  ...
  [-4.47229232 -4.29333028 -4.15550377 ... -3.96550264 -3.92026994
   -3.78052026]
  [-4.47493192 -4.2918865  -4.15142573 ... -3.98084516 -3.93156007
   -3.7891335 ]
  [-4.47940576 -4.29223346 -4.14987884 ... -3.96548313 -3.91986638
   -3.79

In [5]:
# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

# If needed, prepare a record writer to store the postprocessed embeddings.
writer = tf.python_io.TFRecordWriter(
    FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)

    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(
                        feature=[
                            tf.train.Feature(
                                bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ]
                    )
            }
        )
    )
    print(seq_example)
    if writer:
        writer.write(seq_example.SerializeToString())

if writer:
    writer.close()

INFO:tensorflow:Restoring parameters from audioset/vggish_model.ckpt
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.71822399e-01
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 2.36853957e-02 0.00000000e+00 0.00000000e+00
  0.00000000e+00 7.81700015e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.63987952e-01 0.00000000e+00 0.00000000e+00
  9.56024230e-03 0.00000000e+00 0.00000000e+00 3.86006594e-01
  1.06383190e-01 9.26400661e-01 8.06896687e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 8.68075788e-02
  6.98059082e-01 0.00000000e+00 4.93896574e-01 2.39039749e-01
  2.26331875e-01 8.92517567e-01 1.19550240e+00 6.63886309e-01
  2.25392967e-01 1.75379515e-02 1.64439484e-01 0.00000000e+00
  0.00000000e+00 2.42111042e-01 0.00000000e+00 0.00000000e+00
  1.59409791e-01 0.00000000e+00 3.97405505e-01 2.94068903e-01
 