In [1]:
import numpy as np
import torch
import librosa
import librosa.display
import librosa.feature
import tensorflow.compat.v1 as tf


#from scripts
from Attention import *
from arguments import parse_arguments 
import torch
import numpy as np
from utils import *
from Attention import *
from data_utils import *
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler

from vggish_smoke_test import *;
import vggish_slim
import vggish_params
import vggish_input
%matplotlib inline


Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47303259 -4.29463765 -4.14939193 ... -3.97474254 -3.94778045
  -3.78685566]
 [-4.48592983 -4.28831745 -4.13994942 ... -3.98374974 -3.94981089
  -3.79512755]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 ...
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]]




INFO:tensorflow:Restoring parameters from vggish_model.ckpt
VGGish embedding:  [-0.43252084 -0.25330514 -0.03891924 -0.16376027 -0.3499182  -0.59936893
 -0.05658104  0.16280285 -0.7555176  -0.08260237 -0.03138635 -0.83147156
 -0.10581692 -0.01420227 -0.1107798  -0.06599119 -0.22666278  0.8060121
 -0.56459844 -0.07349294 -0.06056742 -0.11864138 -0.26290444 -0.4155161
 -0.02423218  0.36676204  0.03564948 -0.549977   -0.00279108 -0.28981644
 -0.57134503  0.3810783   0.1366871   0.9188573   0.80642533 -0.05767322
 -0.13229543 -0.05044432 -0.22702815  0.04124349  0.7088706  -0.72661525
  0.4956671   0.24034092  0.21580261  0.88385975  1.1954073   0.6688216
  0.20919633  0.01531461  0.17449082 -0.6544126  -0.15788004  0.25017852
 -0.26469558 -0.3989996   0.14588487 -0.18502603  0.39927036  0.3041697
  0.1294817  -0.11220933 -0.4023689  -0.5374395  -0.36152244 -0.21291585
  0.5371816  -0.30606014 -0.08813701  0.04871783  0.42514458  0.18669182
 -0.17835425 -0.0693139   0.1470107  -0.2758583  

In [2]:
def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  """Define VGGish model, load the checkpoint, and return a dictionary that points
  to the different tensors defined by the model.
  """
  vggish_slim.define_vggish_slim()
  checkpoint_path = 'vggish_model.ckpt'
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  layers = {'conv1': 'vggish/conv1/Relu',
            'pool1': 'vggish/pool1/MaxPool',
            'conv2': 'vggish/conv2/Relu',
            'pool2': 'vggish/pool2/MaxPool',
            'conv3': 'vggish/conv3/conv3_2/Relu',
            'pool3': 'vggish/pool3/MaxPool',
            'conv4': 'vggish/conv4/conv4_2/Relu',
            'pool4': 'vggish/pool4/MaxPool',
            'fc1': 'vggish/fc1/fc1_2/Relu',
            #'fc2': 'vggish/fc2/Relu',
            'embedding': 'vggish/embedding',
            'features': 'vggish/input_features',
         }
  g = tf.get_default_graph()
  for k in layers:
    layers[k] = g.get_tensor_by_name( layers[k] + ':0')
  return {'features': features_tensor,
          'embedding': embedding_tensor,
          'layers': layers,
         }

In [3]:
def ProcessWithVGGish(vgg, x, sr):
  '''Run the VGGish model, starting with a sound (x) at sample rate
  (sr). Return a whitened version of the embeddings. Sound must be scaled to be
  floats between -1 and +1.'''
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)
  # print('Log Mel Spectrogram example: ', input_batch[0])
  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})
  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_pca_params.npz'
  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  # print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
  return postprocessed_batch[0]

In [4]:
def _ProcessWithVGGish(vgg, x, sr):
  '''Run the VGGish model, starting with a sound (x) at sample rate
  (sr). Return a whitened version of the embeddings. Sound must be scaled to be
  floats between -1 and +1.'''
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)
  # print('Log Mel Spectrogram example: ', input_batch[0])
  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})
  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_pca_params.npz'
  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  # print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
  return postprocessed_batch


In [5]:
# Test these new functions with the original test.
tf.compat.v1.disable_eager_execution()

tf.reset_default_graph()

sess = tf.Session()

vgg = CreateVGGishNetwork(0.21);



INFO:tensorflow:Restoring parameters from vggish_model.ckpt


In [32]:
from IPython.display import Video

#Video("multiple3.mp4", embed=True)

In [40]:
y_v, sr_v  = librosa.load('/private/var/hormone03/Dropbox/My Mac (Mac’s MacBook Pro)/Desktop/avid/instrumentClassification/AttentionMIC_master/multiple3.mp4')


In [41]:
X_v = _ProcessWithVGGish(vgg, y_v, sr_v)


In [42]:
#X_v.shape
k_v1, k_v2 = X_v.shape
X_v = np.asarray(X_v).reshape(-1,k_v1,k_v2)
print(X_v.shape)
# Your code here. Aim for 2-4 lines.
X_tst_torch_v = (torch.from_numpy(X_v)).type(torch.float32)
#y_tst_torch_v = torch.from_numpy(y_tst).type(torch.float32)
X_tst_torch_v =(X_tst_torch_v/255)

(1, 1397, 128)


In [43]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [44]:
modelPath = '/private/var/hormone03/Dropbox/My Mac (Mac’s MacBook Pro)/Desktop/avid/instrumentClassification/AttentionMIC_master/trainedModel_0.pth'

model = DecisionLevelSingleAttention(
                freq_bins=128,
                classes_num=20,
                emb_layers=3,
                hidden_units=128,
                drop_rate=0.6)


#Restore model
model.load_state_dict(torch.load(modelPath))
model = model.eval()


model = model.to(device)


with torch.no_grad():
    output_v = model(X_tst_torch_v)

output_v = to_numpy(output_v).ravel()
print(output_v.shape)
print(output_v) 
outputIndex = np.where(output_v >= 0.9)
print(outputIndex)


55336
(20,)
[8.4661275e-02 4.5212246e-03 6.8876579e-02 7.0413955e-02 1.0355338e-01
 7.2207522e-01 4.1174740e-01 5.2094662e-01 2.6439232e-01 3.4639583e-05
 1.6299731e-03 9.8015286e-04 3.3898365e-01 9.9564481e-01 1.5991399e-01
 9.9848616e-01 9.9915504e-01 1.6879686e-03 4.7329325e-02 9.2277890e-01]
(array([13, 15, 16, 19]),)


In [45]:
classMap= {'accordion': 0,
 'banjo': 1,
 'bass': 2,
 'cello': 3,
 'clarinet': 4,
 'cymbals': 5,
 'drums': 6,
 'flute': 7,
 'guitar': 8,
 'mallet_percussion': 9,
 'mandolin': 10,
 'organ': 11,
 'piano': 12,
 'saxophone': 13,
 'synthesizer': 14,
 'trombone': 15,
 'trumpet': 16,
 'ukulele': 17,
 'violin': 18,
 'voice': 19}
inv_map = {v: k for k, v in classMap.items()}


In [46]:
for i in outputIndex[0]:
    print(i)
    print(inv_map[i])
    

13
saxophone
15
trombone
16
trumpet
19
voice


In [20]:
accordion 0.06983575
banjo 0.33101374
bass 0.020017007
cello 0.06161041
clarinet 0.12218426
cymbals 0.0016985144
drums 0.014519502
flute 0.700888
guitar 0.8985557
mallet_percussion 0.5362431
mandolin 0.59339917
organ 0.045906793
piano 0.7231139
saxophone 0.0318367
synthesizer 0.23019123
trombone 0.010454232
trumpet 0.025121506
ukulele 0.4342751
violin 0.25603348
voice 0.0148326345

flute, guitar,mallet_percussion, piano, 

SyntaxError: invalid syntax (<ipython-input-20-e6e4daba5a09>, line 1)