In [1]:
import os 
import io
import random
import h5py
import numpy as np
from keras import models
from decrypt import read_encrypted_tar_audio_file
from kapre.time_frequency import Melspectrogram

Using TensorFlow backend.


In [2]:
def get_raw_windows_from_encrypted_audio(audio_path, tar_data, sample_rate=8000, clip_duration=10,
                                         decrypt_url='https://decrypt-sonyc.engineering.nyu.edu/decrypt',
                                         cacert_path='/home/jtc440/sonyc/decrypt/CA.pem',
                                         cert_path='/home/jtc440/sonyc/decrypt/jason_data.pem',
                                         key_path='/home/jtc440/sonyc/decrypt/sonyc_key.pem'):
    
    audio = read_encrypted_tar_audio_file(audio_path,
                                          enc_tar_filebuf=tar_data,
                                          sample_rate=sample_rate,
                                          url=decrypt_url,
                                          cacert=cacert_path,
                                          cert=cert_path,
                                          key=key_path)[0]
    if audio is None:
        return None

    audio_len = int(sample_rate * clip_duration)

    # Make sure audio is all consistent length (10 seconds)
    if len(audio) > audio_len:
        audio = audio[:audio_len]
    elif len(audio) < audio_len:
        pad_len = audio_len - len(audio)
        audio = np.pad(audio, (0, pad_len), mode='constant')

    # Return raw windows
    return get_audio_windows(audio, sr=sample_rate)


def get_audio_windows(audio, sr=8000, center=True, hop_size=0.5):
    """
    Similar to openl3.get_embedding(...)
    """

    def _center_audio(audio, frame_len):
        """Center audio so that first sample will occur in the middle of the first frame"""
        return np.pad(audio, (int(frame_len / 2.0), 0), mode='constant', constant_values=0)

    def _pad_audio(audio, frame_len, hop_len):
        """Pad audio if necessary so that all samples are processed"""
        audio_len = audio.size
        if audio_len < frame_len:
            pad_length = frame_len - audio_len
        else:
            pad_length = int(np.ceil((audio_len - frame_len) / float(hop_len))) * hop_len \
                         - (audio_len - frame_len)

        if pad_length > 0:
            audio = np.pad(audio, (0, pad_length), mode='constant', constant_values=0)

        return audio

    # Check audio array dimension
    if audio.ndim > 2:
        raise AssertionError('Audio array can only be be 1D or 2D')
    elif audio.ndim == 2:
        # Downmix if multichannel
        audio = np.mean(audio, axis=1)

    audio_len = audio.size
    frame_len = sr
    hop_len = int(hop_size * sr)

    if audio_len < frame_len:
        warnings.warn('Duration of provided audio is shorter than window size (1 second). Audio will be padded.')

    if center:
        # Center audio
        audio = _center_audio(audio, frame_len)

    # Pad if necessary to ensure that we process all samples
    audio = _pad_audio(audio, frame_len, hop_len)

    # Split audio into frames, copied from librosa.util.frame
    n_frames = 1 + int((len(audio) - frame_len) / float(hop_len))
    x = np.lib.stride_tricks.as_strided(audio, shape=(frame_len, n_frames),
                                        strides=(audio.itemsize, hop_len * audio.itemsize)).T

    # Add a channel dimension
    # x = x.reshape((x.shape[0], 1, x.shape[-1]))

    return x

In [3]:
#'/scratch/sk7898/embedding_approx_mse/models/sonyc/pca/dpp/day/500000/pca_batch_500000_len_128_kernel_linear/8000_64_160_1024_half_fmax_None/20201004094550'
model_dir = '/scratch/sk7898/embedding_approx_mse/models/sonyc/mse_original/8000_64_160_1024_fmax_None/20200909145902'
weight_path = os.path.join(model_dir, 'model_best_valid_loss.h5')
model = models.load_model(weight_path, custom_objects={'Melspectrogram': Melspectrogram})

#print(model.summary())










In [4]:
train_files = []
data_dir = '/scratch/sk7898/sonyc_30mil/train'

for i in range(60):
    part = random.randint(0, 15)
    split = random.randint(0, 2000)
    fname = 'sonyc_ndata=2500000_part={}_split={}.h5'.format(part, split)
    if os.path.exists(os.path.join(data_dir, fname)):
        train_files.append(fname)

print('{} from training set: {}'.format(len(train_files), train_files))

47 from training set: ['sonyc_ndata=2500000_part=7_split=1264.h5', 'sonyc_ndata=2500000_part=7_split=25.h5', 'sonyc_ndata=2500000_part=5_split=557.h5', 'sonyc_ndata=2500000_part=12_split=1498.h5', 'sonyc_ndata=2500000_part=10_split=965.h5', 'sonyc_ndata=2500000_part=7_split=1969.h5', 'sonyc_ndata=2500000_part=9_split=516.h5', 'sonyc_ndata=2500000_part=8_split=464.h5', 'sonyc_ndata=2500000_part=4_split=1341.h5', 'sonyc_ndata=2500000_part=8_split=364.h5', 'sonyc_ndata=2500000_part=9_split=352.h5', 'sonyc_ndata=2500000_part=1_split=1715.h5', 'sonyc_ndata=2500000_part=11_split=226.h5', 'sonyc_ndata=2500000_part=2_split=919.h5', 'sonyc_ndata=2500000_part=10_split=508.h5', 'sonyc_ndata=2500000_part=0_split=819.h5', 'sonyc_ndata=2500000_part=0_split=1617.h5', 'sonyc_ndata=2500000_part=10_split=1671.h5', 'sonyc_ndata=2500000_part=2_split=91.h5', 'sonyc_ndata=2500000_part=14_split=1583.h5', 'sonyc_ndata=2500000_part=11_split=1558.h5', 'sonyc_ndata=2500000_part=5_split=1681.h5', 'sonyc_ndata=250

In [5]:
mse_error = 0
print('Error between openl3 reference embeddings and reduced input student model\'s predicted embeddings')

for fname in train_files:
    idxs = sorted(random.sample(range(1024), 10))
    data_batch_path = os.path.join(data_dir, fname)
    data_blob = h5py.File(data_batch_path, 'r')
    audio_batch = np.array(data_blob['audio'][idxs])[:, np.newaxis, :]
    ref_embs = data_blob['l3_embedding'][idxs]
    pred_embs = model.predict(audio_batch)
    mse_error += np.mean((ref_embs - pred_embs)**2)
print('MSE on Training subset: ', mse_error/len(train_files))

Error between openl3 reference embeddings and reduced input student model's predicted embeddings
MSE on Training subset:  0.0871038207031311


In [15]:
feats_list = []
audio_list = []
audio_dir = '/scratch/work/sonyc'
indices_dir = '/scratch/work/sonyc/indices/2017'
feats_dir = '/scratch/work/sonyc/features/openl3/2017'
test_sensors = [
    'sonycnode-b827ebc6dcc6.sonyc_features_openl3.h5',
    'sonycnode-b827ebba613d.sonyc_features_openl3.h5',
    'sonycnode-b827ebad073b.sonyc_features_openl3.h5',
    'sonycnode-b827eb0fedda.sonyc_features_openl3.h5',
    'sonycnode-b827eb44506f.sonyc_features_openl3.h5'
]
for path in test_sensors:
    h5_path = os.path.join(feats_dir, path)
    f = h5py.File(h5_path, 'r')
    num_datasets = f[list(f.keys())[0]].shape[0]
    for i in range(10):
        dataset_index = np.random.randint(0, num_datasets)
        num_features = f[list(f.keys())[0]][dataset_index]['openl3'].shape[0]
        index = h5py.File(
            os.path.join(
                indices_dir, 
                os.path.basename(h5_path).split('.')[0]+'.sonyc_recording_index.h5'), 'r'
                )
        audio_file_name = os.path.join(audio_dir,
                                       index[list(index.keys())[0]][dataset_index]['day_hdf5_path'].decode()
                                       )
        row = index[list(index.keys())[0]][dataset_index]['day_h5_index']
        audio_file = h5py.File(audio_file_name, 'r')
        tar_data = io.BytesIO(audio_file['recordings'][row]['data'])
        raw_audio = get_raw_windows_from_encrypted_audio(audio_file_name, tar_data, sample_rate=8000)

        if raw_audio is None:
            continue
        feature_index = np.random.randint(0, num_features)
        feats_list.append(f[list(f.keys())[0]][dataset_index]['openl3'][feature_index])
        audio_list.append(raw_audio[feature_index])

print(len(audio_list), len(feats_list))


50 50


In [16]:
test_error = 0
for audio, ref_embs in zip(audio_list, feats_list):
    audio_batch = audio.reshape((1, 1, audio.shape[-1]))
    pred_embs = model.predict(audio_batch)
    test_error += np.mean((ref_embs - pred_embs)**2)
print('MSE on Test subset: ', mse_error/len(audio_list))

MSE on Test subset:  0.35140286549925803
