In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import logging
import random
import csv
import json
import glob
import numpy as np
import resampy
import tensorflow as tf
import soundfile as sf
import librosa
import pandas as pd

LOGGER = logging.getLogger('emb-gen-ust')
LOGGER.setLevel(logging.DEBUG)

In [2]:
def load_audio(path, sr):
    """
    Load audio file
    """
    data, sr_orig = sf.read(path, dtype='float32', always_2d=True)
    data = data.mean(axis=-1)

    if sr_orig != sr:
        data = resampy.resample(data, sr_orig, sr)

    return data

def amplitude_to_db(S, amin=1e-10, dynamic_range=80.0):
    magnitude = np.abs(S)
    power = np.square(magnitude, out=magnitude)
    ref_value = power.max()

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= log_spec.max()

    log_spec = np.maximum(log_spec, -dynamic_range)

    return log_spec

In [32]:
def get_l3_frames_uniform_tflite(audio, interpreter, input_index, output_index, output_shape,
                                 n_fft=2048, n_mels=256, mel_hop_length=242,
                                 hop_size=0.1, sr=48000, fmax=None, embedding_length=256):
    
    if type(audio) == str:
        audio = load_audio(audio, sr)

    hop_size = hop_size
    hop_length = int(hop_size * sr)
    frame_length = sr * 1

    audio_length = len(audio)
    if audio_length < frame_length:
        # Make sure we can have at least one frame of audio
        pad_length = frame_length - audio_length
    else:
        # Zero pad so we compute embedding on all samples
        pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \
                     - (audio_length - frame_length)

    if pad_length > 0:
        # Use (roughly) symmetric padding
        left_pad = pad_length // 2
        right_pad= pad_length - left_pad
        audio = np.pad(audio, (left_pad, right_pad), mode='constant')
   
    frames = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T
    X = []
    for frame in frames:
        S = np.abs(librosa.core.stft(frame, n_fft=n_fft, hop_length=mel_hop_length,\
                                     window='hann', center=True,\
                                     pad_mode='constant'))
        S = librosa.feature.melspectrogram(sr=sr, S=S, n_mels=n_mels, fmax=fmax,
                                           power=1.0, htk=True)
        S = amplitude_to_db(np.array(S))
        X.append(S)
   
    predictions = []
    for idx in range(len(X)):
        #print(np.array(X[idx]).shape)
        x = np.array(X[idx])[np.newaxis, :, :, np.newaxis].astype(np.float32)
        interpreter.set_tensor(input_index, x)
        interpreter.invoke()
        output = interpreter.get_tensor(output_index)
        predictions.append(np.reshape(output, (output.shape[0], output.shape[-1])))

    return np.array(predictions)

In [33]:
def generate_sonyc_ust_data(annotation_path, dataset_dir, output_dir, tflite_model_path, hop_size=0.1,
                            n_fft=1024, n_mels=64, mel_hop_length=160, sr=8000, fmax=None):
    
    print("* Loading annotations.")
    annotation_data = pd.read_csv(annotation_path).sort_values('audio_filename')

    os.makedirs(output_dir, exist_ok=True)

    df = annotation_data[['split', 'audio_filename']].drop_duplicates()
    row_iter = df.iterrows()

    interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    input_shape = input_details[0]['shape'][1:]
    output_shape = output_details[0]['shape'][1:]
    input_index = input_details[0]['index']
    output_index = output_details[0]['index']
    emb_len = output_shape[-1]
    
    interpreter.allocate_tensors()
    
    LOGGER.info('* Extracting embeddings.')

    c = 0
    for _, row in row_iter:
        c = c + 1
        filename = row['audio_filename']
        split_str = row['split']
        audio_path = os.path.join(dataset_dir, split_str, filename)
        output_path = os.path.join(output_dir, os.path.splitext(filename)[0] + '.npz')

        if c == 2:
            return
            
        if not os.path.exists(audio_path):
            LOGGER.info('Audio file {} doesn''t exist'.format(audio_path))
            continue

        if os.path.exists(output_path):
            LOGGER.info('Output file {} already exists'.format(output_path))
            return
            
        X = get_l3_frames_uniform_tflite(audio_path, interpreter, input_index, output_index,
                                         output_shape, hop_size=hop_size, n_fft=n_fft,
                                         n_mels=n_mels, mel_hop_length=mel_hop_length,
                                         sr=sr, fmax=fmax, embedding_length=emb_len)
        print(X)
        if X is None:
            LOGGER.error('Could not generate data for {}'.format(audio_path))
            return
        
        np.savez(output_path, embedding=X)

In [37]:
if __name__ == '__main__':
    
    data_dir = '/beegfs/dr2915/sonyc_ust'
    out_prefix = '/scratch/sk7898/embeddings/features/sonyc_ust/l3'
    model_des = 'l3_audio_20200304152812_8000_64_160_1024_half'
    #quant_mode = 'size_float16'
    #quant_mode = 'size_int8'
    quant_mode = 'size_int8_uint8'
    #quant_mode = 'default_int8_uint8' 
    #quant_mode = 'latency_int8_uint8' 
    annotation_path = os.path.join(data_dir, 'annotations.csv')
    dataset_output_dir = os.path.join(out_prefix, model_des, quant_mode)
    model_dir = os.path.join('/scratch/sk7898/quantization', model_des)
    model_path = os.path.join(model_dir, 'quantized_'+ quant_mode + '.tflite')
    
    splits = model_des.split('_')
    hop_size = 0.1
    samp_rate = int(splits[3])
    n_mels = int(splits[4])
    mel_hop_length = int(splits[5])
    n_fft = int(splits[6])
    
    generate_sonyc_ust_data(annotation_path=annotation_path, dataset_dir=data_dir, output_dir=dataset_output_dir,\
                            tflite_model_path=model_path, hop_size=hop_size, n_fft=n_fft, n_mels=n_mels,\
                            mel_hop_length=mel_hop_length, sr=samp_rate)

* Loading annotations.
[[[1.7133679  0.7969153  0.49807215 ... 1.9325196  1.4344475  1.2352188 ]]

 [[1.7133679  1.016067   0.3785348  ... 1.8926739  1.3348331  1.2152958 ]]

 [[1.8329052  1.0359899  0.21915174 ... 1.7332908  1.4543704  1.2352188 ]]

 ...

 [[1.4942162  1.0758357  0.6574551  ... 1.8329052  1.6137534  1.2551416 ]]

 [[1.5539849  0.9363755  0.49807215 ... 1.7930593  1.4145247  1.6137534 ]]

 [[1.6137534  0.47814918 0.6176094  ... 1.9524424  1.4942162  1.5739077 ]]]


In [36]:
for f in os.listdir(dataset_output_dir):
    l = np.load(os.path.join(dataset_output_dir, f))
    print(l['embedding'])

[[[1.7597065  0.8136277  0.49196088 ... 1.9489222  1.4380397  1.2488239 ]]

 [[1.7029417  1.021765   0.39735317 ... 1.8921574  1.3812749  1.2488239 ]]

 [[1.7975496  1.0028435  0.2459805  ... 1.7218633  1.4380397  1.2109808 ]]

 ...

 [[1.4758828  1.0596082  0.62441194 ... 1.8353927  1.6272554  1.3245102 ]]

 [[1.5704907  0.9460788  0.5108825  ... 1.8164711  1.4001966  1.6272554 ]]

 [[1.6083338  0.5298041  0.58656883 ... 1.9678438  1.4758828  1.6083338 ]]]
