# High Performance Audio Preprocessing with tf.data and pedalboard

In [None]:
import typing
from typing import Callable
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
from IPython.display import Audio
import json
import tensorflow_datasets as tfds
import tensorflow as tf
import soxbindings as sox
import pedalboard as pb
from pedalboard import Pedalboard

%matplotlib inline

# Audio constants shared by every cell below.
SR = 22050        # sample rate handed to the Sox and Pedalboard effects
MAXVAL = 32767    # divisor used to scale the raw speech samples to floats
AUTOTUNE = tf.data.AUTOTUNE

# make pretty plots
plt.style.use('ggplot')
# all rc tweaks applied in one go; the keys are '<group>.<option>'
mpl.rcParams.update({
    'axes.labelsize': 8,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'axes.titlepad': 20,
    'axes.titlesize': 10,
    'axes.titleweight': 'normal',
    'legend.fontsize': 8,
    'figure.dpi': 300,
})


Download the LJ Speech dataset and extract only the audio data.

In [None]:
# Fetch the LJ Speech training split together with its dataset metadata.
dataset, info = tfds.load('ljspeech', split='train', download=True, with_info=True)


def _speech_to_float(example_dict):
    """Keep only the 'speech' entry of an example, as float32 divided by MAXVAL."""
    speech = tf.cast(example_dict['speech'], tf.float32)
    return speech / MAXVAL


# transform int16 audio to float32 in [-1, 1]
# NOTE(review): assumes 'speech' holds int16-range samples -- confirm
dataset = dataset.map(_speech_to_float, num_parallel_calls=AUTOTUNE)

Define the Sox function for effect preprocessing. We use the soxbindings library because it "works" in multi-threaded environments such as tf.data, in the sense that it doesn't fail; however, it doesn't actually let you take advantage of multi-threading.

In [None]:
# Sox effects to benchmark: each key names a `sox.Transformer` method and
# each value holds the keyword arguments it is called with (empty = defaults).
sox_effects = dict(
    compand=dict(),
    chorus=dict(),
    highpass=dict(frequency=100),
    lowpass=dict(frequency=8000),
    phaser=dict(),
    reverb=dict(),
)

def get_sox_effect(
    effect_type: str,
    effect_params: dict
    ) -> Callable[[tf.Tensor], np.ndarray]:
    """Create a callable that runs a single Sox effect on one audio tensor.

    Args:
        effect_type: name of the `sox.Transformer` method that adds the
            effect; it is looked up with `getattr`.
        effect_params: keyword arguments forwarded to that method.

    Returns:
        A function that converts its tensor argument with `.numpy()` and
        returns the output of `Transformer.build_array` at sample rate `SR`.
    """
    # this allows multi-threading envs
    @sox.sox_context()
    def sox_effect(y: tf.Tensor) -> np.ndarray:
        samples = y.numpy()
        # a fresh transformer holding only the requested effect, built per call
        transformer = sox.Transformer()
        add_effect = getattr(transformer, effect_type)
        add_effect(**effect_params)
        return transformer.build_array(input_array=samples, sample_rate_in=SR)
    return sox_effect


In [None]:
# Benchmark each Sox effect on its own: one complete pass over the dataset
# per effect, storing the elapsed seconds under the effect's name.
sox_results = dict()
for effect, params in sox_effects.items():
    # define pipeline for this effect (num_parallel_calls is not set here)
    sox_dataset = dataset.map(
        lambda speech: tf.py_function(
            get_sox_effect(effect, params),
            [speech],
            tf.float32)
    )
    # perf_counter is a monotonic, high-resolution clock, so the interval
    # cannot be skewed by system clock adjustments the way time.time() can
    t = time.perf_counter()
    # apply effect to each example; iterating is what makes the pipeline run
    for _ in sox_dataset:
        pass
    elapsed_time = time.perf_counter() - t
    sox_results[effect] = elapsed_time
    print(f'{effect}: {elapsed_time:.2f}s')

# persist the timings; the context manager closes (and flushes) the file
# instead of leaving an open handle behind
with open('./sox_results.json', 'w') as results_file:
    json.dump(sox_results, results_file)

In [None]:
# Names of the Pedalboard plugin classes to benchmark; each one is looked up
# on the `pb` module with `getattr` and instantiated without arguments.
pedalboard_effects = [
    'Compressor', 'Chorus', 'HighpassFilter',
    'LowpassFilter', 'Phaser', 'Reverb',
]

def get_pb_effect(
    effect_type: str
    ) -> Callable[[tf.Tensor], np.ndarray]:
    """Create a callable that runs a single Pedalboard effect on one audio tensor.

    Args:
        effect_type: name of the plugin class on the `pb` module; it is
            looked up with `getattr` and instantiated with no arguments.

    Returns:
        A function that converts its tensor argument with `.numpy()` and
        returns the plugin's output for that array at sample rate `SR`.
    """
    def pb_effect(y: tf.Tensor) -> np.ndarray:
        samples = y.numpy()
        # a fresh plugin instance is created on every call
        plugin_cls = getattr(pb, effect_type)
        plugin = plugin_cls()
        return plugin(samples, sample_rate=SR)
    return pb_effect

In [None]:
# Benchmark each Pedalboard effect on its own: one complete pass over the
# dataset per effect, storing the elapsed seconds under the effect's name.
pb_results = dict()
for effect in pedalboard_effects:
    # define pipeline for this effect, letting tf.data choose the
    # parallelism of the map
    pb_dataset = dataset.map(
        lambda speech: tf.py_function(
            get_pb_effect(effect),
            [speech],
            tf.float32),
        num_parallel_calls=AUTOTUNE
    )
    # perf_counter is a monotonic, high-resolution clock, so the interval
    # cannot be skewed by system clock adjustments the way time.time() can
    t = time.perf_counter()
    # apply effect to each example; iterating is what makes the pipeline run
    for _ in pb_dataset:
        pass
    elapsed_time = time.perf_counter() - t
    pb_results[effect] = elapsed_time
    print(f'{effect}: {elapsed_time:.2f}s')

# persist the timings; the context manager closes (and flushes) the file
# instead of leaving an open handle behind
with open('./pedalboard_results.json', 'w') as results_file:
    json.dump(pb_results, results_file)

In [None]:
# Reload the per-effect timings from disk and draw them as grouped bars.
# Context managers make sure both files are closed after reading.
with open('./pedalboard_results.json', 'r') as pb_file:
    pb_results = json.load(pb_file)
with open('./sox_results.json', 'r') as sox_file:
    sox_results = json.load(sox_file)
labels = pedalboard_effects
# bars are matched to labels by position, so this relies on both result
# dicts keeping the order in which the effects were benchmarked
pb_times = list(pb_results.values())
sox_times = list(sox_results.values())

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
# each pair of bars is centred on its tick: Pedalboard left, Sox right
rects2 = ax.bar(x + width/2, sox_times, width, label='SoxBindings')
rects1 = ax.bar(x - width/2, pb_times, width, label='Pedalboard')

ax.set_ylabel('duration in s')
ax.set_title('Transform LJ Speech with one Effect')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=50)
ax.legend()

# annotate every bar with its duration, one decimal place
ax.bar_label(rects1, padding=3, fmt='%.1f', fontsize=8)
ax.bar_label(rects2, padding=3, fmt='%.1f', fontsize=8)

fig.tight_layout()

## Transform the LJ Speech Dataset with a Signal Chain

In [None]:
# sox effect chain

def get_sox_effect_chain() -> Callable[[tf.Tensor], np.ndarray]:
    """Create a callable that runs every effect in `sox_effects` on one tensor.

    A single `sox.Transformer` is built here, with each effect added in the
    iteration order of `sox_effects`, and reused by the returned function.

    Returns:
        A function that converts its tensor argument with `.numpy()` and
        returns the output of `Transformer.build_array` at sample rate `SR`.
    """
    chain = sox.Transformer()
    for name in sox_effects:
        add_effect = getattr(chain, name)
        add_effect(**sox_effects[name])
    @sox.sox_context()
    def sox_effect(y: tf.Tensor) -> np.ndarray:
        samples = y.numpy()
        return chain.build_array(input_array=samples, sample_rate_in=SR)
    return sox_effect

# Benchmark the full Sox effect chain: one complete pass over the dataset.
sox_results = dict()
# define pipeline for this effect (num_parallel_calls is not set here)
sox_dataset = dataset.map(
    lambda speech: tf.py_function(
        get_sox_effect_chain(),
        [speech],
        tf.float32)
)
# perf_counter is a monotonic, high-resolution clock, so the interval
# cannot be skewed by system clock adjustments the way time.time() can
t = time.perf_counter()
# apply effect to each example; iterating is what makes the pipeline run
for _ in sox_dataset:
    pass
elapsed_time = time.perf_counter() - t
sox_results['sox_effect_chain'] = elapsed_time
print(f'Sox Effect Chain: {elapsed_time:.2f}s')

# persist the timing; the context manager closes (and flushes) the file
# instead of leaving an open handle behind
with open('./sox_results_chain.json', 'w') as results_file:
    json.dump(sox_results, results_file)

In [None]:
# pedalboard effect chain

def get_pb_effect_chain() -> Callable[[tf.Tensor], np.ndarray]:
    """Create a callable that runs all `pedalboard_effects` as one board.

    Each name in `pedalboard_effects` is looked up on the `pb` module and
    instantiated with no arguments; the resulting plugins are placed, in
    list order, on a single `Pedalboard` constructed with `sample_rate=SR`.

    Returns:
        A function that converts its tensor argument with `.numpy()` and
        returns the board's output for that array.
    """
    plugins = []
    for effect_type in pedalboard_effects:
        plugin_cls = getattr(pb, effect_type)
        plugins.append(plugin_cls())
    board = Pedalboard(plugins, sample_rate=SR)
    def pb_effect(y: tf.Tensor) -> np.ndarray:
        samples = y.numpy()
        return board(samples)
    return pb_effect

# Benchmark the Pedalboard effect chain (all plugins on one board): one
# complete pass over the dataset.
pb_results = dict()
# define pipeline for this effect, letting tf.data choose the
# parallelism of the map
pb_dataset = dataset.map(
    lambda speech: tf.py_function(
        get_pb_effect_chain(),
        [speech],
        tf.float32),
    num_parallel_calls=AUTOTUNE
)
# perf_counter is a monotonic, high-resolution clock, so the interval
# cannot be skewed by system clock adjustments the way time.time() can
t = time.perf_counter()
# apply effect to each example; iterating is what makes the pipeline run
for _ in pb_dataset:
    pass
elapsed_time = time.perf_counter() - t
pb_results['pedalboard_effect_chain'] = elapsed_time
print(f'Pedalboard Effect Chain: {elapsed_time:.2f}s')

# persist the timing; the context manager closes (and flushes) the file
# instead of leaving an open handle behind
with open('./pedalboard_results_chain.json', 'w') as results_file:
    json.dump(pb_results, results_file)

In [None]:
# pedalboard tf.data effect chain

# Benchmark the Pedalboard effects chained as successive tf.data map stages
# (one map per effect) instead of as a single Pedalboard.
pb_results = dict()
pb_dataset = dataset
for effect in pedalboard_effects:
    # append one map stage for this effect to the pipeline built so far
    pb_dataset = pb_dataset.map(
        lambda speech: tf.py_function(
            get_pb_effect(effect),
            [speech],
            tf.float32),
        num_parallel_calls=AUTOTUNE
    )
# perf_counter is a monotonic, high-resolution clock, so the interval
# cannot be skewed by system clock adjustments the way time.time() can
t = time.perf_counter()
# apply effect to each example; iterating is what makes the pipeline run
for _ in pb_dataset:
    pass
elapsed_time = time.perf_counter() - t
# same key as the single-board benchmark; the plotting cell reads this key
pb_results['pedalboard_effect_chain'] = elapsed_time
print(f'Pedalboard Effect Chain: {elapsed_time:.2f}s')

# persist the timing; the context manager closes (and flushes) the file
# instead of leaving an open handle behind
with open('./pedalboard_results_tfdata_chain.json', 'w') as results_file:
    json.dump(pb_results, results_file)

In [None]:
# Reload the three chain timings from disk and compare them in one bar chart.
# Context managers make sure every file is closed after reading.
with open('pedalboard_results_chain.json', 'r') as pb_chain_file:
    pb_chain = json.load(pb_chain_file)
with open('sox_results_chain.json', 'r') as sox_chain_file:
    sox_chain = json.load(sox_chain_file)
with open('pedalboard_results_tfdata_chain.json', 'r') as pb_tfdata_chain_file:
    pb_tfdata_chain = json.load(pb_tfdata_chain_file)
labels = ['Pedalboard Chain', 'SoxBindings Chain', 'Pedalboard Chain w/ tf.data']

x = [0., 0.5, 1.]  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
# one bar per variant; each gets its own legend entry
rects1 = ax.bar(x[0], pb_chain['pedalboard_effect_chain'], width, label=labels[0])
rects2 = ax.bar(x[1], sox_chain['sox_effect_chain'], width, label=labels[1])
rects3 = ax.bar(x[2], pb_tfdata_chain['pedalboard_effect_chain'], width, label=labels[2])

ax.set_ylabel('duration in s')
ax.set_title('Transform LJ Speech with Effect Chain')
# hide x ticks and tick labels: the legend already identifies the bars
ax.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
ax.legend()

# annotate every bar with its duration, one decimal place
ax.bar_label(rects1, padding=3, fmt='%.1f', fontsize=8)
ax.bar_label(rects2, padding=3, fmt='%.1f', fontsize=8)
ax.bar_label(rects3, padding=3, fmt='%.1f', fontsize=8)

fig.tight_layout()