Install requirements

Import libraries

In [None]:
!pip install tensorflow tensorflow-datasets librosa

Only grab the samples whose instrument family == keyboard

In [51]:
import tensorflow as tf
import tensorflow_datasets as tfds
import librosa
import numpy as np

# Dataset constants
SAMPLE_RATE = 16_000  # audio samples per second
KEYBOARD_FAMILY_LABEL = 4  # instrument-family id for "keyboard" in the NSynth labels
TRIM_LENGTH = SAMPLE_RATE * 3  # samples kept per clip: the first 3 seconds

def process_data(example):
    """Turn one raw dataset example into an ``(audio, pitch)`` pair.

    Keyboard-family examples have their audio cut to the first ``TRIM_LENGTH``
    samples and their pitch shifted down by 21, so a pitch of 21 becomes 0.
    Keyboard examples whose pitch is below 21 are replaced by a sentinel pair
    (zero-length audio, pitch ``-1``) that a later filter can drop.
    Examples of any other family are returned untouched.

    NOTE(review): no upper bound is applied to the shifted pitch -- confirm it
    always fits the label range expected downstream.

    Args:
        example: feature dict providing ``'audio'``, ``'pitch'`` and
            ``['instrument']['family']``.

    Returns:
        Tuple ``(audio, pitch)`` as described above.
    """
    waveform = example['audio']
    raw_pitch = example['pitch']
    family = example['instrument']['family']

    def _keyboard_branch():
        if raw_pitch >= 21:
            # Valid pitch: keep the trimmed clip and re-base the pitch at 0.
            return waveform[:TRIM_LENGTH], raw_pitch - 21
        else:
            # Too low to map to a non-negative label: emit the removal sentinel.
            return tf.zeros((0,)), tf.constant(-1, dtype=tf.int64)

    def _passthrough_branch():
        return waveform, raw_pitch

    return tf.cond(
        tf.equal(family, KEYBOARD_FAMILY_LABEL),
        _keyboard_branch,
        _passthrough_branch,
    )

def filter_keyboard_samples(example):
    """Filter predicate: true when the example's instrument family is keyboard."""
    family = example['instrument']['family']
    return tf.equal(family, KEYBOARD_FAMILY_LABEL)

def filter_invalid_samples(audio, pitch):
    """Filter predicate: true for samples that were not marked for removal.

    A sample is kept only when its audio is non-empty and its pitch is not the
    ``-1`` sentinel.

    Args:
        audio: audio tensor of one sample.
        pitch: scalar pitch label of the same sample.

    Returns:
        Scalar boolean tensor.
    """
    has_audio = tf.size(audio) > 0
    has_valid_pitch = tf.not_equal(pitch, -1)
    # Explicit tensor op: Python `and` on tensors only works when AutoGraph
    # rewrites it, so it breaks if this predicate is traced without AutoGraph.
    return tf.logical_and(has_audio, has_valid_pitch)

def get_data_loader(data_split, batch_size=64, num_batches=None):
    """Build a batched dataset of keyboard ``(audio, pitch)`` pairs from NSynth.

    Pipeline: load the split, keep keyboard-family examples, trim the audio and
    re-base the pitch, drop the samples marked invalid, batch, optionally cap
    the number of batches, then prefetch.

    Args:
        data_split: split name passed to ``tfds.load`` (e.g. ``'train'``).
        batch_size: number of samples per batch.
        num_batches: if truthy, keep only the first ``num_batches`` batches;
            ``None`` (or 0) keeps every batch.

    Returns:
        The resulting dataset, yielding ``(audio, pitch)`` batches.
    """
    ds = tfds.load('nsynth', split=data_split, as_supervised=False)

    # Filter on family before mapping so process_data only runs on kept examples.
    ds = ds.filter(filter_keyboard_samples)
    ds = ds.map(process_data, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.filter(filter_invalid_samples)
    ds = ds.batch(batch_size)
    if num_batches:
        ds = ds.take(num_batches)
    # Same AUTOTUNE symbol as the map() call above; the `experimental` alias is deprecated.
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds

In [50]:
# Build the loaders that feed the model
batch_size = 64
train_loader = get_data_loader('train', batch_size=batch_size, num_batches=40)
val_loader = get_data_loader('valid', batch_size=batch_size, num_batches=6)
test_loader = get_data_loader('test', batch_size=batch_size, num_batches=6)
classes = [*range(88)]  # label ids 0..87

# Peek at the pitch labels of the first five training batches
for audio, pitch in train_loader.take(5):
    print(pitch)

tf.Tensor(
[85 10 60 33 48 84 21 54 13 33  0  6 49 52 56 17  4 74  6 40 63 20 44 41
  7 22 67 64  8 78 16 48 14 29 87 32 70 84 65 66 30 56 28 85 47 33 24 49
 81 18 21 51 34 61 36  8 10 45 61 70 74 33 57 52], shape=(64,), dtype=int64)
tf.Tensor(
[42 16 62 72 38 63 72 58 35 40 68 67 22  3 47 57 68 77 50 28 45 46 33 65
 30 22 10 74  5 16 81 52 70 27 58 49 17 14 52 19 19 84 15 86 67 16 11 65
 25 55 53 15 24 72 73 51 37  6 76 24 72 13 25 41], shape=(64,), dtype=int64)
tf.Tensor(
[79 31 60 45 83 38 28 25 56  9 38 43 52 86 25 57 16 27 25 25 21 76 59 43
 46 64 43 13 85 26  7 36 54  6 19 39 23  2 34 21 40 54 30 65 55 14 28 49
 86 22 16 54 35 34 70 67 19 30 76 26 31 22 69 59], shape=(64,), dtype=int64)
tf.Tensor(
[59 56 43 62 74 17 15 56 73 67 34 10 73 48 35 41 42 20 67 85 45 17 55 26
  0 16 22 24 87 21 38 70 25 44 71 43 15  3 26 25 52 39 59 17 19 59 41 86
 55 15 13 79 69 42 33 63 32 16 66 49 61 61 47 65], shape=(64,), dtype=int64)
tf.Tensor(
[ 9 45 68 46 39 45  0 19 35 67 18 82 22 74 23 22 68 1

In [49]:
# Inspect the first sample of the first training batch: amplitude extremes and pitch label
for audio, pitch in train_loader.take(1):
    first_sample_audio, first_sample_pitch = audio[0], pitch[0]

    # Largest and smallest amplitude in the clip
    max_value = tf.reduce_max(first_sample_audio)
    min_value = tf.reduce_min(first_sample_audio)

    # Peak-to-peak spread (max - min)
    range_value = max_value - min_value

    print(
        f"First sample audio max value: {max_value.numpy()}",
        f"First sample audio min value: {min_value.numpy()}",
        f"First sample audio range: {range_value.numpy()}",
        f"First sample pitch: {first_sample_pitch.numpy()}",
        sep="\n",
    )

First sample audio max value: 0.46275418996810913
First sample audio min value: -0.46886491775512695
First sample audio range: 0.9316191077232361
First sample pitch: 85


In [47]:
# get number of batches in each loader
def get_dataset_length(data_loader):
    """Return how many items one full pass over ``data_loader`` yields.

    The loader is iterated to exhaustion, so this costs a complete pass.
    """
    return sum(1 for _ in data_loader)

# Count the batches in each loader (one full pass over each)
test_loader_length, val_loader_length, train_loader_length = map(
    get_dataset_length, (test_loader, val_loader, train_loader)
)

print(
    f"Train loader length: {train_loader_length}",
    f"Validation loader length: {val_loader_length}",
    f"Test loader length: {test_loader_length}",
    sep="\n",
)

Train loader length: 40
Validation loader length: 6
Test loader length: 6


In [48]:
# get number of samples in each loader
def get_dataset_sample_count(data_loader):
    """Total the leading-axis sizes of every audio batch in ``data_loader``.

    Each item of the loader must unpack into ``(audio, pitch)``. The loader is
    iterated to exhaustion. An empty loader gives the plain integer ``0``;
    otherwise the result is the accumulated sum of ``tf.shape(audio)[0]``
    values.
    """
    # tf.shape(...)[0] is the batch dimension, assuming audio is [batch_size, features].
    return sum(tf.shape(batch_audio)[0] for batch_audio, _ in data_loader)

# Count the samples in each loader (one full pass over each)
test_samples_count, val_samples_count, train_samples_count = map(
    get_dataset_sample_count, (test_loader, val_loader, train_loader)
)

print(
    f"Train loader samples: {train_samples_count}",
    f"Validation loader samples: {val_samples_count}",
    f"Test loader samples: {test_samples_count}",
    sep="\n",
)

Train loader samples: 2560
Validation loader samples: 384
Test loader samples: 384
