Install requirements

Import libraries

In [None]:
!pip install tensorflow tensorflow-datasets librosa

Only grab the dataset where instument == keyboard

In [None]:
# =================== Augmentation Function Library ==================
def noise_injection(audio, noise_factor = 0.001):
  '''Inject random/white noise into the audio'''
  noise = np.random.randn(len(audio))
  return audio + noise_factor * noise

def change_speed(audio, speed_factor = 2):
  '''
  Changes the speed of playback by scaling time
  speed_factor > 1      => speeds up playback
  0 < speed_factor < 1  => slows down playback
  '''
  return librosa.effects.time_stretch(audio, rate=speed_factor)

def change_speed_and_fit(audio, stretch_factor = 0.5):
  '''Changes speed similar to change_speed and fits into the same length as the input audio'''
  input_length = len(audio)
  stretched_audio = librosa.effects.time_stretch(audio.astype('float'), rate=stretch_factor)
  if len(stretched_audio) > input_length:
    return stretched_audio[:input_length]
  else:
    return np.pad(stretched_audio, (0, max(0, input_length - len(stretched_audio))), "constant")

def shift_pitch(audio, steps = 10, sr=16000):
  '''Shifts the pitch up by the specified amount of steps'''
  return librosa.effects.pitch_shift(audio, sr=sr, n_steps=steps)

def shift_time(audio, shift_factor = 0.5, random = False):
  '''Shifts the audio in time by a given factor, keeping the same length of the intial audio'''
  if random:
    shift_factor = shift_factor * 2 * (np.random.uniform() - 0.5)

  start = int(len(audio) * shift_factor)
  if (start > 0):
      return np.pad(audio,(start,0),mode='constant')[0:len(audio)]
  else:
      return np.pad(audio,(0,-start),mode='constant')[0:len(audio)]

In [29]:
import tensorflow as tf
import tensorflow_datasets as tfds
import librosa
import numpy as np

# Constants
KEYBOARD_FAMILY_LABEL = 4 # According to NSynth dataset family label
SAMPLE_RATE = 16000
TRIM_LENGTH = 3 * SAMPLE_RATE  # Trim to the first 3 seconds

# Define the processing function
def process_data(example):
    audio = example['audio']
    instrument_family = example['instrument']['family']
    pitch = example['pitch']

    # Filter keyboard samples
    is_keyboard = tf.equal(instrument_family, KEYBOARD_FAMILY_LABEL)
    # Only process the samples where is_keyboard is True
    def process_keyboard_sample(audio, pitch):
        # Trim the audio
        audio = audio[:TRIM_LENGTH]

        # Convert audio to CQT (Constant-Q Transform) using librosa
        # Set fmin to the frequency of A0 and n_bins to 88
        def compute_cqt(x):
            return np.abs(librosa.cqt(x, sr=SAMPLE_RATE, fmin=librosa.note_to_hz('A0'), n_bins=88, bins_per_octave=12))

        # Here, tf.numpy_function applies a Python function to the TensorFlow tensor
        audio = tf.numpy_function(compute_cqt, [audio], tf.float32)

        # Modify pitch
        pitch = pitch - 21
        return audio, pitch
    # Return the processed audio and pitch, only if the sample is a keyboard
    return tf.cond(is_keyboard, lambda: process_keyboard_sample(audio, pitch), lambda: (audio, pitch))

def filter_keyboard_samples(example):
    return tf.equal(example['instrument']['family'], KEYBOARD_FAMILY_LABEL)

def get_data_loader(data_split, batch_size=64):
    ds = tfds.load('nsynth', split=data_split, as_supervised=False)

    # First, filter out non-keyboard samples
    ds = ds.filter(filter_keyboard_samples)
    ds = ds.map(process_data, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.filter(lambda audio, pitch: tf.reduce_sum(tf.shape(audio)) > 0)  # Filter out empty audio results
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

# Input to model
batch_size = 64
test_loader = get_data_loader('test', batch_size)
val_loader = get_data_loader('valid', batch_size)
train_loader = get_data_loader('train', batch_size)
classes = list(range(88))

In [30]:
for audio, pitch in test_loader.take(1):
    print("Audio shape:", audio.shape)
    print("Pitch shape:", pitch.shape)

for audio, pitch in val_loader.take(1):
    print("Audio shape:", audio.shape)
    print("Pitch shape:", pitch.shape)

for audio, pitch in train_loader.take(1):
    print("Audio shape:", audio.shape)
    print("Pitch shape:", pitch.shape)

Audio shape: (64, 88, 94)
Pitch shape: (64,)
Audio shape: (64, 88, 94)
Pitch shape: (64,)
Audio shape: (64, 88, 94)
Pitch shape: (64,)


In [36]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
from google.colab import files
import dask.dataframe as dd

In [46]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

class PitchDetectionModel(nn.Module):
    def __init__(self):
        super(PitchDetectionModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=88, out_channels=32, kernel_size=5)  # Adjusted in_channels to 88
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=5)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=5)
        # self.conv4 = nn.Conv1d(in_channels=16, out_channels=64, kernel_size=1)
        # self.conv5 = nn.Conv1d(in_channels=64, out_channels=88, kernel_size=1)

    def forward(self, x):
        x = x.float()  # Convert x to float32 datatype
        x = F.relu(self.conv1(x))
        x = x.view(x.size(0), -1)  # Flatten the tensor

        # x = F.relu(self.conv2(x))
        # x = F.relu(self.conv3(x))
        # x = F.relu(self.conv4(x))
        # x = F.relu(self.conv5(x))
        # x = x.permute(0, 2, 1)

        return x

model = PitchDetectionModel()


def train_net(net, data_loader, num_epochs):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=0.0001)

    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for i, (inputs, labels) in enumerate(data_loader):
            inputs = torch.tensor(inputs.numpy(), device=device).float()
            labels = torch.tensor(labels.numpy(), device=device).long()

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Get the predictions from the maximum value
            _, predicted = torch.max(outputs, 1)

            # Total number of labels
            total_predictions += labels.size(0)

            # Total correct predictions
            correct_predictions += (predicted == labels).sum().item()

            # Calculate accuracy
            accuracy = correct_predictions / total_predictions

            print(f"Batch {i} Loss: {loss.item()} Accuracy: {accuracy * 100:.2f}%")

        # epoch_loss = running_loss / len(data_loader)
        # epoch_accuracy = correct_predictions / total_predictions
        # print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss:.4f} Accuracy: {epoch_accuracy * 100:.2f}%")

    print('Finished Training')

train_net(model, test_loader, 5)


train_net(model, test_loader, 2)



Epoch: 0
Batch 0 Loss: 7.962560176849365 Accuracy: 0.00%
Batch 1 Loss: 7.894560813903809 Accuracy: 0.00%
Batch 2 Loss: 7.7659831047058105 Accuracy: 0.52%
Batch 3 Loss: 7.764236927032471 Accuracy: 0.78%
Batch 4 Loss: 7.685639381408691 Accuracy: 0.62%
Batch 5 Loss: 7.570768356323242 Accuracy: 0.52%
Batch 6 Loss: 7.541556358337402 Accuracy: 0.45%
Batch 7 Loss: 7.313226699829102 Accuracy: 0.59%
Batch 8 Loss: 7.520594120025635 Accuracy: 0.87%
Batch 9 Loss: 7.407275676727295 Accuracy: 0.94%
Batch 10 Loss: 7.186352252960205 Accuracy: 0.85%
Batch 11 Loss: 7.287949562072754 Accuracy: 0.91%
Epoch: 1
Batch 0 Loss: 7.445340156555176 Accuracy: 0.00%
Batch 1 Loss: 7.228724002838135 Accuracy: 0.00%
Batch 2 Loss: 7.021732807159424 Accuracy: 0.00%
Batch 3 Loss: 7.178116798400879 Accuracy: 0.39%
Batch 4 Loss: 7.101716995239258 Accuracy: 0.31%
Batch 5 Loss: 7.016138076782227 Accuracy: 0.26%
Batch 6 Loss: 6.990074157714844 Accuracy: 0.22%
Batch 7 Loss: 6.760902404785156 Accuracy: 0.39%
Batch 8 Loss: 7.175