<a href="https://colab.research.google.com/github/jeffreyroh2002/Music-Descriptify/blob/main/instr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#download your dataset using wget keyword
!wget https://zenodo.org/records/1432913/files/openmic-2018-v1.0.0.tgz?download=1
#unzip your dataset
!tar -xzvf openmic-2018-v1.0.0.tgz?download=1
#I recommend you to delete the tgz file for your disk storage
!rm openmic-2018-v1.0.0.tgz?download=1

In [None]:
#there are some requirements needed to run the codes below
!pip install madmom==0.16.1

**Load the npz file**

In [9]:
import numpy as np

#set some paths and load the npz file
AUDIO_PATH = "openmic-2018/audio/"
NPZ_PATH = "openmic-2018/openmic-2018.npz"
TRAIN_SPLIT_PATH = "openmic-2018/partitions/split01_train.csv"
TEST_SPLIT_PATH = "openmic-2018/partitions/split01_test.csv"

try:
  OPENMIC = np.load(NPZ_PATH, allow_pickle = True)
except FileNotFoundError:
  print("[Error] Cannot find the npz file! Check if you downloaded your dataset properly")

Y_true, Y_mask, sample_key = OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']

**Load the split files**

In [10]:
import pandas as pd


try:
  split_train = pd.read_csv(TRAIN_SPLIT_PATH, header = None)
  split_test = pd.read_csv(TEST_SPLIT_PATH, header = None)
except FileNotFoundError:
  print("[Error] Cannot find the split filepath! Check if you downloaded your dataset properly")

#these variables contain keys of each audio file
train_set = split_train[0].values         #total 14915 files
test_set = split_test[0].values           #total 5085 files
print(len(train_set), len(test_set))

14915 5085


In [11]:
import glob
import os

audio_filenames = glob.glob(AUDIO_PATH + "*/*.ogg")
assert len(audio_filenames) >= 1, "No audio files are found! Check if you downloaded your dataset properly"

for i, filename in enumerate(audio_filenames):
  key = filename.split('/')[-1]
  os.rename(filename, AUDIO_PATH + key)
  if i % 1000 == 0:
    print(f"{i}th file renaming completed!")

empty_dirs = glob.glob(AUDIO_PATH + "???")
print("removing empty directories...")
for dir in empty_dirs:
  os.rmdir(dir)

audio_filenames = glob.glob(AUDIO_PATH + "*.ogg")

0th file renaming completed!
1000th file renaming completed!
2000th file renaming completed!
3000th file renaming completed!
4000th file renaming completed!
5000th file renaming completed!
6000th file renaming completed!
7000th file renaming completed!
8000th file renaming completed!
9000th file renaming completed!
10000th file renaming completed!
11000th file renaming completed!
12000th file renaming completed!
13000th file renaming completed!
14000th file renaming completed!
15000th file renaming completed!
16000th file renaming completed!
17000th file renaming completed!
18000th file renaming completed!
19000th file renaming completed!
removing empty directories...


In [12]:
#define the processor
# try:
from madmom.processors import ParallelProcessor, SequentialProcessor
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor

# except ImportError:
#   raise ImportError
#   print("Go to the file [/usr/local/lib/{python_version}/dist-packages/madmom/processors.py] and change collections -> collections.abc in line 23")

FPS = 100
FFT_SIZE = 2048
NUM_BANDS = 12

class PreProcessor(SequentialProcessor):
  def __init__(self, frame_size = FFT_SIZE, num_bands = NUM_BANDS, log = np.log, add = 1e-6, fps = FPS):
    #The signalProcessor class is a basic signal processor
    #it works like a librosa.load function
    sig = SignalProcessor(num_channels = 1, sample_rate = 44100)
    frames = FramedSignalProcessor(frame_size = frame_size, fps = fps)
    stft = ShortTimeFourierTransformProcessor()
    filt = FilteredSpectrogramProcessor(num_bands = num_bands)
    spec = LogarithmicSpectrogramProcessor(log = log, add = add)
    super(PreProcessor, self).__init__((sig, frames, stft, filt, spec, np.array))
    self.fps = fps

In [13]:
idx_train = []
idx_test = []
for i, key in enumerate(sample_key):
  if key in train_set:
    idx_train.append(i)
  else:
    idx_test.append(i)

idx_valid = idx_test[:int(len(idx_test) * 0.1)]
idx_test = idx_test[int(len(idx_test) * 0.1):]

print(len(idx_train), len(idx_valid), len(idx_test))

14915 508 4577


In [14]:
import tensorflow as tf
from tensorflow import keras

def train_generator():
  preprocessor = PreProcessor()
  label_num = 20
  for index in idx_train:
    key = sample_key[index]
    audio_filename = AUDIO_PATH + key + ".ogg"
    s = preprocessor(audio_filename)
    y_true = Y_true[index]
    y_mask = Y_mask[index]
    y = np.where(y_mask, y_true, -1)

    # (tf.float64, tf.float64), ((1000, 81, 1,), (20,))
    x = tf.constant(s[:1000][:], dtype = tf.float64)
    x = tf.expand_dims(x, axis = -1)
    y = tf.constant(y, dtype = tf.float64)

    yield (x, y)

def valid_generator():
  preprocessor = PreProcessor()
  label_num = 20
  X = []
  Y = []
  for i, index in enumerate(idx_valid):
    if i % 100 == 0:
      print(f"{i}th  file completed")
    key = sample_key[index]
    audio_filename = AUDIO_PATH + key + ".ogg"
    s = preprocessor(audio_filename)
    y_true = Y_true[index]
    y_mask = Y_mask[index]
    y = np.where(y_mask, y_true, -1)

    # (tf.float64, tf.float64), ((1000, 81, 1,), (20,))
    x = tf.constant(s[:1000][:], dtype = tf.float64)
    x = tf.expand_dims(x, axis = -1)
    x = tf.expand_dims(x, axis = 0)
    y = tf.constant(y, dtype = tf.float64)
    y = tf.expand_dims(y, axis = 0)
    X.append(x)
    Y.append(y)

  X = tf.stack(X)
  Y = tf.stack(Y)

  return X, Y

In [15]:
def residual_block(x, i, activation, num_filters, kernel_size, padding, dropout_rate=0, name=''):
    # name of the layer
    name = name + '_dilation_%d' % i
    # 1x1 conv. of input (so it can be added as residual)
    res_x = keras.layers.Conv1D(num_filters, 1, padding='same')(x)
    # two dilated convolutions, with dilation rates of i and 2i
    conv_1 = keras.layers.Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        dilation_rate=i,
        padding=padding,
    )(x)
    conv_2 = keras.layers.Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        dilation_rate=i * 2,
        padding=padding,
    )(x)
    concat = keras.layers.concatenate([conv_1, conv_2])
    x = keras.layers.Activation(activation)(concat)
    x = keras.layers.SpatialDropout1D(dropout_rate)(x)
    x = keras.layers.Conv1D(num_filters, 1, padding='same')(x)
    return keras.layers.add([res_x, x]), x


class TCN:
    def __init__(
        self,
        num_filters=20,
        kernel_size=5,
        dilations=[1, 2, 4, 8, 16, 32, 64, 128, 256],
        activation='elu',
        padding='same',
        dropout_rate=0.15,
        name='tcn',
    ):
        self.name = name
        self.dropout_rate = dropout_rate
        self.activation = activation
        self.dilations = dilations
        self.kernel_size = kernel_size
        self.num_filters = num_filters
        self.padding = padding

        if padding != 'causal' and padding != 'same':
            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")

    def __call__(self, inputs):
        x = inputs
        # gather skip connections, each having a different context
        skip_connections = []
        # build the TCN models
        for i in self.dilations:
            # feed the output of the previous layer into the next layer
            # increase dilation rate for each consecutive layer
            x, skip_out = residual_block(
                x, i, self.activation, 20, self.kernel_size, self.padding, self.dropout_rate, name=self.name
            )
            # collect skip connection
            skip_connections.append(skip_out)
        # activate the output of the TCN stack
        x = keras.layers.Activation(self.activation, name=self.name + '_activation')(x)
        # merge the skip connections by simply adding them
        skip = keras.layers.add(skip_connections, name=self.name + '_merge_skip_connections')
        return x, skip

def create_model(dropout_rate = 0.15, num_filters = 20):
  input = keras.layers.Input(shape = (1000, 81, 1))
  conv_1 = keras.layers.Conv2D(filters = num_filters, kernel_size = (3, 3), padding = "valid",)(input)
  conv_1 = keras.layers.Activation(activation = "elu")(conv_1)
  conv_1 = keras.layers.MaxPooling2D((1, 3))(conv_1)
  conv_1 = keras.layers.Dropout(rate = dropout_rate)(conv_1)

  conv_2 = keras.layers.Conv2D(filters = num_filters, kernel_size = (1, 10), padding = "valid",)(conv_1)
  conv_2 = keras.layers.Activation(activation = "elu")(conv_2)
  conv_2 = keras.layers.MaxPooling2D((1, 3))(conv_2)
  conv_2 = keras.layers.Dropout(rate = dropout_rate)(conv_2)

  conv_3 = keras.layers.Conv2D(filters = num_filters, kernel_size = (3, 3), padding = "valid",)(conv_2)
  conv_3 = keras.layers.Activation(activation = "elu")(conv_3)
  conv_3 = keras.layers.MaxPooling2D((1, 3))(conv_3)
  conv_3 = keras.layers.Dropout(rate = dropout_rate)(conv_3)

  x = keras.layers.Reshape((-1, num_filters))(conv_3)
  _, skip = TCN()(x)

  instr = keras.layers.Dropout(dropout_rate, name='output_dropout')(skip)
  instr = keras.layers.GlobalAveragePooling1D(name='output_global_average_pooling')(instr)
  instr = keras.layers.GaussianNoise(dropout_rate, name='output_noise')(instr)
  instr = keras.layers.Dense(20, name='output_dense')(instr)
  instr = keras.layers.Activation('softmax', name='output')(instr)

  return keras.Model(input, instr)



In [16]:
def custom_BCE(y_true, y_pred):
  mask = tf.math.not_equal(y_true, -1)
  y_true_masked = tf.boolean_mask(y_true, mask)
  y_pred_masked = tf.boolean_mask(y_pred, mask)
  y_pred_masked = tf.cast(y_pred_masked, dtype = tf.float64)
  y_true_masked = tf.cast(y_true_masked > 0.5, tf.float64)
  loss = -tf.reduce_mean(y_true_masked * tf.math.log(y_pred_masked + 1e-6) + (1 - y_true_masked) * tf.math.log(1 - y_pred_masked + 1e-6))
  return loss

In [50]:
class custom_ACC(tf.keras.metrics.Metric):
  def __init__(self):
    super().__init__()
    self.correct = self.add_weight(initializer = "zeros", dtype = tf.int32, name = "correct")
    self.total = self.add_weight(initializer = "zeros", dtype = tf.int32, name = "total")

  def update_state(self, y_true, y_pred, sample_weight = None):
    mask = tf.math.not_equal(y_true, -1)

    y_true_masked = tf.boolean_mask(y_true, mask)
    y_pred_masked = tf.boolean_mask(y_pred, mask)
    y_true_masked = tf.cast(y_true_masked > 0.5, tf.bool)
    y_pred_masked = tf.cast(y_pred_masked > 0.5, tf.bool)

    correct = tf.math.equal(y_true_masked, y_pred_masked)
    correct = tf.cast(correct, dtype = tf.int32)

    self.correct.assign_add(tf.reduce_sum(correct))
    self.total.assign_add(tf.size(correct))

  def result(self):
    return self.correct / self.total

  def reset_state(self):
    self.correct.assign(0)
    self.total.assign(0)

In [51]:
train_dataset = tf.data.Dataset.from_generator(train_generator, (tf.float64, tf.float64), ((1000, 81, 1,), (20,)))
valid_dataset = valid_generator()
train_dataset = train_dataset.batch(1)

In [None]:
model = create_model()
model.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.0001),
              loss = custom_BCE,
              metrics = [custom_ACC()])
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("instr_model.h5", save_best_only = True,
                                                   monitor = custom_ACC(),
                                                   mode = "min", verbose = 1)
earlystopping_cb = tf.keras.callbacks.EarlyStopping(patience = 10, min_delta = 1e-2)
model.fit(train_dataset, validation_data = valid_dataset, epochs = 150, callbacks = [checkpoint_cb, earlystopping_cb])

Epoch 1/150
  14915/Unknown - 3584s 239ms/step - loss: 1.3472 - custom_acc_26: 0.5888



Epoch 2/150
 2713/14915 [====>.........................] - ETA: 49:32 - loss: 1.2390 - custom_acc_26: 0.5765