<a href="https://colab.research.google.com/github/jeffreyroh2002/Music-Descriptify/blob/main/tempo_estimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ghp-import
!pip install mirdata>=0.3.0
!pip install librosa
!pip install cython
!pip install madmom
!pip install mir_eval
!pip install tqdm

import mirdata

!git clone https://github.com/TempoBeatDownbeat/gtzan_tempo_beat
!tar -xzvf drive/MyDrive/instr_classification/genres.tar.gz

Collecting ghp-import
  Downloading ghp_import-2.1.0-py3-none-any.whl (11 kB)
Installing collected packages: ghp-import
Successfully installed ghp-import-2.1.0
Collecting madmom
  Downloading madmom-0.16.1.tar.gz (20.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: madmom
  Building wheel for madmom (setup.py) ... [?25l[?25hdone
  Created wheel for madmom: filename=madmom-0.16.1-cp310-cp310-linux_x86_64.whl size=21402796 sha256=c0e980f6cc8764a525b7c2955ef7d37e3832f4af348766a51ea6e77ffd17a632
  Stored in directory: /root/.cache/pip/wheels/87/18/20/105f9248e5f504f5ab190338516558cacaf6d5d7fadd5e7947
Successfully built madmom
Installing collected packages: madmom
Successfully installed madmom-0.16.1
Cloning into 'gtzan_tempo_beat'...
remote: Enumerating objects: 1936, done.[K
remote: Counting objects: 100% (1936/1

In [7]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import (
    Activation,
    Dense,
    Input,
    Conv1D,
    Conv2D,
    MaxPooling2D,
    Reshape,
    Dropout,
    SpatialDropout1D,
    GaussianNoise,
    GlobalAveragePooling1D,
)
import glob
import madmom
import numpy as np
import librosa
import random
from keras.utils import Sequence

In [25]:
wav_files = glob.glob("genres/*/*.wav")
random.shuffle(wav_files)
train_files = wav_files[:int(0.8 * len(wav_files))]
valid_files = wav_files[int(0.8 * len(wav_files)): int(0.9 * len(wav_files))]
test_files = wav_files[int(0.9 * len(wav_files)):]
tempo_files = glob.glob("gtzan_tempo_beat/tempo/*.bpm")
key_list = [filename.split('/')[-1] for filename in wav_files]
print(key_list)
train_keys = key_list[:int(0.8 * len(key_list))]
valid_keys = key_list[int(0.8 * len(key_list)):int(0.9 * len(key_list))]
test_keys = key_list[int(0.9 * len(key_list)):]
print(len(train_keys), len(valid_keys))

['country.00098.wav', 'classical.00066.wav', 'disco.00071.wav', 'metal.00017.wav', 'reggae.00085.wav', 'jazz.00096.wav', 'reggae.00042.wav', 'country.00021.wav', 'rock.00035.wav', 'rock.00025.wav', 'jazz.00033.wav', 'rock.00010.wav', 'rock.00044.wav', 'disco.00027.wav', 'reggae.00078.wav', 'blues.00050.wav', 'classical.00063.wav', 'country.00047.wav', 'classical.00053.wav', 'metal.00063.wav', 'jazz.00094.wav', 'country.00003.wav', 'pop.00086.wav', 'country.00042.wav', 'metal.00029.wav', 'metal.00070.wav', 'jazz.00006.wav', 'pop.00094.wav', 'rock.00036.wav', 'pop.00040.wav', 'classical.00085.wav', 'blues.00005.wav', 'reggae.00082.wav', 'blues.00032.wav', 'classical.00034.wav', 'hiphop.00040.wav', 'jazz.00069.wav', 'blues.00034.wav', 'rock.00071.wav', 'jazz.00046.wav', 'disco.00033.wav', 'disco.00087.wav', 'reggae.00002.wav', 'rock.00080.wav', 'pop.00096.wav', 'classical.00024.wav', 'country.00064.wav', 'classical.00026.wav', 'metal.00077.wav', 'jazz.00095.wav', 'hiphop.00087.wav', 'blue

In [23]:
s = madmom.audio.Signal(*librosa.load(wav_files[0]))
print(s.shape)
processor = PreProcessor()
processor(s).shape

(661794,)


(3002, 81)

In [43]:
class DataSequence(Sequence):
  def __init__(self, key_list, pre_processor = PreProcessor(), num_tempo_bins= 250):
    self.x = {}
    self.tempo = {}
    self.ids = []
    for i, key in enumerate(key_list):
      genre = key.split('.')[-3]
      audio_filepath = "genres/" + genre + "/" + key
      tempo_filepath = "gtzan_tempo_beat/tempo/gtzan_" + genre + "_" + key.split(".")[1] + ".bpm"
      try:
        if i % 100 == 0:
          print(f"{i}th file processing...")
        s = madmom.audio.Signal(*librosa.load(audio_filepath))
        with open(tempo_filepath, "r") as f:
          tempo = float(f.read())
          if tempo >= 250:
            continue
        tempo = int(round(tempo))
        tempo = keras.utils.to_categorical(tempo, num_classes=num_tempo_bins, dtype='float32')
        tempo = tf.constant(tempo)
        tempo = tf.expand_dims(tempo, axis = 0)
        self.tempo[key] = tempo

        x = pre_processor(s)
        x = tf.constant(x, dtype = tf.float64)
        x = tf.expand_dims(x, axis = 0)
        x = tf.expand_dims(x, axis = -1)
        self.x[key] = x

        self.ids.append(key)

      except:
        continue

      assert len(self.x) == len(self.tempo) == len(self.ids)

  def __len__(self):
    return len(self.ids)

  def __getitem__(self, idx):
    if isinstance(idx, int):
      idx = self.ids[idx]
      x = self.x[idx]
      y = self.tempo[idx]
      return x, y

In [44]:
train = DataSequence(key_list = train_keys)
valid = DataSequence(key_list = valid_keys)

0th file processing...
100th file processing...
200th file processing...
300th file processing...
400th file processing...
500th file processing...
600th file processing...
700th file processing...
0th file processing...


In [21]:
from madmom.processors import ParallelProcessor, SequentialProcessor
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor
import numpy as np

FPS = 100
FFT_SIZE = 2048
NUM_BANDS = 12

class PreProcessor(SequentialProcessor):
  def __init__(self, frame_size = FFT_SIZE, num_bands = NUM_BANDS, log = np.log, add = 1e-6, fps = FPS):
    #The signalProcessor class is a basic signal processor
    #it works like a librosa.load function
    sig = SignalProcessor(num_channels = 1, sample_rate = 44100)
    frames = FramedSignalProcessor(frame_size = frame_size, fps = fps)
    stft = ShortTimeFourierTransformProcessor()
    filt = FilteredSpectrogramProcessor(num_bands = num_bands)
    spec = LogarithmicSpectrogramProcessor(log = log, add = add)
    super(PreProcessor, self).__init__((sig, frames, stft, filt, spec, np.array))
    self.fps = fps

In [27]:
def residual_block(x, i, activation, num_filters, kernel_size, padding, dropout_rate=0, name=''):
    # name of the layer
    name = name + '_dilation_%d' % i
    # 1x1 conv. of input (so it can be added as residual)
    res_x = Conv1D(num_filters, 1, padding='same', name=name + '_1x1_conv_residual')(x)
    # two dilated convolutions, with dilation rates of i and 2i
    conv_1 = Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        dilation_rate=i,
        padding=padding,
        name=name + '_dilated_conv_1',
    )(x)
    conv_2 = Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        dilation_rate=i * 2,
        padding=padding,
        name=name + '_dilated_conv_2',
    )(x)
    # concatenate the output of the two dilations
    concat = keras.layers.concatenate([conv_1, conv_2], name=name + '_concat')
    # apply activation function
    x = Activation(activation, name=name + '_activation')(concat)
    # apply spatial dropout
    x = SpatialDropout1D(dropout_rate, name=name + '_spatial_dropout_%f' % dropout_rate)(x)
    # 1x1 conv. to obtain a representation with the same size as the residual
    x = Conv1D(num_filters, 1, padding='same', name=name + '_1x1_conv')(x)
    # add the residual to the processed data and also return it as skip connection
    return keras.layers.add([res_x, x], name=name + '_merge_residual'), x


class TCN:
    def __init__(
        self,
        num_filters=20,
        kernel_size=5,
        dilations=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024],
        activation='elu',
        padding='same',
        dropout_rate=0.15,
        name='tcn',
    ):
        self.name = name
        self.dropout_rate = dropout_rate
        self.activation = activation
        self.dilations = dilations
        self.kernel_size = kernel_size
        self.num_filters = num_filters
        self.padding = padding

        if padding != 'causal' and padding != 'same':
            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")

    def __call__(self, inputs):
        x = inputs
        # gather skip connections, each having a different context
        skip_connections = []
        # build the TCN models
        for i, num_filters in zip(self.dilations, self.num_filters):
            # feed the output of the previous layer into the next layer
            # increase dilation rate for each consecutive layer
            x, skip_out = residual_block(
                x, i, self.activation, num_filters, self.kernel_size, self.padding, self.dropout_rate, name=self.name
            )
            # collect skip connection
            skip_connections.append(skip_out)
        # activate the output of the TCN stack
        x = Activation(self.activation, name=self.name + '_activation')(x)
        # merge the skip connections by simply adding them
        skip = keras.layers.add(skip_connections, name=self.name + '_merge_skip_connections')
        return x, skip

In [28]:
def create_model(num_filters=20, num_dilations=11, kernel_size=5, activation='elu', dropout_rate=0.15):
    # input layer
    input_layer = Input(shape=(3000, 81, 1))

    # stack of 3 conv layers, each conv, activation, max. pooling & dropout
    conv_1 = Conv2D(num_filters, (3, 3), padding='valid', name='conv_1_conv')(input_layer)
    conv_1 = Activation(activation, name='conv_1_activation')(conv_1)
    conv_1 = MaxPooling2D((1, 3), name='conv_1_max_pooling')(conv_1)
    conv_1 = Dropout(dropout_rate, name='conv_1_dropout')(conv_1)

    conv_2 = Conv2D(num_filters, (1, 10), padding='valid', name='conv_2_conv')(conv_1)
    conv_2 = Activation(activation, name='conv_2_activation')(conv_2)
    conv_2 = MaxPooling2D((1, 3), name='conv_2_max_pooling')(conv_2)
    conv_2 = Dropout(dropout_rate, name='conv_2_dropout')(conv_2)

    conv_3 = Conv2D(num_filters, (3, 3), padding='valid', name='conv_3_conv')(conv_2)
    conv_3 = Activation(activation, name='conv_3_activation')(conv_3)
    conv_3 = MaxPooling2D((1, 3), name='conv_3_max_pooling')(conv_3)
    conv_3 = Dropout(dropout_rate, name='conv_3_dropout')(conv_3)

    # reshape layer to reduce dimensions
    x = Reshape((-1, num_filters), name='tcn_input_reshape')(conv_3)

    # TCN layers
    dilations = [2 ** i for i in range(num_dilations)]
    tcn, skip = TCN(
        num_filters=[num_filters] * len(dilations),
        kernel_size=kernel_size,
        dilations=dilations,
        activation=activation,
        padding='same',
        dropout_rate=dropout_rate,
    )(x)

    # output layers; beats & downbeats use TCN output, tempo the skip connections

    tempo = Dropout(dropout_rate, name='tempo_dropout')(skip)
    tempo = GlobalAveragePooling1D(name='tempo_global_average_pooling')(tempo)
    tempo = GaussianNoise(dropout_rate, name='tempo_noise')(tempo)
    tempo = Dense(250, name='tempo_dense')(tempo)
    tempo = Activation('softmax', name='tempo')(tempo)

    # instantiate a Model and return it
    return Model(input_layer, outputs=tempo)

In [None]:
#train_dataset = tf.data.Dataset.from_generator(train_dataset_generator, (tf.float64, tf.float64), ((3000, 81, 1), (250)))
#train_dataset = train_dataset.batch(1, drop_remainder = True)
#valid_dataset = validation_dataset_generator()

tempo_model = keras.models.load_model("model.h5")
tempo_model.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.00005),
                    loss = keras.losses.BinaryCrossentropy(),
                    metrics = ["accuracy", keras.metrics.CategoricalAccuracy()])

checkpoint_cb = keras.callbacks.ModelCheckpoint("model.h5")
early_stopping_cb = keras.callbacks.EarlyStopping(min_delta = 1e-3, patience = 50)
tempo_model.fit(train, epochs = 150, shuffle = True,
                validation_data = valid, callbacks = [checkpoint_cb, early_stopping_cb])
tempo_model.save("model.h5")

In [49]:
tempo_model = keras.models.load_model("model.h5")
tempo_model.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.000005),
                    loss = keras.losses.BinaryCrossentropy(),
                    metrics = ["accuracy"])

checkpoint_cb = keras.callbacks.ModelCheckpoint("model.h5")
early_stopping_cb = keras.callbacks.EarlyStopping(min_delta = 1e-3, patience = 50)
tempo_model.fit(train, epochs = 50, shuffle = True,
                validation_data = valid, callbacks = [checkpoint_cb, early_stopping_cb])
tempo_model.save("model.h5")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50

KeyboardInterrupt: 

In [51]:
pre_processor = PreProcessor()
for key in test_keys:
  genre = key.split('.')[-3]
  audio_filepath = "genres/" + genre + "/" + key
  tempo_filepath = "gtzan_tempo_beat/tempo/gtzan_" + genre + "_" + key.split(".")[1] + ".bpm"
  s = madmom.audio.Signal(*librosa.load(audio_filepath))
  x = pre_processor(s)
  x = tf.constant(x, dtype = tf.float64)[:3000][:]
  x = tf.expand_dims(x, axis = 0)
  x = tf.expand_dims(x, axis = -1)
  with open(tempo_filepath, "r") as f:
    tempo = f.read()
  prediction = tempo_model.predict(x)
  prediction = tf.argmax(prediction, axis = -1).numpy()
  print("prediction: ", prediction, "true value: ", tempo)

prediction:  [116] true value:  3.843999999999999773e+01

prediction:  [73] true value:  1.449799999999999898e+02

prediction:  [59] true value:  8.064000000000000057e+01

prediction:  [132] true value:  1.316299999999999955e+02

prediction:  [105] true value:  1.055699999999999932e+02

prediction:  [95] true value:  9.504999999999999716e+01

prediction:  [125] true value:  6.199000000000000199e+01

prediction:  [118] true value:  1.174399999999999977e+02

prediction:  [100] true value:  9.679999999999999716e+01

prediction:  [140] true value:  1.423600000000000136e+02

prediction:  [100] true value:  1.002900000000000063e+02

prediction:  [131] true value:  1.309300000000000068e+02

prediction:  [117] true value:  1.169500000000000028e+02

prediction:  [73] true value:  1.423899999999999864e+02

prediction:  [158] true value:  7.948000000000000398e+01

prediction:  [118] true value:  2.376800000000000068e+02

prediction:  [97] true value:  6.437000000000000455e+01

prediction:  [125] 