<a href="https://colab.research.google.com/github/jeffreyroh2002/Music-Descriptify/blob/main/tempo_estimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ghp-import
!pip install mirdata>=0.3.0
!pip install librosa
!pip install cython
!pip install madmom
!pip install mir_eval
!pip install tqdm

In [None]:
# `collections` is part of the Python standard library and is not published on
# PyPI, so there is nothing to install here (pip fails with "No matching
# distribution found for collections").

[31mERROR: Could not find a version that satisfies the requirement collections (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for collections[0m[31m
[0m

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import (
    Activation,
    Dense,
    Input,
    Conv1D,
    Conv2D,
    MaxPooling2D,
    Reshape,
    Dropout,
    SpatialDropout1D,
    GaussianNoise,
    GlobalAveragePooling1D,
)

**Tempo, Beat and Downbeat Estimation**

- Downloading the GTZAN dataset

In [None]:
import mirdata

# Alternative (disabled): work with the small 'mini' version of the dataset,
# fetched and validated through mirdata itself.
# gtzan = mirdata.initialize('gtzan_genre', version = 'mini')
# gtzan.download()
# gtzan.validate()
# len(gtzan.track_ids)

# Obtain a copy of the full GTZAN archive from a Google Drive mirror (faster
# access) and store it under /root/mir_datasets/gtzan_genre/gtzan_genre/.
# NOTE(review): this path presumably matches mirdata's default data home on
# Colab, so that the download() call below can reuse the archive -- confirm.
!pip install gdown
!mkdir -p /root/mir_datasets/gtzan_genre/gtzan_genre/
# NOTE(review): newer gdown releases report `--id` as deprecated -- confirm
# against the installed gdown version.
!gdown --id 1cwb2vAKryAYqkP2D86bpO6mbVlQmVhjN --output /root/mir_datasets/gtzan_genre/gtzan_genre/genres.tar.gz
# Initialise the full dataset (i.e. without version='mini').
gtzan = mirdata.initialize('gtzan_genre')
# Fetch whatever mirdata still needs for this dataset.
gtzan.download()
# Displayed as the cell output: number of track ids in the dataset.
len(gtzan.track_ids)

Downloading...
From: https://drive.google.com/uc?id=1cwb2vAKryAYqkP2D86bpO6mbVlQmVhjN
To: /root/mir_datasets/gtzan_genre/gtzan_genre/genres.tar.gz
100% 1.23G/1.23G [00:14<00:00, 84.8MB/s]


632kB [00:00, 1.41MB/s]


1000

- Define dataset splits

In [None]:
from sklearn.model_selection import train_test_split

# Load all track objects (a mapping from track id to track), then split the
# track ids 80/20 into train and test with a fixed seed for reproducibility.
tracks = gtzan.load_tracks()
train_files, test_files = train_test_split(
    list(tracks),
    test_size=0.2,
    random_state=42,
)

- Audio pre-processing

In [None]:
from madmom.processors import ParallelProcessor, SequentialProcessor
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor
import numpy as np

# Defaults for the PreProcessor defined below.
# Frame rate handed to FramedSignalProcessor as `fps`.
FPS = 100
# Frame size handed to FramedSignalProcessor as `frame_size`
# (presumably in samples -- confirm against the madmom documentation).
FFT_SIZE = 2048
# `num_bands` handed to FilteredSpectrogramProcessor
# (presumably bands per octave -- confirm against the madmom documentation).
NUM_BANDS = 12

class PreProcessor(SequentialProcessor):
  """Sequential madmom pipeline producing a log-scaled, filtered spectrogram.

  The stages run in this order: SignalProcessor (1 channel, 44100 Hz),
  FramedSignalProcessor, ShortTimeFourierTransformProcessor,
  FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor and finally
  ``np.array`` applied to the result.

  Args:
    frame_size: frame size passed to FramedSignalProcessor.
    num_bands: number of bands passed to FilteredSpectrogramProcessor.
    log: logarithm function passed to LogarithmicSpectrogramProcessor.
    add: value passed as ``add`` to LogarithmicSpectrogramProcessor.
    fps: frame rate passed to FramedSignalProcessor; also kept as ``self.fps``.
  """

  def __init__(self, frame_size = FFT_SIZE, num_bands = NUM_BANDS, log = np.log, add = 1e-6, fps = FPS):
    stages = (
      # basic signal processor; plays the role librosa.load would otherwise have
      SignalProcessor(num_channels = 1, sample_rate = 44100),
      FramedSignalProcessor(frame_size = frame_size, fps = fps),
      ShortTimeFourierTransformProcessor(),
      FilteredSpectrogramProcessor(num_bands = num_bands),
      LogarithmicSpectrogramProcessor(log = log, add = add),
      # last stage: hand the result to np.array
      np.array,
    )
    super().__init__(stages)
    self.fps = fps

In [None]:
def residual_block(x, i, activation, num_filters, kernel_size, padding, dropout_rate=0, name=''):
    """Build one residual block with two parallel dilated convolutions.

    Args:
        x: input tensor of the block.
        i: base dilation rate; the two parallel convolutions use ``i`` and ``2 * i``.
        activation: activation applied to the concatenated convolution outputs.
        num_filters: number of filters of every convolution in the block.
        kernel_size: kernel size of the two dilated convolutions.
        padding: padding mode of the two dilated convolutions.
        dropout_rate: rate of the spatial dropout (also embedded in its layer name).
        name: prefix for all layer names created here.

    Returns:
        A tuple ``(residual_output, skip_output)``: the sum of the 1x1-projected
        input and the processed signal, and the processed signal on its own.
    """
    prefix = name + '_dilation_%d' % i

    # project the input with a 1x1 convolution so it can be added back later
    shortcut = Conv1D(num_filters, 1, padding='same', name=prefix + '_1x1_conv_residual')(x)

    # two dilated convolutions over the same input, dilation rates i and 2i
    branches = [
        Conv1D(
            filters=num_filters,
            kernel_size=kernel_size,
            dilation_rate=rate,
            padding=padding,
            name=prefix + '_dilated_conv_%d' % branch_no,
        )(x)
        for branch_no, rate in enumerate((i, i * 2), start=1)
    ]
    merged = keras.layers.concatenate(branches, name=prefix + '_concat')

    # activation -> spatial dropout -> 1x1 conv back to `num_filters` channels
    processed = Activation(activation, name=prefix + '_activation')(merged)
    processed = SpatialDropout1D(dropout_rate, name=prefix + '_spatial_dropout_%f' % dropout_rate)(processed)
    processed = Conv1D(num_filters, 1, padding='same', name=prefix + '_1x1_conv')(processed)

    # residual sum goes to the next block, the processed signal is the skip connection
    residual_sum = keras.layers.add([shortcut, processed], name=prefix + '_merge_residual')
    return residual_sum, processed


class TCN:
    """Stack of dilated residual blocks (temporal convolutional network).

    Args:
        num_filters: either a single int (used for every block) or a sequence
            with one filter count per dilation.
        kernel_size: kernel size of the dilated convolutions.
        dilations: dilation rate of each consecutive residual block.
        activation: activation used inside the blocks and on the stack output.
        padding: ``'causal'`` or ``'same'``.
        dropout_rate: spatial dropout rate used inside the blocks.
        name: prefix for all layer names.

    Raises:
        ValueError: if ``padding`` is neither ``'causal'`` nor ``'same'``.
    """

    def __init__(
        self,
        num_filters=20,
        kernel_size=5,
        dilations=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024],
        activation='elu',
        padding='same',
        dropout_rate=0.15,
        name='tcn',
    ):
        if padding != 'causal' and padding != 'same':
            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")

        self.name = name
        self.dropout_rate = dropout_rate
        self.activation = activation
        # keep a private copy so instances never share (or mutate) the default list
        self.dilations = list(dilations)
        self.kernel_size = kernel_size
        # __call__ pairs every dilation with a filter count, so a scalar
        # (including the default) is expanded to one entry per dilation
        if isinstance(num_filters, int):
            num_filters = [num_filters] * len(self.dilations)
        self.num_filters = list(num_filters)
        self.padding = padding

    def __call__(self, inputs):
        """Apply the stack to ``inputs``.

        Returns:
            A tuple ``(x, skip)``: the activated output of the last residual
            block and the element-wise sum of all skip connections.
        """
        x = inputs
        # gather skip connections, each having a different context
        skip_connections = []
        # one residual block per (dilation, filter count) pair; zip stops at
        # the shorter of the two sequences
        for i, num_filters in zip(self.dilations, self.num_filters):
            # feed the output of the previous block into the next one
            x, skip_out = residual_block(
                x, i, self.activation, num_filters, self.kernel_size, self.padding, self.dropout_rate, name=self.name
            )
            skip_connections.append(skip_out)
        # activate the output of the TCN stack
        x = Activation(self.activation, name=self.name + '_activation')(x)
        # merge the skip connections by simply adding them
        skip = keras.layers.add(skip_connections, name=self.name + '_merge_skip_connections')
        return x, skip

In [None]:
def create_model(input_shape, num_filters=20, num_dilations=11, kernel_size=5, activation='elu', dropout_rate=0.15):
    """Build the tempo classification network.

    Three Conv2D stages (conv, activation, max pooling, dropout) feed a TCN
    whose summed skip connections drive a 250-way softmax tempo output.

    Args:
        input_shape: shape of one input example (without the batch axis).
        num_filters: number of filters of every convolution.
        num_dilations: number of TCN blocks; block ``k`` uses dilation ``2 ** k``.
        kernel_size: kernel size of the dilated TCN convolutions.
        activation: activation function used throughout the network.
        dropout_rate: rate for all dropout layers and for the Gaussian noise layer.

    Returns:
        A Keras ``Model`` mapping the input layer to the softmax tempo output.
    """
    input_layer = Input(shape=input_shape)

    # three conv stages that differ only in the kernel shape
    features = input_layer
    for stage_no, kernel in enumerate(((3, 3), (1, 10), (3, 3)), start=1):
        stage = 'conv_%d' % stage_no
        features = Conv2D(num_filters, kernel, padding='valid', name=stage + '_conv')(features)
        features = Activation(activation, name=stage + '_activation')(features)
        features = MaxPooling2D((1, 3), name=stage + '_max_pooling')(features)
        features = Dropout(dropout_rate, name=stage + '_dropout')(features)

    # reshape to (steps, num_filters) for the 1D TCN
    tcn_input = Reshape((-1, num_filters), name='tcn_input_reshape')(features)

    # TCN with exponentially growing dilation rates
    dilations = [2 ** exponent for exponent in range(num_dilations)]
    tcn_stack = TCN(
        num_filters=[num_filters] * len(dilations),
        kernel_size=kernel_size,
        dilations=dilations,
        activation=activation,
        padding='same',
        dropout_rate=dropout_rate,
    )
    # only the skip connections are used; the TCN output itself is discarded
    _, skip = tcn_stack(tcn_input)

    # tempo head on top of the skip connections
    tempo = Dropout(dropout_rate, name='tempo_dropout')(skip)
    tempo = GlobalAveragePooling1D(name='tempo_global_average_pooling')(tempo)
    tempo = GaussianNoise(dropout_rate, name='tempo_noise')(tempo)
    tempo = Dense(250, name='tempo_dense')(tempo)
    tempo = Activation('softmax', name='tempo')(tempo)

    return Model(input_layer, outputs=tempo)

In [None]:
import os
import sys
import warnings
from keras.utils import Sequence
import madmom

# NOTE(review): not referenced by any cell visible in this notebook; presumably
# a leftover marker value for masked targets -- confirm before removing.
MASK_VALUE = -1

class DataSequence(Sequence):
  """Keras ``Sequence`` that serves one pre-processed track per index.

  Every usable track is pre-processed once in ``__init__`` and kept in memory:
  ``self.x[key]`` holds the features with a leading batch axis and a trailing
  channel axis added, ``self.tempo[key]`` the one-hot tempo target with a
  leading batch axis, and ``self.ids`` the keys in processing order.

  Args:
    tracks: mapping from track id to a track object exposing ``beats``,
      ``audio`` and ``tempo``.
    pre_processor: callable turning a ``madmom.audio.Signal`` into features.
    num_tempo_bins: number of classes of the one-hot tempo target; the class
      index is the tempo rounded to the nearest integer.
    pad_frames: stored as ``self.pad_frames``; not used by this class itself.
  """

  def __init__(self, tracks, pre_processor, num_tempo_bins = 250, pad_frames = None):
    # let the Keras base class set up whatever state it maintains
    super().__init__()
    self.x = {}
    self.tempo = {}
    self.pad_frames = pad_frames
    self.ids = []

    for i, key in enumerate(tracks):
      sys.stderr.write(f'\rprocessing track {i + 1}/{len(tracks)}: {key + " " * 20}')
      sys.stderr.flush()
      t = tracks[key]
      try:
        # Accessed only so that a track whose `beats` has no `times`
        # attribute raises AttributeError and is skipped below.
        # NOTE(review): presumably `t.beats` is None for unannotated tracks -- confirm.
        _ = t.beats.times
        s = madmom.audio.Signal(*t.audio)
        tempo = keras.utils.to_categorical(int(np.round(t.tempo)), num_classes=num_tempo_bins, dtype='float32')
        tempo = tf.expand_dims(tf.constant(tempo), axis = 0)
        x = tf.constant(pre_processor(s))
        x = tf.expand_dims(x, axis = 0)
        x = tf.expand_dims(x, axis = -1)
      except AttributeError:
        print(f'\r{key} has no tempo information, skipping\n')
        continue
      except IndexError:
        # NOTE(review): presumably raised when the rounded tempo does not fit
        # into `num_tempo_bins` classes -- confirm.
        print(f'\r{key} raised an IndexError while being processed, skipping\n')
        continue
      # Store only after everything succeeded, so a failure half-way through
      # a track can never leave the three containers out of sync.
      self.tempo[key] = tempo
      self.x[key] = x
      self.ids.append(key)
      assert len(self.x) == len(self.tempo) == len(self.ids)

  def __len__(self):
    """Return the number of successfully processed tracks."""
    return len(self.ids)

  def __getitem__(self, idx):
    """Return the ``(features, tempo)`` pair of the ``idx``-th processed track.

    Raises:
      TypeError: if ``idx`` is neither a Python int nor a NumPy integer.
    """
    # NumPy integers are accepted as well; previously they fell through and
    # the method silently returned None.
    if isinstance(idx, (int, np.integer)):
      key = self.ids[idx]
      return self.x[key], self.tempo[key]
    raise TypeError(f'DataSequence indices must be integers, not {type(idx).__name__}')

In [None]:
pad_frames = 2
pre_processor = PreProcessor()

# Sets give constant-time membership tests while filtering the tracks; the
# selected tracks and their order (that of `tracks`) stay the same.
train_ids = set(train_files)
test_ids = set(test_files)

# Pre-process both splits once and keep the results in memory.
train = DataSequence(
    tracks={k: v for k, v in tracks.items() if k in train_ids}, pre_processor=pre_processor, pad_frames=pad_frames
)
test = DataSequence(
    tracks={k: v for k, v in tracks.items() if k in test_ids}, pre_processor=pre_processor, pad_frames=pad_frames
)

processing track 633/800: reggae.00088                    

reggae.00086 has no tempo information, skipping



processing track 200/200: metal.00098                    

In [None]:
# The input shape is taken from the first training item without its batch axis.
# NOTE(review): this fixes the time dimension to the length of that one track;
# confirm that the other tracks are accepted by the model.
model = create_model(input_shape = train[0][0].shape[-3:])
model.compile(
    optimizer = keras.optimizers.Adam(learning_rate = 0.0001),
    # NOTE(review): the model ends in a softmax over one-hot targets, which is
    # normally paired with CategoricalCrossentropy -- kept unchanged, confirm.
    loss = keras.losses.BinaryCrossentropy(),
    metrics = ["accuracy",],
)
# `validation_split` is only supported for tensors / NumPy arrays and raises a
# ValueError for a Sequence, so it is not passed here. To monitor held-out data
# during training, pass a separate Sequence via `validation_data=`.
model.fit(train, epochs = 100, shuffle = True)

ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class '__main__.DataSequence'>]

In [None]:
# Persist the trained model to "tempo.h5" in the current working directory.
model.save("tempo.h5")

  saving_api.save_model(


In [None]:
# Evaluate on the held-out test sequence; the displayed result holds the loss
# followed by the "accuracy" metric configured in compile().
model.evaluate(test)



[0.024480147287249565, 0.38499999046325684]

In [None]:
# Inspect the first test item: the (features, tempo) pair returned by
# DataSequence.__getitem__.
test[0]

In [None]:
# NOTE(review): `processor` is not used in this cell; kept in case later cells
# rely on it.
processor = PreProcessor()
# Print the predicted tempo class next to the annotated tempo for the first
# 20 test tracks (fewer if the test set is smaller).
for i in range(min(20, len(test))):
  # `track_id` instead of `id`, which would shadow the builtin
  track_id = test.ids[i]
  tempo = tracks[track_id].tempo
  prediction = model.predict(test[i][0])
  # index of the most probable class; targets were one-hot encoded at the
  # rounded tempo, so the index is comparable to the annotated tempo
  prediction = tf.argmax(prediction, axis = -1)
  print(prediction)
  print(tempo)
  print()
  print()

tf.Tensor([104], shape=(1,), dtype=int64)
103.44


tf.Tensor([63], shape=(1,), dtype=int64)
92.66


tf.Tensor([132], shape=(1,), dtype=int64)
131.99


tf.Tensor([61], shape=(1,), dtype=int64)
119.58


tf.Tensor([52], shape=(1,), dtype=int64)
65.67


tf.Tensor([69], shape=(1,), dtype=int64)
81.81


tf.Tensor([97], shape=(1,), dtype=int64)
87.37


tf.Tensor([52], shape=(1,), dtype=int64)
55.43


tf.Tensor([117], shape=(1,), dtype=int64)
115.95


tf.Tensor([120], shape=(1,), dtype=int64)
120.51


tf.Tensor([95], shape=(1,), dtype=int64)
93.59


tf.Tensor([120], shape=(1,), dtype=int64)
127.52


tf.Tensor([64], shape=(1,), dtype=int64)
124.09


tf.Tensor([162], shape=(1,), dtype=int64)
159.76


tf.Tensor([120], shape=(1,), dtype=int64)
117.3


tf.Tensor([172], shape=(1,), dtype=int64)
171.9


tf.Tensor([52], shape=(1,), dtype=int64)
65.46


tf.Tensor([101], shape=(1,), dtype=int64)
100.45


tf.Tensor([104], shape=(1,), dtype=int64)
104.39


tf.Tensor([139], shape=(1,), dtype=int64)
133.12


In [None]:
# Create an empty Git repository in the current working directory.
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [None]:
!git config -- global user.email "dongdong1653@gmail.com"
!git config -- global user.name "Joey-tpop"

error: key does not contain a section: global
error: key does not contain a section: global
