In [1]:
import os
import glob
import librosa
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
from matplotlib import pylab, mlab, pyplot
import shutil
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Activation, Reshape, Permute
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D
from tensorflow.keras.layers import GRU, LSTM
import tensorflow as tf
import random
import pathlib
import glob
import os
import random
import tensorflow as tf
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

Извлечение признаков (мел-спектрограмма)

In [82]:
# Audio loading and peak normalisation
def data_preprocessing(audio_file, sr):
    """Load an audio file as a peak-normalised mono float32 signal.

    Args:
        audio_file: Path of the audio file, forwarded to ``librosa.load``.
        sr: Target sampling rate, forwarded to ``librosa.load``.

    Returns:
        A ``np.float32`` array scaled so that its largest absolute sample
        is 1.0. A silent (all-zero) or empty signal is returned unscaled.
    """
    # mono=True asks librosa to down-mix to a single channel, so no
    # per-channel (2-D) handling is needed afterwards.
    audio, _ = librosa.load(audio_file, sr=sr, mono=True)
    audio = audio.astype(np.float32)

    # Guard the empty case explicitly: np.max raises on a zero-size array.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio /= peak
    return audio


# Build the transposed mel-spectrogram matrix (in decibels)
def create_mel_spectrogram(audio_file, sr, hop_length, fragment_size_ms, layering):
    """Compute the dB-scaled mel spectrogram of an audio file and transpose it.

    Args:
        audio_file: Path of the audio file to analyse.
        sr: Sampling rate used both for loading and for the mel filter bank.
        hop_length: Hop between successive analysis frames, in samples.
        fragment_size_ms: Accepted but not used by this function.
        layering: Accepted but not used by this function.

    Returns:
        The transpose of ``librosa.power_to_db`` applied to the mel power
        spectrogram (FFT size 512), with decibels taken relative to the
        spectrogram's maximum.
    """
    signal = data_preprocessing(audio_file, sr)
    mel_power = librosa.feature.melspectrogram(
        y=signal,
        sr=sr,
        n_fft=512,
        hop_length=hop_length,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db.T


def save_spectrogram(data, filename):
    """Render a 2-D matrix as a borderless PNG, one pixel per matrix cell.

    Args:
        data: 2-D array to draw with ``librosa.display.specshow``.
        filename: Destination path of the PNG file.

    The figure is sized ``data.shape[1]`` x ``data.shape[0]`` "inches" at
    dpi=1, i.e. the image is ``data.shape[1]`` pixels wide and
    ``data.shape[0]`` pixels high.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]
    fig = pylab.figure(figsize=(n_cols, n_rows), dpi=1)
    try:
        # A single axes spanning the whole canvas, without frame or ticks,
        # so the saved image has no white border. Drawing on this axes
        # explicitly avoids creating a second, hidden default axes.
        ax = fig.add_axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[])
        librosa.display.specshow(data, ax=ax)
        fig.savefig(filename, bbox_inches=None, pad_inches=0, format="png")
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise figures pile up in memory across the export loop.
        pylab.close(fig)


# Collect every audio file of the source directory (mp3 first, then wav)
filepath = '../songs'
audio_files = [
    path
    for pattern in ('*.mp3', '*.wav')
    for path in glob.glob(os.path.join(filepath, pattern))
]

# Always start from an empty output folder: wipe the images of a previous run
output_folder = 'spectrograms'
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
    print("Содержимое папки 'spectrograms' успешно удалено.")
os.makedirs(output_folder)

print('Обработаны следующие файлы:')
for file in audio_files:
    base_name = os.path.basename(file)
    print(base_name)
    # 16 kHz sampling rate, hop of 512 samples; the last two arguments are
    # accepted by create_mel_spectrogram but not used by it.
    data = create_mel_spectrogram(file, 16000, 512, 1000, 1)

    # The image is named after the audio file, with the extension replaced
    filename = os.path.splitext(base_name)[0]
    output_file = os.path.join(output_folder, filename + ".png")
    save_spectrogram(data, output_file)

print('Данные сохранены в png файлах.')

Содержимое папки 'spectrograms' успешно удалено.
Обработаны следующие файлы:
7 Seconds -- Youssou N'Dour, Neneh Cherry.mp3
a-ha -- Take On Me.mp3
ABBA -- Money, Money, Money.mp3
ABBA -- The Winner Takes It All.mp3
All The Things She Said.mp3
Animals — Martin Garrix.mp3
Another One Bites The Dust — Queen.mp3
Apologize.mp3
Appletree.mp3
B.o.B, Hayley Williams of Paramore -- Airplanes (feat. Hayley Williams of Paramore).mp3
B.o.B, Jessie J -- Price Tag.mp3
Bad Bad Boys.mp3
Bad Romance.mp3
Bag Raiders -- Shooting stars.mp3
Данные сохранены в png файлах.


Используемая модель

In [83]:
def CRNN2D(IMG_SHAPE, nb_classes):
    '''
    Build a convolutional-recurrent (CRNN) classifier for spectrogram images.

    Model used for evaluation in paper. Inspired by K. Choi model in:
    https://github.com/keunwoochoi/music-auto_tagging-keras/blob/master/music_tagger_crnn.py

    Layer stack:
        BatchNorm over the frequency axis
        4 x [Conv2D 3x3 'same' -> ELU -> BatchNorm(channels) -> MaxPool -> Dropout(0.1)]
        Permute -> Reshape to a (steps, features) sequence
        GRU(32, sequences) -> GRU(32) -> Dropout(0.3)
        Dense(nb_classes) -> softmax

    Args:
        IMG_SHAPE: Indexable with at least three entries; entries 0, 1 and 2
            are used as the per-sample input shape. With the axis constants
            below they are interpreted as (time, frequency, channels).
        nb_classes: Number of units of the softmax output layer.

    Returns:
        The uncompiled ``Sequential`` model.
    '''
    nb_filters = [64, 128, 128, 128]  # output channels of each conv block
    kernel_size = (3, 3)              # convolution kernel size
    activation = 'elu'                # activation applied after each conv

    # One pooling window per conv block, ordered like the input axes.
    # Over the four blocks the first spatial axis shrinks by 2*2*2*2 = 16
    # and the second one by 2*4*4*4 = 128.
    pool_size = [(2, 2), (2, 4), (2, 4), (2, 4)]

    input_shape = (IMG_SHAPE[0], IMG_SHAPE[1], IMG_SHAPE[2])

    # Axis indices of a batched tensor (index 0 is the batch axis).
    time_axis      = 1
    frequency_axis = 2
    channel_axis   = 3

    model = Sequential()
    # Declare the input once, explicitly, instead of passing `input_shape`
    # to layers (only the first layer's value was ever taken into account).
    model.add(tf.keras.Input(shape=input_shape))
    # Normalise along the frequency axis
    model.add(BatchNormalization(axis=frequency_axis))

    # Convolutional blocks; the data format is pinned so the channel axis
    # used by BatchNormalization is valid whatever the global Keras config.
    for filters, pool in zip(nb_filters, pool_size):
        model.add(Conv2D(filters, kernel_size=kernel_size, padding='same',
                         data_format="channels_last"))
        model.add(Activation(activation))
        model.add(BatchNormalization(axis=channel_axis))
        model.add(MaxPooling2D(pool_size=pool, strides=pool,
                               data_format="channels_last"))
        model.add(Dropout(0.1))

    # Put the time axis first for the recurrent layers. With the axis
    # constants above this permutation is (1, 2, 3), i.e. an identity; it is
    # kept so the layer stack is unchanged and still correct if the axis
    # constants are ever swapped.
    model.add(Permute((time_axis, frequency_axis, channel_axis)))

    # Flatten (frequency, channels) into one feature vector per time step
    _, steps, freq_bins, channels = model.output_shape
    model.add(Reshape((steps, freq_bins * channels)))

    # Recurrent layers: the second GRU keeps only its last state
    model.add(GRU(32, return_sequences=True))
    model.add(GRU(32, return_sequences=False))
    model.add(Dropout(0.3))

    # Output layer
    model.add(Dense(nb_classes))
    model.add(Activation("softmax"))
    return model

Обучение модели

In [125]:
batch_size = 1
r_seed = 777

# Collect the spectrogram images. glob returns paths in an arbitrary,
# filesystem-dependent order, so sort them first: otherwise the seeded
# shuffle below would not give the same order on every machine/run.
filepath = 'spectrograms'
spectrogram_files = sorted(glob.glob(os.path.join(filepath, '*.png')))

# Deterministic shuffle of the image list
random.Random(r_seed).shuffle(spectrogram_files)

# One class per image. Count the images that are actually labelled here
# rather than relying on a variable left over from another cell.
classes_number = len(spectrogram_files)
print('Количество композиций:', classes_number)

# Class index of every image path (its position in the shuffled list)
label_to_index = {name: index for index, name in enumerate(spectrogram_files)}
all_image_labels = list(label_to_index.values())
print(all_image_labels)

# One-hot encoding
def to_uni(pos, lenof):
    """Return a list of ``lenof`` floats that is 1.0 at index ``pos`` and 0.0 elsewhere.

    ``pos`` follows normal list indexing: negative values count from the
    end and an out-of-range value raises ``IndexError``.
    """
    one_hot = [0.0] * lenof
    one_hot[pos] = 1.0
    return one_hot

# Replace every integer class label by its one-hot vector
all_image_labels = [to_uni(label, classes_number) for label in all_image_labels]
print(all_image_labels)

# The height and width of the first image are used as the common image size
print('Размер изображений:')
first_image = tf.image.decode_png(tf.io.read_file(spectrogram_files[0]))
first_shape = first_image.shape
pixels = (first_shape[0], first_shape[1])
print(pixels)

# Dataset of (image path, one-hot label) pairs
ds = tf.data.Dataset.from_tensor_slices(
    (spectrogram_files, all_image_labels)
)

def preprocess_image(image):
    """Decode PNG bytes into a 3-channel float32 image resized to ``pixels``.

    Args:
        image: Encoded PNG content, as returned by ``tf.io.read_file``.

    Returns:
        The decoded image, resized to the module-level ``pixels``
        (height, width), cast to float32 and divided by 255.
    """
    decoded = tf.image.decode_png(image, channels=3)
    resized = tf.image.resize(decoded, pixels)
    return tf.cast(resized, tf.float32) / 255.0

def load_and_preprocess_image(path):
    """Read the file at ``path`` and return ``preprocess_image`` of its raw content."""
    return preprocess_image(tf.io.read_file(path))


def load_and_preprocess_from_path_label(path, label):
    """Map a (path, label) pair to (preprocessed image, label); the label is passed through unchanged."""
    image = load_and_preprocess_image(path)
    return image, label


# Load the images lazily and remember the per-sample image shape (taken
# before batching) for building the model.
image_label_ds = ds.map(load_and_preprocess_from_path_label)
inp_shape = image_label_ds.element_spec[0].shape
image_label_ds = image_label_ds.batch(batch_size)

# Split sizes are counted in batches: 70 % train, 15 % validation.
train_size = int(0.70 * classes_number / np.float64(batch_size))
val_size   = int(0.15 * classes_number / np.float64(batch_size))
# The test split receives every batch left after train and validation
# (see the skip() calls below), so report that real count instead of a
# separately rounded 15 %, which can disagree with the actual split.
total_batches = int(image_label_ds.cardinality().numpy())
test_size  = total_batches - train_size - val_size

print("train_size: ", train_size, flush=True)
print("val_size: ", val_size, flush=True)
print("test_size: ",test_size, flush=True)

# Consecutive, non-overlapping slices of the (already shuffled) dataset
train_dataset = image_label_ds.take(train_size)
remaining_dataset = image_label_ds.skip(train_size)
val_dataset = remaining_dataset.take(val_size)
test_dataset = remaining_dataset.skip(val_size)

Количество композиций: 14
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1

In [None]:
# Start every training run with an empty weights folder
weights_folder = 'weights'
if os.path.exists(weights_folder):
    shutil.rmtree(weights_folder)
    print("Содержимое папки 'weights' успешно удалено.")
os.makedirs(weights_folder)

learning_rate = 0.01
epochs = 20

# The optimizer and callbacks are taken from tf.keras, the same Keras
# implementation the model layers are imported from, so the two can never
# come from mismatched Keras installations.
model = CRNN2D(inp_shape, nb_classes=classes_number)
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)
model.summary()

# The final model keeps its path (weights/model.keras). The best checkpoint
# gets its own file so that the final save no longer overwrites it.
final_model_path = os.path.join(weights_folder, "model.keras")
best_model_path = os.path.join(weights_folder, "model_best.keras")

checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_model_path, verbose=1, save_best_only=True)
# min_delta is the smallest change of the monitored loss that counts as an
# improvement. The previous value of 1 meant that almost no epoch could
# qualify, so training was stopped after `patience` epochs regardless of
# real progress.
earlystopper = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=1e-3, patience=10, verbose=0, mode='auto')
callbacks = [checkpointer, earlystopper]

# The datasets are already batched, so `batch_size` must not be passed to fit().
history = model.fit(train_dataset, validation_data=val_dataset,
                    epochs=epochs, callbacks=callbacks)
model.save(final_model_path)