In [None]:
import tensorflow as tf
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam, RMSprop
import matplotlib.pyplot as plt
import numpy as np
import os
import librosa
from tqdm import tqdm
from scipy.io.wavfile import write

In [None]:
# Only let TensorFlow log messages at ERROR level or above, to keep cell output readable.
# NOTE(review): `tf.logging` is the TensorFlow 1.x logging API — confirm the pinned TF version.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Geometry of one training/generated "image": rows x cols x channels.
img_rows, img_cols, channels = 100, 44, 1
img_shape = (img_rows, img_cols, channels)

# Length of the random noise vector fed to the generator.
latent_dim = 50

# Lower/upper bounds used by denormalize() to map normalized values back.
# NOTE(review): presumably the min/max of the raw training data — confirm
# against the preprocessing that produced 'drum-data.npy'.
_min, _max = 0, 3140.9622

In [None]:
# Load the training set from a NumPy file in the working directory.
# NOTE(review): presumably shaped (n_samples, 100, 44, 1) to match img_shape, and
# already scaled with _min/_max — confirm against the preprocessing step.
X_train = np.load('drum-data.npy')

In [None]:
def build_generator():
    """Build the generator network (noise vector -> image).

    A dense layer projects the ``latent_dim``-sized input onto a 25x11 map
    with 128 features. Two upsample/convolution stages then each double both
    spatial dimensions (25x11 -> 50x22 -> 100x44), and a final tanh
    convolution produces ``channels`` output maps.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    model = Sequential()

    # Project the noise and fold it into a small feature map.
    model.add(Dense(128 * 25 * 11, activation='relu', input_dim=latent_dim))
    model.add(Reshape((25, 11, 128)))

    # Two identical upsampling stages that differ only in filter count.
    for n_filters in (128, 64):
        model.add(UpSampling2D())
        model.add(Conv2D(n_filters, kernel_size=3, padding='same'))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Activation('relu'))

    # Collapse to the output channel count; tanh bounds values to [-1, 1].
    model.add(Conv2D(channels, kernel_size=3, padding='same'))
    model.add(Activation('tanh'))

    return model

In [None]:
def build_discriminator():
    """Build the discriminator network (image -> probability of being real).

    Four convolutional stages (32, 64, 128, 256 filters), each followed by
    LeakyReLU and dropout, feed a single sigmoid unit. The first three
    convolutions use stride 2; the last uses stride 1.

    Returns:
        An uncompiled Keras ``Sequential`` model taking ``img_shape`` inputs.
    """
    stack = [
        # Stage 1: no batch normalization on the raw input.
        Conv2D(32, kernel_size=3, strides=2, input_shape=img_shape, padding='same'),
        LeakyReLU(alpha=0.2),
        Dropout(0.25),

        # Stage 2: pad one row/column at the bottom/right after the convolution.
        Conv2D(64, kernel_size=3, strides=2, padding='same'),
        ZeroPadding2D(padding=((0,1),(0,1))),
        BatchNormalization(momentum=0.8),
        LeakyReLU(alpha=0.2),
        Dropout(0.25),

        # Stage 3
        Conv2D(128, kernel_size=3, strides=2, padding='same'),
        BatchNormalization(momentum=0.8),
        LeakyReLU(alpha=0.2),
        Dropout(0.25),

        # Stage 4: stride 1, so spatial size is kept.
        Conv2D(256, kernel_size=3, strides=1, padding='same'),
        BatchNormalization(momentum=0.8),
        LeakyReLU(alpha=0.2),
        Dropout(0.25),

        # Classification head.
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [None]:
def train(X_train, gan_model, discriminator, epochs, batch_size=128, save_interval=100, progress_interval=1):
    """Alternate discriminator and generator updates for ``epochs`` iterations.

    Every iteration processes exactly one batch: the discriminator is trained
    on a random batch of real samples (target 1) and on a batch of generated
    samples (target 0), then the combined model is trained to make the
    discriminator output 1 for the same noise.

    Besides its arguments, this function uses the module-level ``generator``
    and ``latent_dim`` and calls ``save_images`` / ``save_audio``.

    Args:
        X_train: Array of real samples, indexed along its first axis.
        gan_model: Compiled generator+discriminator stack; its
            ``train_on_batch`` result is printed as a single loss value.
        discriminator: Compiled discriminator; its ``train_on_batch`` result
            is read as ``[loss, accuracy]``.
        epochs: Number of single-batch iterations to run.
        batch_size: Number of real and of generated samples per iteration.
        save_interval: Save an image grid every this many iterations, and an
            audio clip every ``2 * save_interval`` iterations.
        progress_interval: Print losses every this many iterations.

    Returns:
        The ``(gan_model, discriminator)`` pair that was passed in.
    """
    # Fixed target outputs: 1 = real, 0 = generated.
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))
    audio_interval = save_interval * 2

    for epoch in range(epochs):
        # Draw a random batch of real samples (indices may repeat).
        batch_idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_images = X_train[batch_idx]

        # Generate an equally sized batch from fresh noise.
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        generated_images = generator.predict(noise)

        # Discriminator step: real -> 1, generated -> 0; average both results.
        d_loss_real = discriminator.train_on_batch(real_images, real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_images, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Generator step: push the stacked model toward "real" on the same noise.
        g_loss = gan_model.train_on_batch(noise, real_labels)

        if not epoch % progress_interval:
            report = "%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss)
            print(report)

        if not epoch % save_interval:
            save_images(epoch)

        if not epoch % audio_interval:
            save_audio(epoch)

    return gan_model, discriminator

In [None]:
def save_images(epoch):
    """Save a 3x3 grid of freshly generated samples as a PNG.

    Samples nine noise vectors, runs them through the module-level
    ``generator`` and writes the first channel of each output to
    ``images/drums_<epoch>.png``. The ``images`` directory is created if it
    does not exist yet.

    Args:
        epoch: Integer iteration number used in the file name.
    """
    rows, cols = 3, 3
    noise = np.random.normal(0, 1, (rows * cols, latent_dim))
    gen_imgs = generator.predict(noise)

    # savefig does not create missing directories; without this the very
    # first call fails on a fresh checkout.
    os.makedirs("images", exist_ok=True)

    fig, axs = plt.subplots(rows, cols)
    # axs.flat walks the grid row by row, matching the sample order.
    for ax, img in zip(axs.flat, gen_imgs):
        ax.imshow(img[:, :, 0], cmap='hot')
        ax.axis('off')
    fig.savefig("images/drums_%d.png" % epoch)
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

In [None]:
def save_audio(epoch):
    """Generate one sample, invert it to a waveform and save it as a WAV file.

    Runs a single noise vector through the module-level ``generator``,
    rescales the output with ``denormalize``, treats the result as a mel
    spectrogram for ``librosa.feature.inverse.mel_to_audio`` and writes
    ``audio/drums_<epoch>.wav``. The ``audio`` directory is created if it
    does not exist yet.

    Args:
        epoch: Iteration number used in the file name.
    """
    sample_rate = 22050

    # The WAV writer does not create missing directories; make sure the
    # target exists before doing the (slow) spectrogram inversion.
    os.makedirs('audio', exist_ok=True)

    noise = np.random.normal(0, 1, (1, latent_dim))
    generated_images = generator.predict(noise)
    generated_image = np.squeeze(generated_images[0])

    # NOTE(review): the generator ends in tanh, so values may be negative and
    # denormalize() (with _min = 0) would then yield negative mel values —
    # confirm whether clipping to >= 0 is wanted before inversion.
    audio = librosa.feature.inverse.mel_to_audio(denormalize(generated_image), sr=sample_rate)
    write('audio/drums_' + str(epoch) + '.wav', sample_rate, audio)

In [None]:
def denormalize(array):
    """Rescale normalized values to the ``[_min, _max]`` range.

    Linear map using the module-level ``_min`` / ``_max``: an input of 0
    becomes ``_min`` and an input of 1 becomes ``_max``.

    Args:
        array: Number or NumPy array of normalized values.

    Returns:
        The rescaled value(s), same shape as ``array``.
    """
    value_range = _max - _min
    return array * value_range + _min

In [None]:
# Build both networks.
generator = build_generator()
discriminator = build_discriminator()
# Compile the discriminator on its own (with an accuracy metric) so it can be
# trained directly through discriminator.train_on_batch.
d_optimizer = Adam(lr=0.0001, decay=1e-8)
discriminator.compile(loss='binary_crossentropy', optimizer=d_optimizer, metrics=['accuracy'])
# Order matters: trainable is switched off AFTER the discriminator's own compile
# and BEFORE the combined model is compiled. The intent is that gan_model updates
# only the generator, while the standalone discriminator still learns.
discriminator.trainable = False
# Combined model: noise -> generator -> discriminator -> real/fake probability.
gan_model = Sequential()
gan_model.add(generator)
gan_model.add(discriminator)
# The combined model uses a 10x larger learning rate than the discriminator.
gan_optimizer = Adam(lr=0.001)
gan_model.compile(loss='binary_crossentropy', optimizer=gan_optimizer)

In [None]:
# Run 5001 single-batch iterations with 32 samples per batch: losses are printed
# every 50 iterations, an image grid is saved every 150 and an audio clip every 300.
gan_model, discriminator = train(X_train, gan_model, discriminator, epochs=5001, batch_size=32, save_interval=150, progress_interval=50)

In [None]:
# Generate a single sample from the trained generator and display it.
noise = np.random.normal(0, 1, (1, latent_dim))
generated_images = generator.predict(noise)
# Drop the singleton batch/channel axes so imshow receives a 2-D array.
generated_image = np.squeeze(generated_images)
plt.imshow(generated_image, cmap='hot')

# Rescale the sample, invert it as a mel spectrogram at 22050 Hz and write the waveform to disk.
test = librosa.feature.inverse.mel_to_audio(denormalize(generated_image), sr=22050)
write('test.wav', 22050, test)

In [None]:
# Display one real training example (index 4), presumably for visual comparison
# with the generated sample.
a = np.squeeze(X_train)
plt.imshow(a[4], cmap='hot')