In [2]:
import tensorflow as tf
from tensorflow import keras

# Imports from keras
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Lambda, Dense, RepeatVector
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.losses import mse

import numpy as np
import librosa

from os import walk
import os
import sys

import datetime

In [3]:
def rescale(x):
    """Compress magnitudes onto a logarithmic scale.

    Computes ``1 + 2 * max(0.1 * log10(x + sqrt(eps)), -1)`` element-wise,
    where ``eps`` is the float64 machine epsilon. An input of 1 maps to
    (approximately) 1 and every factor of 10 shifts the output by 0.2.

    Args:
        x: scalar or array of values to rescale.

    Returns:
        The rescaled value(s), same shape as ``x``.
    """
    # Small positive offset so that log10 stays finite when x == 0.
    offset = np.sqrt(np.finfo(float).eps)
    log_scaled = 0.1 * np.log10(x + offset)
    # Floor the scaled logarithm at -1 before mapping to the output range.
    floored = np.maximum(log_scaled, -1)
    return 1 + 2 * floored

def inverseRescale(x):
    """Undo the logarithmic mapping ``y = 1 + 0.2 * log10(v)``.

    Computes ``10 ** (5 * (x - 1))`` element-wise.

    Args:
        x: scalar or array of rescaled values (integer or float).

    Returns:
        Floating-point value(s) ``10 ** (5 * (x - 1))``, same shape as ``x``.
    """
    # A float base is required: np.power with an integer base and a negative
    # integer exponent (e.g. x == 0 -> exponent -5) raises ValueError.
    return np.power(10.0, 5 * (x - 1))

In [4]:
class GetSounds(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of STFT magnitude spectrograms.

    File names are taken from the top level of ``path`` (sub-directories are
    not descended into) and shuffled on construction and at every epoch end.
    Each item is a float32 array of shape ``(n_files, N_FRAMES, N_BINS)``
    where ``n_files`` is ``batch_size`` except possibly for the last batch.

    Every file is expected to produce exactly ``N_FRAMES`` STFT frames; a
    file of a different length will fail when written into the batch array.
    """

    # STFT configuration shared by every batch.
    N_FFT = 1024
    HOP_LENGTH = 512
    N_FRAMES = 173
    N_BINS = N_FFT // 2 + 1  # 513 frequency bins for a 1024-point FFT

    def __init__(self, path, batch_size = 20000):
        """Collect the file names directly under ``path``.

        Args:
            path: directory containing the audio files.
            batch_size: maximum number of files per batch.
        """
        # Only the first tuple yielded by walk() (the top-level directory) is
        # used; a missing directory yields nothing and leaves the list empty.
        self.file_names = next(walk(path), (path, [], []))[2]
        self.path = path
        self.batch_size = batch_size
        self.on_epoch_end()

    def on_epoch_end(self):
        """Shuffle the file order in place."""
        np.random.shuffle(self.file_names)

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.file_names) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch ``idx`` and return its magnitude spectrograms.

        Args:
            idx: zero-based batch index.

        Returns:
            float32 array of shape ``(n_files, N_FRAMES, N_BINS)``.
        """
        batch_names = self.file_names[self.batch_size*idx:self.batch_size*(idx+1)]
        n_files = len(batch_names)
        # Size the buffer by the files actually present so a short final batch
        # never exposes uninitialised rows, and allocate float32 up front
        # instead of building a float64 copy that is cast afterwards.
        out = np.empty([n_files, self.N_FRAMES, self.N_BINS], dtype='float32')
        for i, file_name in enumerate(batch_names):
            print('\r', 'read ', i, '/', n_files, end='')
            # os.path.join works whether or not `path` ends with a separator.
            y, sr = librosa.load(os.path.join(self.path, file_name))
            spectrum = librosa.stft(y, n_fft = self.N_FFT, hop_length = self.HOP_LENGTH)
            # Transpose to (frames, bins) and keep only the magnitude.
            out[i] = np.abs(np.transpose(spectrum))
        return out

In [5]:
# Materialise one large batch from each directory as an in-memory array.
# Note that the training array is read from the 'nsynth-valid' directory.
x_train = GetSounds(path='nsynth-valid/audio/', batch_size=8192 + 4096)[0]
x_test = GetSounds(path='nsynth-test/audio/', batch_size=4096)[0]

 read  4095 / 409688

In [6]:
# Reparameterization trick: draw z = mean + std * eps with eps ~ N(0, 1), so gradients can flow through mean and std
def sample(args):
    """Draw a latent sample ``z = mean + exp(0.5 * log_var) * eps``.

    Args:
        args: pair ``(z_mean, z_log_var)`` of tensors with identical shape.
            As built in this notebook they are 3-D: (batch, time, latent_dim).

    Returns:
        Tensor with the same shape as ``z_mean``.
    """
    z_mean, z_log_var = args
    # Draw noise with the full dynamic shape of z_mean. Sampling only a
    # (time, dim) tensor would broadcast one noise draw over the batch axis,
    # giving every sequence in the batch the same epsilon.
    epsilon = K.random_normal(shape=K.shape(z_mean))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

In [15]:
# Model Parameters
# input_dim matches the last axis (513) of the arrays returned by GetSounds.
input_dim = 513
intermediate_dim = 64
latent_dim = 20

# Construct the model: input is a variable-length sequence of input_dim-wide frames
x = Input((None, input_dim))

# LSTM Encoder (one output vector per time step)
h = LSTM(intermediate_dim, return_sequences=True)(x)

# Generate Distribution: two parallel LSTMs over h, one per distribution parameter.
# Despite its name, z_log_sigma is consumed as a log-variance: sample() scales it
# by 0.5 before exponentiating and the KL term uses exp(z_log_sigma) directly.
z_mu = LSTM(latent_dim, return_sequences=True)(h)
z_log_sigma = LSTM(latent_dim, return_sequences=True)(h)

# Sample from Distribution
z = Lambda(sample)([z_mu, z_log_sigma])

# Decoder layers are created once, unapplied, so the same layer objects (and
# therefore the same weights) can be reused by both `vae` and `generator` below.
# NOTE(review): Keras LSTM defaults to tanh activation, which would bound the
# reconstruction to [-1, 1], while x_train holds raw STFT magnitudes and
# `rescale` is never applied — confirm whether rescaling was intended.
decoder_h = LSTM(intermediate_dim, return_sequences=True)
decoder_mean = LSTM(input_dim, return_sequences=True)

h_decoded = decoder_h(z)

# Reconstructed frames, input_dim wide per time step
x_bar = decoder_mean(h_decoded)

# Full Autoencoder: input -> sampled latent -> reconstruction
vae = Model(x, x_bar)

# Encoder, Input -> Latent Space (outputs z_mu, not a sampled z)
encoder = Model(x, z_mu)

# generator, from latent space to reconstructed inputs
decoder_input = Input(shape=(None, latent_dim))

# Apply the shared decoder layers to the standalone latent input
_h_decoded = decoder_h(decoder_input)

_x_bar = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_bar)

In [16]:
# Setup cost functions
def loss(x, x_bar,beta):
    """Build the VAE objective: reconstruction error plus weighted KL term.

    The KL term is computed from the module-level tensors ``z_mu`` and
    ``z_log_sigma`` rather than from the arguments.

    Args:
        x: input tensor.
        x_bar: reconstruction tensor.
        beta: weight applied to the KL term.

    Returns:
        Tensor ``mse(x, x_bar) + beta * kl``.
    """
    # Reconstruction error (mean squared error, as computed by keras `mse`).
    reconstruction_term = mse(x, x_bar)
    # KL term, averaged over every element of the latent tensors.
    kl_inner = 1 + z_log_sigma - K.square(z_mu) - K.exp(z_log_sigma)
    kl_term = - 0.5 * K.mean(kl_inner)
    return reconstruction_term + beta * kl_term
    
# Attach the objective to the model as a tensor built from its own
# input/output, with the KL term weighted by beta = 1.
vae.add_loss(loss(x, x_bar, beta=1))

In [17]:
# Finally, compile the model! No `loss=` argument is passed here: the
# objective is the tensor already registered through vae.add_loss(...).
vae.compile(optimizer='rmsprop')

In [18]:
epochs = 10

# Inputs only, no targets: the loss was attached with add_loss and is computed
# from the model's own input and output.
vae.fit(
    x_train,
    batch_size=512,
    epochs=epochs,
    verbose=1,
    validation_data=[x_test],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4f88529dc0>

In [None]:
def vae_predict(input):
    """Reconstruct a single 2-D array by running it through encoder then generator.

    Args:
        input: 2-D array; it is reshaped to a batch of one before encoding.

    Returns:
        The first (only) item of the generator's prediction, as a 2-D array.
    """
    n_rows, n_cols = input.shape[0], input.shape[1]
    # Add a leading batch axis of size 1 for predict().
    batched = np.reshape(input, [1, n_rows, n_cols])
    latent = encoder.predict(batched)
    reconstruction = generator.predict(latent)
    # Strip the batch axis again.
    return reconstruction[0, :, :]

In [None]:
def apply(file):
    """Run an audio file through the VAE and write the result to 'out2.wav'.

    The magnitude spectrogram is reconstructed by ``vae_predict``; the phase
    of the original signal is reused for the inverse transform.

    Args:
        file: path of the audio file to process.
    """
    y, sr = librosa.load(file)
    # Use the same analysis settings as the training data (n_fft=1024,
    # hop_length=512 in GetSounds). librosa's default hop is n_fft // 4, which
    # would feed the model frames at twice the frame rate it was trained on.
    S = np.transpose(librosa.stft(y, n_fft = 1024, hop_length = 512))
    # Combine the predicted magnitudes with the original phase.
    S_pred = vae_predict(np.abs(S))*np.exp(1j*np.angle(S))
    # The inverse transform must use the same hop; `length` pins the output to
    # the input's sample count.
    out = librosa.istft(np.transpose(S_pred), hop_length = 512, length = len(y))
    # NOTE(review): librosa.output was removed in librosa 0.8.0; on newer
    # installs this call must be replaced (e.g. with soundfile.write) —
    # confirm the installed librosa version.
    librosa.output.write_wav('out2.wav', out, sr, norm = True)

In [None]:
# Process the example recording; apply() writes its result to 'out2.wav'.
apply(file='bass_lo.wav')