In [None]:
import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Reshape, Conv2D, MaxPooling2D, UpSampling2D
from keras import optimizers
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import tensorflow as tf
import librosa
from librosa import display
from librosa.output import write_wav

In [None]:
# Only report TensorFlow errors; INFO/WARNING log lines are suppressed.
tf.logging.set_verbosity(tf.logging.ERROR)

# Ask TensorFlow to grow its GPU memory allocation on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Hand the configured session to Keras so the model built later runs in it
# instead of in a default session that Keras would otherwise create itself.
keras.backend.set_session(sess)

In [None]:
# Load the training array and remember its global peak value; `denormalize`
# multiplies by this peak to return network output to the original scale.
X_train = np.load('./data/yes-data-2d.npy')
_max = X_train.max()

# Scale by the peak, then insert a new axis at index 3 (the single channel
# the Conv2D input layer expects).
X_train = (X_train / _max)[:, :, :, np.newaxis]

In [None]:
def denormalize(array, max_value=None):
    """Undo the peak normalization by multiplying by the peak value.

    Args:
        array: Normalized values (array or scalar) to rescale.
        max_value: Peak value to multiply by. When omitted, the module-level
            `_max` computed while loading the training data is used, which is
            the original behaviour of this function.

    Returns:
        `array` multiplied by the peak value.
    """
    if max_value is None:
        # Fall back to the peak recorded when the training data was loaded.
        max_value = _max
    return array * max_value

In [None]:
noise_factor = 0.04

# Draw the Gaussian noise from a dedicated, seeded generator so the noisy
# training set is identical on every Restart & Run All. A private RandomState
# is used so NumPy's global random state is left untouched.
noise_rng = np.random.RandomState(0)
X_train_noisy = X_train + noise_factor * noise_rng.normal(loc=0.0, scale=1.0, size=X_train.shape)

# Keep the corrupted inputs inside [0, 1].
X_train_noisy = np.clip(X_train_noisy, 0., 1.)

In [None]:
# Convolutional autoencoder for 128x24 single-channel inputs: one 2x2
# down-sampling stage, a middle convolution, one 2x2 up-sampling stage, and a
# sigmoid output layer with a single channel.
autoencoder = Sequential([
    # Encoder
    Conv2D(128, (3, 3), activation='relu', padding='same', input_shape=(128, 24, 1)),
    MaxPooling2D((2, 2), padding='same'),

    # Middle convolution at the reduced resolution
    Conv2D(128, (3, 3), activation='relu', padding='same'),

    # Decoder
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    UpSampling2D((2, 2)),

    # Single-channel output squashed by a sigmoid
    Conv2D(1, (3, 3), activation='sigmoid', padding='same'),
])

optim = Adam(lr=.002)
autoencoder.compile(loss='binary_crossentropy', optimizer=optim)

In [None]:
# Train the network to map the noisy inputs back to the clean targets,
# holding out 10% of the samples for validation.
autoencoder.fit(
    x=X_train_noisy,
    y=X_train,
    batch_size=256,
    epochs=100,
    validation_split=.1,
)

In [None]:
# Plot the noise-corrupted version of training example 8 as a 128x24 image.
noisy_example = X_train_noisy[8]
librosa.display.specshow(noisy_example.reshape(128, 24))

In [None]:
# Run training example 8 through the autoencoder as a batch of one, plot the
# prediction, and invert it to audio.
# NOTE(review): the network is fed the clean example here, not its noisy
# counterpart X_train_noisy[8] — confirm this is intended.
example_batch = X_train[8:9]
data = autoencoder.predict(example_batch).reshape(128, 24)
librosa.display.specshow(data)

# Undo the peak normalization before inverting the spectrogram to a waveform.
reconstructed = librosa.feature.inverse.mel_to_audio(
    denormalize(data),
    sr=12000,
    power=.5,
)
write_wav('reconstructed.wav', y=reconstructed, sr=12000)

In [None]:
# Reference for comparison: plot training example 8 as stored (no network
# involved) and invert it to audio.
clean_example = X_train[8]
data = np.reshape(clean_example, (128, 24))
librosa.display.specshow(data)

# Undo the peak normalization before inverting the spectrogram to a waveform.
real = librosa.feature.inverse.mel_to_audio(
    denormalize(data),
    sr=12000,
    power=.5,
)
write_wav('real.wav', y=real, sr=12000)

In [None]:
# Load at most one second of the generated clip, resampled to 12 kHz.
data, rate = librosa.load('./yes_p_generated6.wav', duration=1, sr=12000)

# `duration` only caps the length. The autoencoder has a fixed 128x24 input,
# so zero-pad shorter clips to exactly one second of samples; otherwise the
# spectrogram would come out with too few frames to feed the model.
# (Presumably one second at 12 kHz gives the expected 24 frames with librosa's
# default hop length — confirm if the STFT settings ever change.)
data = librosa.util.fix_length(data, size=rate)

# Mel spectrogram with the same n_mels/power settings used when inverting
# spectrograms back to audio elsewhere in this notebook.
spectrogram = librosa.feature.melspectrogram(y=data, sr=rate, n_mels=128, power=.5)
librosa.display.specshow(spectrogram)

In [None]:
# Bring the spectrogram onto the scale the network was trained on: divide by
# the training peak `_max` and clip to [0, 1], as was done for the training
# inputs. Without this the model sees values on a different scale than during
# training, while its output is still multiplied by `_max` afterwards.
# `spectrogram` itself is not rebound, so this cell can be re-run safely.
model_input = np.clip(spectrogram / _max, 0., 1.)

# Add the batch axis (front) and the channel axis (index 3).
model_input = model_input[np.newaxis, :, :, np.newaxis]

# Denoise, drop the singleton batch/channel axes, and plot the result.
denoised = np.squeeze(autoencoder.predict(model_input))
librosa.display.specshow(denoised)

# Undo the peak normalization and invert the spectrogram to a waveform.
output = librosa.feature.inverse.mel_to_audio(denormalize(denoised), sr=12000, power=.5)
write_wav('test.wav', sr=12000, y=output)