## Spoken GAN

### A Generative Adversarial Network (GAN) for Spoken Digits

In [None]:
# SOURCE SPOKEN DIGIT DATASET

!git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git

In [9]:
# GET FILE NAMES OF SAMPLES

from os import listdir, getcwd
from os.path import isfile, join

# Recordings live inside the cloned repository, relative to the working directory.
data_dir = join(getcwd(), 'free-spoken-digit-dataset', 'recordings')

# listdir() returns entries in arbitrary order; sorting gives X and Y the same
# sample order on every run and on every machine.
file_names = sorted(f for f in listdir(data_dir) if f.endswith('.wav'))

In [10]:
# IMPORT DATA, PAD IT, ARRANGE INTO TENSOR

from scipy.io import wavfile
import numpy as np

# Read every recording; the sample rate returned by wavfile.read is discarded.
wavs = [wavfile.read(join(data_dir, f))[1] for f in file_names]

max_length = max(w.shape[0] for w in wavs)
# padded_length is the least multiple of 128 that is at least 512 and
# contains max_length. -(-a // b) is ceiling division; the max() enforces the
# 512-sample floor, which the ceiling alone does not guarantee.
padded_length = max(512, 128 * -(-max_length // 128))

# Centre each recording inside a zero-filled buffer of padded_length samples.
# When the amount of padding is odd, the extra sample goes on the right.
padded_wavs = []
for wav in wavs:
    pad_size = padded_length - wav.shape[0]
    left_pad = pad_size // 2
    right_pad = pad_size - left_pad
    padded_wavs.append(np.pad(wav, (left_pad, right_pad), mode='constant'))

# X holds one padded waveform per row; Y holds one single-element label row per
# sample, parsed from the first character of the file name.
X = np.stack(padded_wavs, axis=0)
Y = np.array([[int(f[:1])] for f in file_names])

In [14]:
import tensorflow as tf
from tensorflow.keras import layers

# Mini-batch size, and a shuffle buffer as large as the number of rows in X.
BATCH_SIZE = 128
BUFFER_SIZE = X.shape[0]

# Slice X row by row, shuffle with the full-size buffer, then group into batches.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(X)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [136]:
# STFT framing parameters, in samples, used by the round-trip cells that follow.
frame_length = 512
frame_step = 128
num_outputs = 4096

# NOTE(review): InverseSTFTLayer is defined in a later cell of this notebook, so
# on a top-to-bottom run from a fresh kernel this line fails until that cell has
# been executed.
l = InverseSTFTLayer(frame_length, frame_step)

In [137]:
# Round-trip check: take the STFT of random noise, then invert it with the layer.
waveform = tf.random.normal(dtype=tf.float32, shape=[4096])
stft = tf.signal.stft(waveform, frame_length, frame_step, pad_end=False)
# Bare last expression, so the reconstructed waveform is shown as the cell output.
l(stft)

<tf.Tensor: shape=(4096,), dtype=float32, numpy=
array([ 0.0000000e+00, -2.5239810e-11,  1.5468522e-08, ...,
        1.6693395e-08,  1.4146067e-08, -6.2463862e-10], dtype=float32)>

In [142]:
# Play the original noise waveform.
speak(waveform)

In [141]:
# Play the waveform reconstructed from the STFT by the inverse-STFT layer.
speak(l(stft))

In [143]:
# Play the residual between the original waveform and its reconstruction.
speak(waveform - l(stft))

In [128]:
# STFT round trip on random noise, calling tf.signal directly.
frame_length, frame_step = 512, 128

waveform = tf.random.normal([4096], dtype=tf.float32)
stft = tf.signal.stft(
    signals=waveform,
    frame_length=frame_length,
    frame_step=frame_step,
    pad_end=False,
)
# The inverse window is built from the same frame_step as the forward transform.
inverse_stft = tf.signal.inverse_stft(
    stfts=stft,
    frame_length=frame_length,
    frame_step=frame_step,
    window_fn=tf.signal.inverse_stft_window_fn(frame_step),
)

In [129]:
# Displays (frames, frequency bins). For 4096 samples with frame_length 512 and
# frame_step 128 the recorded output is (29, 257): 1 + (4096 - 512) // 128 = 29
# frames and 512 // 2 + 1 = 257 bins.
stft.shape

TensorShape([29, 257])

In [118]:
# Play the original noise waveform for reference.
speak(waveform)

In [119]:
# Copy of the reconstruction in which every NaN sample is replaced by zero.
out = tf.where(
    condition=tf.math.is_nan(inverse_stft),
    x=tf.zeros_like(inverse_stft),
    y=inverse_stft,
)

In [120]:
# Play the NaN-cleaned reconstruction.
speak(out)

In [121]:
# Display the original waveform tensor.
waveform

<tf.Tensor: shape=(4096,), dtype=float32, numpy=
array([-1.6118259e+00, -1.3911081e-03,  2.6760474e-01, ...,
        5.9484679e-01, -5.8726847e-01,  1.2521585e+00], dtype=float32)>

In [122]:
# Display the raw reconstruction (before NaN replacement) to compare with waveform.
inverse_stft

<tf.Tensor: shape=(4096,), dtype=float32, numpy=
array([ 0.0000000e+00, -5.2052673e-10,  1.5511622e-06, ...,
        1.7421644e-05, -3.4044533e-06,  4.5438395e-07], dtype=float32)>

In [123]:
# Play the reconstruction error: NaN-cleaned reconstruction minus the original.
speak(out - waveform)

In [92]:
# Residual over the first 1024 samples only. Both operands are sliced to the
# same length: subtracting the full waveform from a 1024-sample slice fails with
# a shape mismatch whenever the waveform is longer than 1024 samples.
speak(out[:1024] - waveform[:1024])

In [73]:
# Zero-pad the reconstruction on the right up to the waveform's length, then
# compare everything except the first sample. The pad amount is derived from
# the waveform instead of a fixed 1024, which produced a negative (invalid)
# padding for the 4096-sample signal.
speak(tf.pad(inverse_stft, tf.constant([[0, waveform.shape[0] - inverse_stft.shape[0]]]))[1:] - waveform[1:])

In [134]:
class InverseSTFTLayer(tf.keras.layers.Layer):
    """Keras layer that converts STFT frames back into a waveform.

    Wraps ``tf.signal.inverse_stft`` using the inverse window that matches a
    forward STFT taken with the same ``frame_step``.

    Args:
        frame_length: Window length, in samples, of the forward STFT.
        frame_step: Hop size, in samples, of the forward STFT.
    """

    def __init__(self, frame_length, frame_step):
        super(InverseSTFTLayer, self).__init__()
        self.frame_length = frame_length
        self.frame_step = frame_step

    def build(self, input_shape):
        """Record the frame count and output length implied by ``input_shape``.

        ``tf.signal.inverse_stft`` takes inputs shaped
        ``[..., frames, fft_unique_bins]``, so the frame count is read from the
        second-to-last axis. Reading axis 0 only worked for unbatched input; for
        batched input it picked up the batch dimension (usually ``None``) and
        the arithmetic below raised.

        Args:
            input_shape: Shape of the STFT input, optionally with leading
                batch dimensions.
        """
        self.num_frames = input_shape[-2]
        if self.num_frames is None:
            # Frame count unknown until run time, so the length is unknown too.
            self.num_outputs = None
        else:
            self.num_outputs = (
                self.frame_length + (self.num_frames - 1) * self.frame_step
            )

    def call(self, input):
        """Return the waveform reconstructed from the STFT frames in ``input``."""
        inverse_stft = tf.signal.inverse_stft(
            input, self.frame_length, self.frame_step,
            window_fn=tf.signal.inverse_stft_window_fn(self.frame_step)
        )
        return inverse_stft

In [None]:
def make_generator_model():
    """Build the generator network.

    Maps a 100-dimensional noise vector to a single-channel sequence of 196
    steps with values in [-1, 1] (tanh). A dense projection is reshaped to
    49 steps x 256 channels and then upsampled by two stride-2 transposed
    convolutions (49 -> 98 -> 196).

    The previous version used strided ``Conv1D`` layers, which shrink the
    sequence (49 -> 25 -> 13), so its shape assertions, including a leftover
    2-D ``(None, 28, 28, 1)`` check, could never pass.

    Returns:
        A ``tf.keras.Sequential`` whose output shape is ``(None, 196, 1)``.
    """
    # NOTE(review): the 196-step output is not yet tied to the length of the
    # padded training waveforms or to an STFT shape; confirm the target size
    # before training against train_dataset.
    model = tf.keras.Sequential()
    model.add(layers.Dense(49*256, use_bias=False, input_shape=(100,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Reshape((49, 256)))
    assert model.output_shape == (None, 49, 256) # Note: None is the batch size

    # Stride 1 with 'same' padding keeps the length and only mixes channels.
    model.add(layers.Conv1DTranspose(128, 5, strides=1, padding='same', use_bias=False))
    assert model.output_shape == (None, 49, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    # With 'same' padding each stride-2 transposed convolution doubles the length.
    model.add(layers.Conv1DTranspose(64, 5, strides=2, padding='same', use_bias=False))
    assert model.output_shape == (None, 98, 64)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Conv1DTranspose(1, 5, strides=2, padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 196, 1)

    return model

In [19]:
from IPython.display import Audio
def speak(sample, rate=8000):
    """Return an auto-playing audio widget for ``sample``.

    Args:
        sample: Audio samples, passed unchanged to ``IPython.display.Audio``.
        rate: Sampling rate in Hz. Defaults to 8000, the value that used to be
            hard-coded here (presumably the dataset's rate; confirm against
            the rate returned by ``wavfile.read``).

    Returns:
        The ``Audio`` display object; as the last expression of a cell it
        renders a player and starts playback.
    """
    return Audio(sample, rate=rate, autoplay=True)

In [None]:
# NOTE(review): `lengths` is not defined in any cell visible in this notebook;
# confirm where it comes from before relying on this cell.
speak(lengths[0])