# Set Up Imports

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Conv2D, Conv2DTranspose, Reshape, Dense, Input, Flatten, Lambda
from keras.losses import binary_crossentropy
from keras import Model
from keras import backend as K
import numpy as np
from tqdm import tqdm
print(f"Using Tensorflow v{tf.__version__}")

Using Tensorflow v2.10.0


# Encoder

In [5]:
#  Set up the encoder input
encoder_input_shape = (1,256,1)
encoder_input = Input(shape=encoder_input_shape, name="encoder_input")
# First convolution layers
encoder_conv2D_1 = Conv2D(
    filters=16,
    kernel_size=(4,1),
    strides=(1,2),
    padding="same",
    activation="relu",
    name="encoder_conv2d_1"
)(encoder_input)

# Second convolution layer
encoder_conv2D_2 = Conv2D(
    filters=16,
    kernel_size=(4,1),
    strides=(1,2),
    padding="same",
    activation="relu",
    name="encoder_conv2d_2"
)(encoder_conv2D_1)

# Reshape 
encoder_output = Reshape((1,-1), name="encoder_reshape")(encoder_conv2D_2)

# Dense
# encoder_output = Dense(16, name="mu")(encoder_output)

encoder = Model(encoder_input, encoder_output, name="encoder")

# Decoder

In [4]:
decoder_input_shape = (1,16,1)
decoder_input = Input(shape=decoder_input_shape, name="decoder_input")
decoder_dense = Dense(128, name='decoder_dense_1')(decoder_input)
decoder_reshape = Reshape((1,64, -1), name="decoder_reshape")(decoder_dense)
decoder_conv2dT_1 = Conv2DTranspose(
    filters=16,
    kernel_size=(4,1),
    strides=(1,2),
    padding="same",
    activation="relu",
    name="decoder_conv2dT_1"
)(decoder_reshape)
decoder_output = Conv2DTranspose(
    filters=1,
    kernel_size=(4,1),
    strides=(1,2),
    padding="same",
    activation="sigmoid",
    name="decoder_output"
)(decoder_conv2dT_1)
decoder = Model(decoder_input, decoder_output, name="decoder")



In [50]:
decoder.ev

<bound method Model._should_eval of <keras.engine.functional.Functional object at 0x28de36560>>

# Variational Autoencoder

In [33]:
class VAE(Model):
    wave_size = 256
    variational_beta = 0.9
    def __init__(self, encoder, decoder, encoder_input, **kwargs):
        super().__init__(**kwargs)
        self.training = True
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.recon_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
    
    def summary(self):
        self.encoder.summary()
        self.decoder.summary()
    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.recon_loss_tracker,
            self.kl_loss_tracker,
        ]
    def sample_normal_point(self, mu, log_var):
        if self.training:
            epsilon = K.random_normal(shape=K.shape(mu), mean=0., stddev=1.)
            sampled_point = mu + K.exp(log_var/2) * epsilon
            return sampled_point
        else:
            return mu

    def train_step(self, data):
        with tf.GradientTape() as tape:
            # Generate the reconstructed data
            self.mu =  Dense(16, name="mu")(self.encoder(data))
            self.log_var = Dense(16, name="log_var")(self.encoder(data))
            sampled_point = self.sample_normal_point(self.mu, self.log_var)
            recon_x = self.decoder(sampled_point)
            # Generate the reconstruction loss
            recon_loss = binary_crossentropy(tf.reshape(data, (-1,self.wave_size)), 
                                                        tf.reshape(recon_x, (-1,self.wave_size)))
            recon_loss = tf.reduce_mean(tf.reduce_sum(recon_loss))
            
            # Generate the kl divergence
            kl_loss = -0.5 * (1 + self.log_var - tf.square(self.mu) - tf.exp(self.log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss))

            # Total variation loss
            tv_loss =  tf.reduce_sum(tf.square(recon_x[:,:,:-1] - recon_x[:,:,1:]))

            # Calculate the total loss
            total_loss = recon_loss + self.variational_beta * kl_loss + 100.01 * tv_loss

        # Generate the gradients
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.recon_loss_tracker.update_state(recon_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        
        return {
            "total_loss": self.total_loss_tracker.result(),
            "recon_loss": self.recon_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result()
        }

# Load Dataset and Normalize

In [20]:
def normalize(tensor):
    tensor_minusmin = tensor - tensor.min()
    tensor_maxminusmin = tensor.max() - tensor.min()
    return tensor_minusmin / tensor_maxminusmin

def get_waves(file_name):
    # Read the raw audio from the .WAV file
    raw_audio = tf.io.read_file(filename=file_name)
    # Convert the raw audio to a waveform
    wave_bank, sample_rate = tf.audio.decode_wav(raw_audio)
    # Display the wavebank and sample_rate
    # display(wave_bank, sample_rate)
    wave_size = 256
    num_waves = 0
    waves = []
    wave_forms = np.transpose(wave_bank.numpy())
    for i in range(64):
        wave_form = wave_forms[0, i*wave_size:i*wave_size+wave_size]
        if wave_form.max() != wave_form.min():
            wave_form = normalize(wave_form)
            waves.append(wave_form)
            num_waves += 1    
    waves = tf.stack(waves)
    return waves

In [21]:
waves = get_waves("./audio_data/ENVELO01.WAV")
display(waves.shape)
waves = tf.expand_dims(waves,1)
waves = tf.expand_dims(waves,1)
waves = tf.expand_dims(waves,-1)
display(waves.shape)

TensorShape([64, 256])

TensorShape([64, 1, 1, 256, 1])

# Initialize the VAE

In [34]:
vae = VAE(encoder=encoder, decoder=decoder, encoder_input=encoder_input)
vae.compile(optimizer=keras.optimizers.Adam())
vae.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_input (InputLayer)  [(None, 1, 256, 1)]       0         
                                                                 
 encoder_conv2d_1 (Conv2D)   (None, 1, 128, 16)        80        
                                                                 
 encoder_conv2d_2 (Conv2D)   (None, 1, 64, 16)         1040      
                                                                 
 encoder_reshape (Reshape)   (None, 1, 1024)           0         
                                                                 
Total params: 1,120
Trainable params: 1,120
Non-trainable params: 0
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 decoder_input (InputLayer)  [(None, 1, 

# VAE training

In [51]:
num_epochs = 10
for j in tqdm(range(num_epochs)):
    for i in waves:
        loss = vae.train_step(i)

100%|██████████| 10/10 [00:33<00:00,  3.39s/it]


# Save the model

In [52]:
vae.decoder.save("./keras_vae.h5")

