In [None]:
import numpy as np
import keras
import tensorflow as tf

# Turn on eager execution: the training cell further down calls the model
# directly and records the forward pass with tf.GradientTape.
tf.enable_eager_execution()

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda
from tensorflow import set_random_seed

### Definition of the model
Define your model here. As an example, the code below builds a VAE (variational autoencoder) model.

In [None]:
# Fix TensorFlow's global random seed so that weight initialisation is repeatable.
tf.set_random_seed(0)

def sampling(args):
    """Draw a latent sample with the reparameterisation trick.

    Args:
        args: pair ``(z_mean, z_logsigma)``. The second entry is halved
            before exponentiation, i.e. it is treated as a log-variance.

    Returns:
        ``z_mean + exp(0.5 * z_logsigma) * epsilon`` where ``epsilon`` is
        standard-normal noise with the same shape as ``z_mean``.
    """
    mean, log_var = args
    # Re-seeding on every call makes the drawn noise repeatable between runs.
    tf.set_random_seed(0)
    noise = tf.random_normal(shape=tf.shape(mean))
    std_dev = tf.exp(.5 * log_var)
    return mean + std_dev * noise

def total_vae_loss(x, x_pred, mu, logsigma, kl_weight=5e-3):
    """Compute the VAE loss terms.

    Args:
        x: original inputs.
        x_pred: reconstruction produced by the model.
        mu: latent means.
        logsigma: latent log-variance term (it enters the KL term as
            ``exp(logsigma) - logsigma``).
        kl_weight: multiplier applied to the KL term before it is added to
            the reconstruction error.

    Returns:
        dict with keys ``'kl_loss'`` (summed over axis 1 only, so one value
        per row), ``'rc_loss'`` (mean squared error over all elements) and
        ``'total_vae_loss'`` (``kl_weight * kl_loss + rc_loss``).
    """
    kl_per_row = 0.5 * tf.reduce_sum(
        tf.exp(logsigma) + tf.square(mu) - 1 - logsigma,
        axis=1,
    )
    mse = tf.reduce_mean((x - x_pred) ** 2)
    combined = kl_weight * kl_per_row + mse

    return {
        'kl_loss': kl_per_row,
        'rc_loss': mse,
        'total_vae_loss': combined,
    }

# Build the VAE as a functional Keras model.
# NOTE(review): n_x, l_1, n_z and batch_size are not defined in the cells shown
# here; they must be set before this cell is run.

# Encoder: one hidden ReLU layer feeding two linear heads (mean and log-variance).
inputs = Input(shape = n_x, batch_size = batch_size)
a_1 = Dense(units = l_1, activation = 'relu')(inputs)
z_mean = Dense(units = n_z)(a_1)
z_logsigma = Dense(units = n_z)(a_1)

# Latent sample drawn via the reparameterisation trick implemented in `sampling`.
sampled_vector = Lambda(sampling)([z_mean, z_logsigma])

# Decoder: a single linear layer mapping the latent sample back to n_x units.
z_2 = Dense(units = n_x)(sampled_vector)

# The latent statistics are exposed as outputs because the loss needs them.
model = Model(inputs = inputs, outputs = [z_2, z_mean, z_logsigma])

# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
model.summary()

Train the model for a single step (first epoch, first batch).

In [None]:
# Run one Adam optimisation step on `train_inputs`.
# NOTE(review): `alpha` and `train_inputs` are not defined in the cells shown
# here; they must be set before this cell is run.
optimizer = tf.train.AdamOptimizer(learning_rate = alpha) # Initialize optimizer
tf.set_random_seed(0)

# Only the forward pass and the loss have to be recorded on the tape.
with tf.GradientTape() as tape:
    z_2, z_mean, z_logsigma = model(train_inputs) # Forward pass
    losses = total_vae_loss(train_inputs, z_2, z_mean, z_logsigma) # Compute loss

# Differentiate and update outside the recording context, and do so against
# model.trainable_variables: that is the list the optimizer slots are looked up
# with later on, and it excludes any non-trainable weights.
grads = tape.gradient(losses['total_vae_loss'], model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
# Pick the gradient of the last parameter tensor (the final entry of `grads`),
# which belongs to the model's last layer.
grad_last_bias = grads[-1]

# Obtain the moments (m and v) that are part of the Adam update rule:
def first_moment_update(previous_moment, grad, beta, timestep, bias_correction=True):
    """Compute Adam's first-moment estimate (running mean of the gradient).

    Args:
        previous_moment: biased first moment from the previous step
            (0 for the very first step).
        grad: current gradient.
        beta: exponential decay rate of the first moment (beta_1).
        timestep: 1-based index of the current update step.
        bias_correction: when True (default) return the bias-corrected
            estimate ``m_hat = m / (1 - beta**timestep)``. When False return
            the raw running average ``m``; TensorFlow's Adam stores this
            uncorrected value in its ``'m'`` slot and applies the correction
            through the step size instead.

    Returns:
        The first-moment estimate, with the same shape as ``grad``.

    Raises:
        ValueError: if ``timestep`` is smaller than 1, for which the
            correction factor ``1 - beta**timestep`` would be zero or negative.
    """
    if np.any(np.asarray(timestep) < 1):
        raise ValueError("timestep must be >= 1, got {!r}".format(timestep))

    biased = beta * previous_moment + (1 - beta) * grad
    if not bias_correction:
        return biased
    return biased / (1 - np.power(beta, timestep))

def second_moment_update(previous_moment, grad, beta, timestep, bias_correction=True):
    """Compute Adam's second-moment estimate (running mean of the squared gradient).

    Args:
        previous_moment: biased second moment from the previous step
            (0 for the very first step).
        grad: current gradient; it is squared element-wise.
        beta: exponential decay rate of the second moment (beta_2).
        timestep: 1-based index of the current update step.
        bias_correction: when True (default) return the bias-corrected
            estimate ``v_hat = v / (1 - beta**timestep)``. When False return
            the raw running average ``v``; TensorFlow's Adam stores this
            uncorrected value in its ``'v'`` slot and applies the correction
            through the step size instead.

    Returns:
        The second-moment estimate, with the same shape as ``grad``.

    Raises:
        ValueError: if ``timestep`` is smaller than 1, for which the
            correction factor ``1 - beta**timestep`` would be zero or negative.
    """
    if np.any(np.asarray(timestep) < 1):
        raise ValueError("timestep must be >= 1, got {!r}".format(timestep))

    biased = beta * previous_moment + (1 - beta) * np.square(grad)
    if not bias_correction:
        return biased
    return biased / (1 - np.power(beta, timestep))

# Manual Adam moments for the very first step (previous moments are 0, t = 1).
# The decay rates 0.9 and 0.999 are the defaults recommended in the Adam paper,
# which are also the defaults used by TensorFlow.
manual_m = first_moment_update(0, grad_last_bias, 0.9, 1)
manual_v = second_moment_update(0, grad_last_bias, 0.999, 1)
manual_m, manual_v

In [None]:
# Read the m and v values that TensorFlow's optimizer keeps for the same
# variable (the one whose gradient is the last entry of `grads`).
last_variable = model.trainable_variables[len(grads) - 1]
optimizer.get_slot(last_variable, 'm'), optimizer.get_slot(last_variable, 'v')

In [None]:
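# Are the manually calculated m and v the same as the values TensorFlow stores?
# Note: according to the tf.train.AdamOptimizer documentation, the 'm' and 'v'
# slots hold the moments *without* bias correction (the correction is folded
# into the learning rate), so they are not expected to match bias-corrected values.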