# Discrete frame-level variational autoencoders

Ryan Eloff, ryan.peter.eloff@gmail.com, February 2019.

Based on code from the following paper:
- H. Kamper, "Truly unsupervised acoustic word embeddings using weak top-down constraints in encoder-decoder models," *arXiv preprint arXiv:1811.00403*, 2018. [[arXiv](https://arxiv.org/abs/1811.00403)][[code](https://github.com/kamperh/recipe_bucktsong_awe)]

### Preamble

Re-load all modules before executing Python cells, and use the matplotlib 'inline' backend:

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Imports

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import os
import sys

In [4]:
import numpy as np
import tensorflow as tf

In [5]:
sys.path.append(os.path.join("..", "src"))
sys.path.append(os.path.join("..", "embeddings"))

In [6]:
from tf_models import ae
from tf_models import rnn
from tf_models import TF_FLOAT_DTYPE
from tf_models import TF_INT_DTYPE

In [7]:
# Scratch directory for the model checkpoints written by later cells.
output_dir = "tmp"
try:
    os.makedirs(output_dir)
except OSError:
    # An already-existing directory is fine; anything else (e.g. permissions,
    # or a regular file occupying the path) is a real error and is re-raised.
    if not os.path.isdir(output_dir):
        raise

### Load data

In [8]:
# TODO(rpeloff) buckeye from zerospeech2015, and/or english data from zerospeech2019? Extracted on sheldon?

# ...
# Per-frame feature dimensionality; used below as the last dimension of the `x_input` placeholder.
# NOTE(review): 13 presumably matches the acoustic features to be loaded (e.g. MFCCs) -- confirm once
# the data loading above is implemented.
dim_frame = 13

### Many-to-many sequence-to-sequence model with frame-level latent transformation (autoencoder)

In [9]:
# TODO(rpeloff): should rename this to `def build_the_thing_with_too_many_words(...)`
def build_multi_layer_many_to_many_with_latent(
        x_input, x_lengths, encoder_hidden_units, decoder_hidden_units, latent_transform, latent_kwargs,
        rnn_cell="lstm", rnn_cell_kwargs=None, keep_prob=1., decoder_conditioning=None):
    """Build a multi-layer many-to-many sequence-to-sequence model with a per-frame latent layer.

    The graph is: RNN encoder -> latent transform applied to every frame -> (optional concatenation
    with a conditioning tensor) -> RNN decoder -> linear layer mapping each decoder frame back to
    the feature dimensionality of `x_input`.

    Parameters
    ----------
    x_input :
        Input sequences; converted with `tf.convert_to_tensor`. The size of the last axis is taken
        as the feature dimensionality (presumably shaped [batch, time, features] -- confirm
        against `rnn.build_multi_layer_rnn`).
    x_lengths :
        Sequence lengths; the maximum is used as the time dimension when un-flattening frames.
    encoder_hidden_units, decoder_hidden_units : list of int
        Passed to `rnn.build_multi_layer_rnn`; the last entry of each is used as the width of the
        corresponding RNN output when flattening it to one row per frame.
    latent_transform : callable
        Called as `latent_transform(flattened_encoder_output, **latent_kwargs)`; must return a
        dict containing at least the key "y" (the latent layer's reconstruction output).
    latent_kwargs : dict
        Keyword arguments forwarded to `latent_transform`.
    rnn_cell, rnn_cell_kwargs, keep_prob :
        Forwarded unchanged to `rnn.build_multi_layer_rnn` for both the encoder and the decoder.
    decoder_conditioning : tensor, optional
        If given, it is tiled along time and concatenated to the latent output before decoding.
        The tiling assumes a 2-D [batch, dim] tensor -- TODO confirm with callers.

    Returns
    -------
    dict
        With keys "encoder_output", "latent_output" (the full dict returned by `latent_transform`),
        "decoder_output" (linear output, multiplied by the mask), "mask" and "max_length".
    """
    # Longest sequence in the batch; used as the time axis in the reshapes below.
    max_length = tf.math.reduce_max(x_lengths)
    inputs = tf.convert_to_tensor(x_input)
    dim_features = inputs.get_shape().as_list()[-1]

    # Multi-layer RNN encoder (final states are not needed here).
    encoder_output, _ = rnn.build_multi_layer_rnn(
        inputs, x_lengths, encoder_hidden_units, rnn_cell, rnn_cell_kwargs, keep_prob, scope="rnn_encoder")

    # Latent transformation: flatten to one row per frame so the transform sees individual frames.
    dim_encoder = encoder_hidden_units[-1]
    encoder_frames = tf.reshape(encoder_output, [-1, dim_encoder])
    latent_output = latent_transform(encoder_frames, **latent_kwargs)
    latent_recon = latent_output["y"]
    dim_latent_recon = latent_recon.get_shape().as_list()[-1]
    # Restore the time axis.
    decoder_input = tf.reshape(latent_recon, [-1, max_length, dim_latent_recon])

    # Optional decoder conditioning: repeat the conditioning tensor for every frame and append it.
    if decoder_conditioning is not None:
        dim_conditioning = decoder_conditioning.get_shape().as_list()[-1]
        repeated = tf.tile(decoder_conditioning, [1, max_length])
        repeated = tf.reshape(repeated, [-1, max_length, dim_conditioning])
        decoder_input = tf.concat([decoder_input, repeated], axis=-1)

    # Multi-layer RNN decoder (final states are not needed here).
    decoder_output, _ = rnn.build_multi_layer_rnn(
        decoder_input, x_lengths, decoder_hidden_units, rnn_cell, rnn_cell_kwargs, keep_prob, scope="rnn_decoder")

    # 1.0 for frames where the decoder emitted any non-zero value, 0.0 for all-zero frames.
    # NOTE(review): presumably the RNN zeroes its output past each sequence's length, making this
    # a padding mask -- confirm in `rnn.build_multi_layer_rnn`.
    frame_peak = tf.reduce_max(tf.abs(decoder_output), 2)
    mask = tf.sign(frame_peak)

    # Final linear transformation back to the input feature dimensionality, applied per frame.
    with tf.variable_scope("rnn_decoder/linear_output"):
        projected = tf.reshape(decoder_output, [-1, decoder_hidden_units[-1]])
        projected = tf.keras.layers.Dense(units=dim_features)(projected)
        projected = tf.reshape(projected, [-1, max_length, dim_features])
        # The dense layer's bias makes masked frames non-zero again, so re-apply the mask.
        decoder_output = projected * tf.expand_dims(mask, -1)

    outputs = {}
    outputs["encoder_output"] = encoder_output
    outputs["latent_output"] = latent_output
    outputs["decoder_output"] = decoder_output
    outputs["mask"] = mask
    outputs["max_length"] = max_length
    return outputs


### Train with vanilla autoencoder

In [10]:
# Start from an empty graph so that re-running this cell does not duplicate variables.
tf.reset_default_graph()

# Training parameters
# -------------------
learning_rate = 0.001
n_epochs = 20
n_val_interval = 1
batch_size = 300
n_buckets = 3

# Model parameters
# ----------------
dim_features = dim_frame
encoder_hidden_units = [10]
decoder_hidden_units = [10]  # [400 + d_speaker_embedding, 400, 400]
latent_transform = ae.build_autoencoder
latent_kwargs = {
    "encoder_hidden_units": [],
    "z_units": 130,
    "decoder_hidden_units": [],  # [decoder_hidden_units[0]],
    "activation": "relu"
    }
rnn_type = "gru"  # "lstm", "gru" or "rnn"


# Many-to-many autoencoder model
# ------------------------------
x_input = tf.placeholder(TF_FLOAT_DTYPE, [None, None, dim_features], name="x_input")
x_lengths = tf.placeholder(TF_INT_DTYPE, [None], name="x_lengths")

# Use the configured `rnn_type` rather than a hard-coded cell type, so that the setting in the
# "Model parameters" section above actually takes effect.
network_dict = build_multi_layer_many_to_many_with_latent(
    x_input, x_lengths, encoder_hidden_units, decoder_hidden_units, latent_transform, latent_kwargs,
    rnn_cell=rnn_type, rnn_cell_kwargs=None, keep_prob=1., decoder_conditioning=None)

encoder_output = network_dict["encoder_output"]
max_length = network_dict["max_length"]
latent_output = network_dict["latent_output"]
# Latent codes are produced one row per frame; restore the time axis.
z_latent = tf.reshape(latent_output["z"], [-1, max_length, latent_kwargs["z_units"]])
# The builder has already multiplied `decoder_output` by `mask` (which holds only 0s and 1s),
# so masking it a second time here would be a no-op and is omitted.
decoder_output = network_dict["decoder_output"]
mask = network_dict["mask"]

# Reconstruction loss
# https://danijar.com/variable-sequence-lengths-in-tensorflow/
# Per sequence: squared error averaged over features, summed over time and divided by the number
# of unmasked frames; the result is then averaged over the batch.
loss = tf.reduce_mean(
    tf.reduce_sum(tf.reduce_mean(tf.square(x_input - decoder_output), -1), -1) / tf.reduce_sum(mask, 1))

optimiser = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Model storage
# -------------
intermediate_model_file = os.path.join(output_dir, "many_to_many_ae.tmp.ckpt")
model_file = os.path.join(output_dir, "many_to_many_ae.ckpt")


In [11]:
from IPython.display import clear_output, Image, display, HTML
import numpy as np    

def strip_consts(graph_def, max_const_size=32):
    """Return a copy of `graph_def` with large constant payloads replaced by a short placeholder.

    Every node is copied into a new `tf.GraphDef`. For nodes whose op is 'Const', if the serialized
    `tensor_content` is longer than `max_const_size` bytes it is replaced by the text
    "<stripped N bytes>", where N is the original length.

    Parameters
    ----------
    graph_def :
        Graph definition whose `node` entries are copied.
    max_const_size : int
        Largest `tensor_content` length (in bytes) that is kept as-is.

    Returns
    -------
    tf.GraphDef
        The stripped copy; `graph_def` itself is not modified.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # `tensor_content` is a protobuf bytes field: on Python 3 assigning a `str`
                # raises TypeError, so encode the placeholder explicitly (harmless on Python 2).
                tensor.tensor_content = ("<stripped %d bytes>" % size).encode("ascii")
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Display a TensorFlow graph inline in the notebook.

    `graph_def` may be a graph definition or any object exposing `as_graph_def()` (in which case
    that method is called first). Large constants are stripped with `strip_consts` using
    `max_const_size`, and the text form of the result is embedded in an HTML page that is shown
    inside an iframe via IPython's `display(HTML(...))`.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    # Text form of the graph, quoted by `repr` so it can be dropped into the script below.
    pbtxt_literal = repr(str(stripped))
    # Random suffix so that several rendered graphs get distinct element ids.
    element_id = 'graph' + str(np.random.rand())

    page_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    page = page_template.format(data=pbtxt_literal, id=element_id)

    frame_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # Double quotes must be escaped because the page is placed inside the `srcdoc` attribute.
    escaped_page = page.replace('"', '&quot;')
    display(HTML(frame_template.format(escaped_page)))

In [12]:
# Render the current default graph (the model built in the cells above) inline in the notebook.
show_graph(tf.get_default_graph())