In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
import pandas as pd

from keras.models import Model
from keras.layers import Input, Dense, Flatten, Reshape, Layer, TimeDistributed, Concatenate, Lambda
from keras.datasets import mnist
from keras.utils import to_categorical

import keras.backend as K

from vae_utils import *

Using TensorFlow backend.


In [3]:
os.chdir('./data/iwae/')

# Data

In [4]:
def preprocess(data):
    x, y = data
    x = x.reshape((len(x), 28, 28))
    x = x/255.
    #y = to_categorical(y, 10) #don't need to categorise y
    return x, y

train, test = mnist.load_data()

x_train, y_train = preprocess(train)
x_test, y_test = preprocess(test)

print(x_train.shape)

(60000, 28, 28)


# Custom Layers

In [5]:
# we need a layer that samples a latent variable given a mean and standard deviation

class Sampler(Layer):
    
    def __init__(self, always_sample=False, **kwargs):
        self.stddev = 1
        self.always_sample = always_sample
        super(Sampler, self).__init__(**kwargs)
    
    def call(self, x, training=None):
        assert isinstance(x, list)
        z_mean, log_z_var = x
        z_std = K.exp(log_z_var/2)
        
        # sample epsilon from N(0, stddev)
        shape = K.shape(z_std)
        epsilon = K.random_normal(shape, mean=0, stddev=self.stddev)
        z_sample = z_mean + z_std * epsilon
        
        if self.always_sample:
            return z_sample
        else:
            return K.in_train_phase(z_sample, z_mean, training=training)
    
    def compute_output_shape(self, input_shape):
        assert isinstance(input_shape, list)
        assert input_shape[0] == input_shape[1]
        return input_shape[0]

# Model 1

In [6]:
def model_1(k, latent_dim=100, hidden_dim=200, always_sample=False):
    """Define model 1 as the single stochastic model in [1]
    
    k: int
        Number of k samples in the IWAE model, or the number of MC samples
        for the VAE model.
        
    latent_dim: int, optional
        Dimensionality of the latent space, default = 100.
        
    hidden_dim: int, optional
        Number of hidden units for the dense layers.
        
    always_sample: boolean, optional
        Whether to always sample from the posterior distribution, or only
        during training. Default = False.
        
    References
    ----------
    
    [1] Burda Y, Grosse R, Salakhutdinov R. Importance weighted autoencoders. 
        arXiv preprint arXiv:1509.00519. 2015 Sep 1.
    """
    # encoder shared layers
    enc_hid_1 = Dense(hidden_dim, activation='tanh', name='enc_1_hidden_1')
    enc_hid_2 = Dense(hidden_dim, activation='tanh', name='enc_1_hidden_2')
    z_mean = Dense(latent_dim, name='enc_1_latent_mean')
    log_z_var = Dense(latent_dim, name='enc_1_log_latent_var')
    sampler = Sampler(always_sample, name='z1_sampler')
    
    # decoder shared layers
    dec_hid_1 = Dense(hidden_dim, activation='tanh', name='dec_1_hidden_1')
    dec_hid_2 = Dense(hidden_dim, activation='tanh', name='dec_1_hidden_2')
    bernoulli_mean = Dense(28*28, activation='sigmoid', name='dec_1_mean')
    reshape = Reshape((28, 28), name='dec_1_output')
    
    # single pass model
    x = Input(shape=(28, 28), name='enc_1_input')
    y = Flatten(name='enc_1_flatten')(x)
    y = enc_hid_1(y)
    y = enc_hid_2(y)
    mu = z_mean(y)
    log_var = log_z_var(y)
    z1 = sampler([z_mean(y), log_z_var(y)])
    y = dec_hid_1(z1)
    y = dec_hid_2(y)
    y = bernoulli_mean(y)
    y = reshape(y)
    
    model = Model(x, y, name='model_1')
    
    # k forward passes - start from first sampling layer
    k_z1 = [sampler([mu, log_var]) for i in range(k)]
    k_y = [dec_hid_1(z1) for z1 in k_z1]
    k_y = [dec_hid_2(y) for y in k_y]
    k_y = [bernoulli_mean(y) for y in k_y]
    k_y = [reshape(y) for y in k_y]
    
    return model, mu, log_var, k_z1, k_y

## Training model 1

### Model 1 trainer

In [7]:
def train_model_1(k, latent_dim=100, epochs=50, batch_size=512, train='both'):
    """Trains the first model defined in [1].
    
    Parameters
    ----------
    
    k: int
        Number of forward passes for the IWAE model, and number of MC samples
        in the VAE model
        
    latent_dim: int, optional
        Dimensionality of the latent space, default = 100.
        
    epochs: int, optional
        Number of epochs to train over, default=50.
        
    batch_size: int, optional
        Batch size of training sample, default=512.
    
    train: string, optional
        Indicates which model to train, either 'iwae', 'vae', or 'both'. Default = 'both'
        
    Returns
    -------
    
    model: keras model or tuple
        If train is either 'iwae' or 'vae', returns a single trained keras model. Otherwise
        returns a tuple of keras models, where model=(iwae_model, vae_model).
        
    hist: keras history or tuple
        If train is either 'iwae' or 'vae', returns a single history object containing the
        training history of the model. Otherwise returns a tuple of history objects.
        
    References
    ----------
    
    [1] Burda Y, Grosse R, Salakhutdinov R. Importance weighted autoencoders. 
        arXiv preprint arXiv:1509.00519. 2015 Sep 1.
    """
    out = []
    hists = []
    
    if train.lower() == 'iwae' or train.lower() == 'both':
        # IWAE training
        print('Training IWAE model')
        print('-------------------')

        iwae_model, z_mean, log_z_var, z1_samples, out_samples = model_1(k, latent_dim)
        
        def iwae_loss(y_true, y_pred):
            # calculate log distributions (log_p_x_y := p(x|y))
            log_weights = []
            for i in range(k):
                log_q_h1_x = -0.5 * K.sum(log_z_var + K.exp(-log_z_var)*(z1_samples[i] - z_mean)**2, axis=-1)
                log_p_h1 = -0.5 * K.sum(K.square(z1_samples[i]), axis=-1)
                log_p_x_h1 = -K.sum(K.binary_crossentropy(y_true, out_samples[i]), axis=(-1, -2))

                # calculate weights
                log_weight = log_p_x_h1 + log_p_h1 - log_q_h1_x
                log_weights.append(log_weight)
                
            max_log_weights = K.max(log_weights, axis=0, keepdims=True) # for numerical stability
            weights = K.exp(log_weights - max_log_weights)
            elbo = 1/k * K.sum(weights, axis=0)
            elbo = K.log(elbo) + max_log_weights
            
            # uncomment to add second order term to the elbo
            #v_sqr = K.square(log_weights)
            #v_sqr = 1/k * K.sum(v_sqr, axis=0)
            #second_ord = -1/2 * (v_sqr - elbo**2)
            #elbo = elbo + second_ord
            
            loss = -elbo

            return loss
        
        iwae_model.compile(optimizer='adam', loss=iwae_loss)
        hist = iwae_model.fit(x_train, x_train, batch_size=batch_size, epochs=epochs)
        
        out.append(iwae_model)
        hists.append(hist)

        model_path = './iwae_model_1_k_%d_dim_%d.weights' %(k, latent_dim)
        iwae_model.save_weights(model_path)
        
        
    if train.lower() == 'vae' or train.lower() == 'both':
        # VAE training
        if train.lower() == 'both':
            print('\n')
        print('Training VAE model')
        print('------------------')
        
        vae_model, z_mean, log_z_var, z1_samples, out_samples = model_1(k, latent_dim)
        
        def vae_loss(y_true, y_pred):
            loss = 0
            elbos = []
            for i in range(k):
                log_q_h1_x = -0.5 * K.sum(log_z_var + K.exp(-log_z_var)*(z1_samples[i] - z_mean)**2, axis=-1)
                log_p_h1 = -0.5 * K.sum(K.square(z1_samples[i]), axis=-1)
                log_p_x_h1 = -K.sum(K.binary_crossentropy(y_true, out_samples[i]), axis=(-1, -2))

                elbo = log_p_x_h1 + log_p_h1 - log_q_h1_x
                elbos.append(elbo)
                loss -= elbo
            
            loss = loss / k
            
            # uncomment to see what happens when we add the second order term to the vae loss
            #elbo = 1/k * K.sum(elbos, axis=0)
            #v_sqr = K.square(elbos)
            #v_sqr = 1/k * K.sum(v_sqr, axis=0)
            #second_ord = -1/2 * (v_sqr - elbo**2)
            #elbo = elbo + second_ord
            
            #loss = -elbo
            
            return loss

        vae_model.compile(optimizer='adam', loss=vae_loss)
        hist = vae_model.fit(x_train, x_train, batch_size=batch_size, epochs=epochs)
        
        out.append(vae_model)
        hists.append(hist)

        model_path = './vae_model_1_k_%d_dim_%d.weights' %(k, latent_dim)
        vae_model.save_weights(model_path)
    
    
    if train.lower() not in ['both', 'vae', 'iwae']:
        print('Set parameter train to "both", "vae", or "iwae".')
        return
    
    if train.lower() == 'both':
        return out, hists
    return out[0], hists[0]

In [8]:
# batch trainer

def train_for_ks(ks, latent_dim=100, train='both'):
    """Trains a model for each k in a list of ks."""
    ks = np.asarray(ks)
    print('TRAINING')
    for k in ks:
        print('\n')
        print('-------------------')
        print('k = %d' %k)
        print('latent_dim = %d' %latent_dim)
        print('-------------------\n')
        train_model_1(k, latent_dim, train=train)
        print('\n\n')

In [9]:
#ks = [1, 5, 10, 20, 30, 40, 50]
ks = [1, 5, 10, 20, 30, 40, 50]

In [10]:
# uncomment to train all models (will take long!)
#train_for_ks(ks, train='iwae')

## Testing Model 1

### IWAE

In [15]:
# load the iwae model and get relevant predictions

k = 100

load_k = 50
latent_dim = 100

iwae_path = 'iwae_model_1_k_%d_dim_%d.weights' %(load_k, latent_dim)
    
# load model
iwae_model, z_mean, log_z_var, z1_samples, out_samples = model_1(k, latent_dim, always_sample=True)
iwae_model.load_weights(iwae_path, by_name=True)

k_model = Model(iwae_model.input, [*out_samples, *z1_samples])
enc_model = Model(iwae_model.input, [z_mean, log_z_var])

#iwae_model.compile(optimizer='adam', loss=iwae_loss)

k_outs = k_model.predict(x_test, batch_size=512)
x_preds, z1_samples = k_outs[:k], k_outs[k:]
z_mean, log_z_var = enc_model.predict(x_test, batch_size=512)

In [16]:
def iwae_elbo(y_true, y_pred):
    # calculate log distributions (log_p_x_y := p(x|y))
    elbo = []
    log_weights = []
    for i in range(k):
        log_q_h1_x = -0.5 * np.sum(log_z_var + np.exp(-log_z_var)*(z1_samples[i] - z_mean)**2, axis=-1)
        log_p_h1 = -0.5 * np.sum(z1_samples[i]**2, axis=-1)
        log_p_x_h1 = np.sum(y_true * np.log(y_pred[i]) + (1 - y_true) * np.log(1 - y_pred[i]), axis=(-1, -2))

        # calculate weights
        log_weight = log_p_x_h1 + log_p_h1 - log_q_h1_x
        log_weights.append(log_weight)
    
    weights = np.exp(log_weights - np.max(log_weights, axis=1, keepdims=True))
    weights = weights / sum(weights, axis=0)
    var = np.var(log_weights, axis=0, ddof=1)
    elbo = np.sum(weights * log_weights, axis=0)
    
    #max_log_weights = np.max(log_weights, axis=0, keepdims=True)
    #weights = np.exp(log_weights - max_log_weights)
    #elbo = 1/k * np.sum(weights, axis=0)
    #elbo = np.log(elbo) + max_log_weights
    #loss = -elbo

    return elbo, var

In [17]:
elbo, var = iwae_elbo(x_test, x_preds)

In [18]:
elbo, mean(elbo), var, mean(var)

(array([ -74.74732825, -117.35784069,  -49.16803788, ...,  -83.09872852,
        -110.29914138, -134.28125874]),
 -98.95751206189958,
 array([ 47.08803697, 210.73754853,  32.39302402, ...,  33.69196127,
         89.72099894, 156.11066039]),
 95.85488116086499)

In [16]:
# number of active units
A_u = np.var(z_mean, axis=0)
sum(log(A_u) >= -2)

19

### VAE

In [19]:
# load the iwae model and get relevant predictions

k = 100

load_k = 50
latent_dim = 100

vae_path = 'vae_model_1_k_%d_dim_%d.weights' %(load_k, latent_dim)
    
# load model
vae_model, z_mean, log_z_var, z1_samples, out_samples = model_1(k, latent_dim, always_sample=True)
vae_model.load_weights(vae_path, by_name=True)

k_model = Model(vae_model.input, [*out_samples, *z1_samples])
enc_model = Model(vae_model.input, [z_mean, log_z_var])

#vae_model.compile(optimizer='adam', loss=vae_loss)

k_outs = k_model.predict(x_test, batch_size=512)
x_preds, z1_samples = k_outs[:k], k_outs[k:]
z_mean, log_z_var = enc_model.predict(x_test, batch_size=512)

In [20]:
def vae_elbo(y_true, y_pred):
    # calculate log distributions (log_p_x_y := p(x|y))
    elbos = []
    for i in range(k):
        log_q_h1_x = -0.5 * np.sum(log_z_var + np.exp(-log_z_var)*(z1_samples[i] - z_mean)**2, axis=-1)
        log_p_h1 = -0.5 * np.sum(z1_samples[i]**2, axis=-1)
        log_p_x_h1 = np.sum(y_true * np.log(y_pred[i]) + (1 - y_true) * np.log(1 - y_pred[i]), axis=(-1, -2))
        
        elbos.append(log_p_x_h1 + log_p_h1 - log_q_h1_x)
    
    var = np.var(elbos, axis=0, ddof=1)
    elbo = np.mean(elbos, axis=0)

    return elbo, var

In [21]:
elbo, var = vae_elbo(x_test, x_preds)

In [22]:
elbo, mean(elbo), var, mean(var)

(array([ -78.95494078, -118.90137086,  -44.69199043, ...,  -88.82829875,
        -115.88724153, -129.71096545]),
 -103.1800145100259,
 array([ 4.00670403,  9.42277608,  0.57595498, ..., 14.54460937,
         9.90565738,  4.18842899]),
 9.099651940002172)

In [26]:
# number of active units
A_u = np.var(z_mean, axis=0)
sum(log(A_u) >= -2)

12