<a href="https://colab.research.google.com/github/johannnamr/Discrepancy-based-inference-using-QMC/blob/main/Inference/VAE/vae_sample_complexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample complexity for a VAE using MC and RQMC

This notebook calculates the sample complexity for a VAE using Monte Carlo (MC) and randomised quasi-Monte Carlo (RQMC) sampling.

## Mount drive

In [None]:
# Mount Google Drive at /content/drive; the helper notebook, the sample files
# and the result files used below all live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Execute the helper notebook so that the names it defines become available here.
# NOTE(review): presumably this is where sliced_wasserstein_distance comes from
# (it is called further down but neither defined nor imported in this notebook) - confirm.
%run "/content/drive/My Drive/Colab Notebooks/ot_slicedW.ipynb"

Set path for saving the results (adjust if necessary):

In [None]:
# Root folder for the divergence results
path = '/content/drive/My Drive/Colab Notebooks/Paper/Inference/'
# Sub-folder holding the generated VAE samples (one .npz file per repetition)
path_samples = path + 'VAE_samples/'

## Imports

In [None]:
# TensorFlow is pinned to 1.15.0: the code below relies on TF1-only APIs
# (tf.contrib, tf.placeholder, tf.Session).
! pip install tensorflow==1.15.0
# scipy.stats.qmc is imported below.
# NOTE(review): presumably the preinstalled SciPy is too old to provide it - confirm,
# and consider pinning an explicit version for reproducibility.
! pip install --upgrade scipy # update scipy to latest version

Collecting tensorflow==1.15.0
  Downloading tensorflow-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl (412.3 MB)
[K     |████████████████████████████████| 412.3 MB 23 kB/s 
[?25hCollecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Collecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 70.0 MB/s 
Collecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 8.6 MB/s 
[?25hCollecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 48.4 MB/s 
Building wheels for collected packages: gast
  Building wheel for gast (setup.py) ... [?25l[?25hdone
  Created wheel for gast: filename=gast-0.2.2-py3-none-any.whl size=7554 sha256=3b2bd0540eecbad544251e5c2b9d6635ffb6b9470a7181cf6382058ecd7de8d4
  Stored in dir

In [None]:
import numpy as np
import tensorflow as tf
from scipy.stats import qmc # QMC points
import scipy.spatial.distance as distance # distance used for kernel
import matplotlib.pyplot as plt

## Parameters

Set parameters:

In [None]:
# Sample sizes 2^4, 2^5, ..., 2^14
n = [1 << j for j in range(4, 15)]
# Number of independent repetitions for both MC and RQMC
num = 25
# Divergence to evaluate:
#   'sink' = Sinkhorn, 'w' = Wasserstein, 'sw' = sliced Wasserstein, 'mmd' = squared MMD
divergence = 'mmd'
# True: simulate samples, False: load saved samples
runsim = False
# True: calculate divergences, False: load saved divergences
rundiv = True

In [None]:
if rundiv and divergence == 'w' or rundiv and divergence == 'sw' or rundiv and divergence == 'sink':
  #! pip install --upgrade pip # update pip to latest version
  ! pip install --upgrade numpy # update numpy to latest version
  ! pip install POT --quiet
  import ot # Wasserstein distance and Sinkhorn divergence

## Define useful functions

In [None]:
# Xavier initialisation

def init_xavier(n_in,n_out):
    '''Return a tf.Variable of shape [n_in, n_out] whose initial value is
    produced by tf.contrib.layers.xavier_initializer().

    n_in:  first dimension of the weight matrix (number of inputs).
    n_out: second dimension of the weight matrix (number of outputs).
    '''
    xavier = tf.contrib.layers.xavier_initializer()
    return tf.Variable(xavier(shape=[n_in,n_out]))

In [None]:
# Sinkhorn divergence

def cost_mat(X,Y,N,M):
    '''Pairwise squared Euclidean cost matrix between the rows of X and Y.

    Entry (i, j) of the returned [N, M] tensor is
    ||X_i||^2 + ||Y_j||^2 - 2 <X_i, Y_j>.

    X: tensor with N rows, Y: tensor with M rows (same number of columns).
    N, M: the row counts of X and Y, used to tile the squared norms.
    '''
    sq_norm_x = tf.reduce_sum(tf.multiply(X,X),axis=1)
    sq_norm_y = tf.reduce_sum(tf.multiply(Y,Y),axis=1)
    # broadcast ||X_i||^2 along the columns and ||Y_j||^2 along the rows
    x_term = tf.transpose(tf.reshape(tf.tile(sq_norm_x,[M]),[M,N]))
    y_term = tf.reshape(tf.tile(sq_norm_y,[N]),[N,M])
    # (Y X^T)^T = X Y^T, the matrix of inner products
    cross_term = tf.transpose(tf.matmul(Y,tf.transpose(X)))
    return x_term + y_term - 2*cross_term

def K_tild(u,v,C,N,M,epsilon):
    '''Return exp(-(C_ij - u_i - v_j) / epsilon) as an [N, M] tensor.

    u: [N, 1] tensor, v: [M, 1] tensor (only column 0 of each is used).
    C: [N, M] cost matrix, epsilon: regularisation strength.
    '''
    u_grid = tf.transpose(tf.reshape(tf.tile(u[:,0],[M]),[M,N]))  # u_i repeated over columns
    v_grid = tf.reshape(tf.tile(v[:,0],[N]),[N,M])                # v_j repeated over rows
    shifted_cost = C - u_grid - v_grid
    return tf.exp(-shifted_cost/epsilon)

def sinkhorn_step_log(j,u,v,C, N,M,epsilon,Lambda = 1):
    '''One Sinkhorn iteration on the potentials u and v (log-domain update).

    Written as a tf.while_loop body: it takes and returns the loop variables
    (j, u, v, C, N, M, epsilon). Lambda is not a loop variable, so the default
    is used whenever the function is driven by tf.while_loop.
    '''
    weight_x = tf.cast(1/N, tf.float32)   # uniform weight of each row sample
    weight_y = tf.cast(1/M, tf.float32)   # uniform weight of each column sample
    # update u from the row sums of the current kernel (10**(-6) guards log(0))
    row_mass = tf.reshape( tf.reduce_sum(K_tild(u,v,C,N,M,epsilon),axis = 1) ,[N,1] )
    u = Lambda * ( epsilon*(tf.log(weight_x) - tf.log(row_mass +10**(-6))) + u )
    # update v from the column sums, using the already updated u
    col_mass = tf.reshape( tf.reduce_sum(K_tild(u,v,C,N,M,epsilon),axis = 0), [M,1] )
    v = Lambda * ( epsilon*(tf.log(weight_y) - tf.log(col_mass +10**(-6))) + v )
    return j + 1,u,v,C,N,M,epsilon

def sinkhorn_loss(X,Y):
    '''Entropy-regularised transport cost between the rows of X and Y.

    Runs 50 iterations of sinkhorn_step_log (epsilon = 1, the step's default
    Lambda = 1) starting from zero potentials, builds the resulting plan
    gamma = K_tild(u, v, C, ...) and returns the scalar tensor sum(gamma * C),
    where C is the squared Euclidean cost matrix from cost_mat.

    X: tensor with one sample per row.
    Y: tensor with one sample per row (same number of columns as X).
    '''
    epsilon = tf.constant(1.)   # smoothing sinkhorn
    n_iter = tf.constant(50)    # number of iterations for sinkhorn
    N = tf.shape(X)[0]          # sample size of X
    M = tf.shape(Y)[0]          # sample size of Y
    C = cost_mat(X,Y,N,M)
    # sinkhorn iterations
    j0 = tf.constant(0)
    u0 = tf.zeros([N,1])
    v0 = tf.zeros([M,1])
    cond_iter = lambda j, u, v, C, N, M, epsilon: j < n_iter
    j,u,v,C,N,M,epsilon = tf.while_loop(
        cond_iter, sinkhorn_step_log, loop_vars=[j0, u0, v0, C, N, M, epsilon])
    gamma = K_tild(u,v,C,N,M,epsilon)
    return tf.reduce_sum(gamma*C)

## Define divergences for sample complexity experiments

In [None]:
# Wasserstein and sliced Wasserstein distance

def wasserstein_loss(X,Y):
    '''Exact optimal-transport cost between two empirical samples (uses POT).

    The cost matrix is the pairwise Euclidean distance between the rows of X
    and Y, rescaled by its maximum entry, so the returned value is relative to
    the largest pairwise distance. Both samples carry uniform weights.

    X: array with one sample per row.
    Y: array with one sample per row (same number of columns as X).
    Returns the value of ot.emd2 for these weights and costs.
    '''
    n_x = np.shape(X)[0]
    n_y = np.shape(Y)[0]

    # equal weights; each weight vector is sized from its own sample
    a = np.ones((n_x,)) / n_x
    b = np.ones((n_y,)) / n_y

    # The computation is deterministic, so it is done once (it does not
    # depend on the number of repetitions of the experiment).
    M = ot.dist(X, Y, 'euclidean')
    M /= M.max()

    return ot.emd2(a, b, M)

def sliced_wasserstein_loss(X,Y):
    '''Sliced Wasserstein distance between the rows of X and Y.

    Delegates to sliced_wasserstein_distance with the Euclidean metric,
    100 projections and uniform weights.
    NOTE(review): both weight vectors are sized from X, so X and Y are
    presumably expected to have the same number of rows - confirm.
    '''
    size = np.shape(X)[0]

    # equal weights (two separate arrays, one per sample)
    weights_x = np.ones((size,)) / size
    weights_y = np.ones((size,)) / size

    return sliced_wasserstein_distance(X,Y, metric='euclidean', a=weights_x, b=weights_y, n_projections=100)

In [None]:
# Sinkhorn divergence using the OT library

def sink_loss(X,Y):
  '''Sinkhorn divergence between two empirical samples (uses POT).

  Calls ot.bregman.empirical_sinkhorn_divergence with regularisation reg=1,
  the squared Euclidean metric and uniform weights on both samples.

  X: array with one sample per row.
  Y: array with one sample per row (same number of columns as X).
  '''
  n_x = np.shape(X)[0]
  n_y = np.shape(Y)[0]

  # equal weights; each weight vector is sized from its own sample
  a = np.ones((n_x,)) / n_x
  b = np.ones((n_y,)) / n_y

  return ot.bregman.empirical_sinkhorn_divergence(X, Y, reg=1, a=a, b=b, metric='sqeuclidean',method='sinkhorn')

In [None]:
# Gaussian kernel
def k(X,Y,l):
    '''Gaussian kernel matrix exp(-||x - y||^2 / (2 l^2)) for all row pairs.

    X: 2-D array, one point per row.
    Y: 2-D array, one point per row (same number of columns as X).
    l: lengthscale of the kernel.
    Returns an array with one row per point of X and one column per point of Y.
    '''
    sq_dists = distance.cdist(X,Y,'sqeuclidean')
    inv_scale = 1/(2*l**2)
    return np.exp(-inv_scale*sq_dists)

# MMD^2
def mmd_loss(n,m,kxx,kxy,kyy):
    '''Squared MMD estimate from precomputed kernel matrices.

    Computes sum(kxx)/n^2 - 2*sum(kxy)/(m*n) + sum(kyy)/m^2; every entry of
    each matrix is included in its sum.

    n, m: normalising sample sizes for the x- and y-sample.
    kxx, kxy, kyy: kernel matrices within x, between x and y, and within y.
    '''
    within_x = np.sum(kxx)
    cross = np.sum(kxy)
    within_y = np.sum(kyy)
    return (1/n**2)*within_x-(2/(m*n))*cross+(1/m**2)*within_y

## Define variational autoencoder class 

In [None]:
# Variational Autoencoder class


class VariationalAutoencoder(object):
    """Generator network trained with a Sinkhorn loss (TensorFlow 1.x graph mode).

    Only a generator (decoder) is built: uniform latent draws are pushed through
    two hidden layers and a sigmoid output layer. The constructor builds the
    graph, the loss and the optimiser, and opens a tf.Session.

    Args:
        network_architecture: dict with the keys ``n_hidden_gener_1``,
            ``n_hidden_gener_2``, ``n_input`` and ``n_z``.
        transfer_fct: activation used in the two hidden layers.
        learning_rate: learning rate passed to the Adam optimiser.
        batch_size: number of rows of the data placeholder and of the latent
            draw used during training.
    """

    def __init__(self, network_architecture, transfer_fct=tf.nn.softplus, 
                 learning_rate=0.001, batch_size=100):
        self.network_architecture = network_architecture
        self.transfer_fct = transfer_fct
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        
        # tf Graph input: one mini-batch of observed data
        self.x = tf.placeholder(tf.float32, [batch_size, network_architecture["n_input"]])
      
        # Create the generator network
        self._create_network()
        # Define the Sinkhorn loss and the corresponding optimizer
        self._create_loss_optimizer()
        
        # Initializing the tensor flow variables
        init = tf.global_variables_initializer()

        # Launch the session
        self.sess = tf.Session()
        self.sess.run(init)
    
    def _create_network(self):
        """Create the weights, the latent draw and the training-time generator output."""
        # Initialize generator network weights and biases
        self.network_weights = self._initialize_weights(**self.network_architecture)

        # Draw batch_size latent points z from a uniform distribution
        n_z = self.network_architecture["n_z"]
        self.z = tf.random_uniform((self.batch_size, n_z), dtype=tf.float32)
        
        # Push the latent draw through the generator
        self.x_reconstr = self._generator_network(self.network_weights["weights_gener"],
                                                  self.network_weights["biases_gener"])
    
    def _initialize_weights(self, n_hidden_gener_1,  n_hidden_gener_2, 
                            n_input, n_z):
        """Return a dict with Xavier-initialised weights and zero biases of the generator."""
        all_weights = dict()
        all_weights['weights_gener'] = {
            'h1': init_xavier(n_z, n_hidden_gener_1),
            'h2': init_xavier(n_hidden_gener_1, n_hidden_gener_2),
            'out_var': init_xavier(n_hidden_gener_2, n_input)}
        all_weights['biases_gener'] = {
            'b1': tf.Variable(tf.zeros([n_hidden_gener_1], dtype=tf.float32)),
            'b2': tf.Variable(tf.zeros([n_hidden_gener_2], dtype=tf.float32)),
            'out_var': tf.Variable(tf.zeros([n_input], dtype=tf.float32))}
        return all_weights        

    def _decode(self, z, weights, biases):
        """Map latent points z to data space.

        Shared forward pass of the generator: two hidden layers with
        ``self.transfer_fct`` followed by a sigmoid output layer.
        """
        layer_1 = self.transfer_fct(tf.add(tf.matmul(z, weights['h1']), 
                                           biases['b1'])) 
        layer_2 = self.transfer_fct(tf.add(tf.matmul(layer_1, weights['h2']), 
                                           biases['b2'])) 
        return tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['out_var']), 
                                    biases['out_var']))

    def _generator_network(self, weights, biases):
        """Generator output for the internal latent draw ``self.z``."""
        return self._decode(self.z, weights, biases)
            
    def _create_loss_optimizer(self):
        """Define the Sinkhorn loss between data and generated batch, and its Adam optimiser."""
        # Sinkhorn loss
        self.cost = sinkhorn_loss(self.x, self.x_reconstr)
        # Use ADAM optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
        
    def partial_fit(self, X):
        """Train model based on mini-batch of input data.
        
        Return cost of mini-batch.
        """
        _, cost = self.sess.run((self.optimizer, self.cost), 
                                feed_dict={self.x: X})
        return cost    
   
    def generate(self, n, z_sample):
        """Generate data from given points in latent space.

        Args:
            n: number of latent points.
            z_sample: latent points; reshaped to ``[n, n_z]`` before being fed.

        Returns:
            The generator output for the n latent points (one row per point,
            ``n_input`` columns).

        Note: every call adds a new placeholder and new forward-pass ops to
        the graph.
        """
        # Read the latent dimension from the instance rather than from a
        # notebook-level global, so the method works for any instance.
        n_z = self.network_architecture["n_z"]
        zz = tf.placeholder(tf.float32, [n, n_z])

        x_reconstr = self._decode(zz,
                                  self.network_weights["weights_gener"],
                                  self.network_weights["biases_gener"])
        
        return self.sess.run(x_reconstr, feed_dict={zz: np.reshape(z_sample, [n, n_z])})


## Define training function

## Initialise the model

In [None]:
# initialise the model
if runsim:
  # layer sizes of the generator
  network_architecture = {
      "n_hidden_gener_1": 500,  # 1st layer decoder neurons
      "n_hidden_gener_2": 500,  # 2nd layer decoder neurons
      "n_input": 784,           # MNIST data input (img shape: 28*28)
      "n_z": 2,                 # dimensionality of latent space
  }

  vae = VariationalAutoencoder(network_architecture, learning_rate=0.005, batch_size=300)

## Generate samples

Use `vae.generate()` with either MC or RQMC points in the latent space as input

Sampling using RQMC:

In [None]:
# rqmc
if runsim:
  n_max = np.max(n)
  # Keep all repetitions in memory with layout (num, n_max, 784) - the same
  # layout that is built when the samples are loaded from disk - because the
  # divergence cells index the arrays as x_rqmc[rep, :n[i], :].
  x_rqmc = np.zeros((num,n_max,784),dtype=np.float32)
  y_rqmc = np.zeros((num,n_max,784),dtype=np.float32)
  for rep in range(num):
    if rep % 5 == 0:
      print(rep)
    # x values: a fresh scrambled Sobol sequence in the 2-d latent space
    sampler_x = qmc.Sobol(d=2, scramble=True)
    zx_rqmc = sampler_x.random(n_max)
    x_rqmc[rep,:,:] = vae.generate(n_max,zx_rqmc)
    np.savez(path_samples+"vae_xsamples_rqmc_%s.npz"%(rep),x_rqmc=x_rqmc[rep])
    # y values: an independently scrambled Sobol sequence
    sampler_y = qmc.Sobol(d=2, scramble=True)
    zy_rqmc = sampler_y.random(n_max)
    y_rqmc[rep,:,:] = vae.generate(n_max,zy_rqmc)
    np.savez(path_samples+"vae_ysamples_rqmc_%s.npz" %(rep),y_rqmc=y_rqmc[rep])

In [None]:
if not runsim:
  # Load the saved RQMC samples of all repetitions into (num, max(n), 784) arrays
  x_rqmc = np.zeros((num,np.max(n),784),dtype=np.float32)
  y_rqmc = np.zeros((num,np.max(n),784),dtype=np.float32)
  for rep in range(num):
    # np.load keeps an .npz archive open until it is closed, so use it as a
    # context manager to release the file handle after copying the data
    with np.load(path_samples+"vae_xsamples_rqmc_%s.npz"%(rep)) as xsamples_rqmc:
      x_rqmc[rep,:,:] = xsamples_rqmc['x_rqmc']
    with np.load(path_samples+"vae_ysamples_rqmc_%s.npz"%(rep)) as ysamples_rqmc:
      y_rqmc[rep,:,:] = ysamples_rqmc['y_rqmc']

Sampling using MC:

In [None]:
# mc
if runsim:
  n_max = np.max(n)
  # Keep all repetitions in memory with layout (num, n_max, 784) - the same
  # layout that is built when the samples are loaded from disk - because the
  # divergence cells index the arrays as x_mc[rep, :n[i], :].
  x_mc = np.zeros((num,n_max,784),dtype=np.float32)
  y_mc = np.zeros((num,n_max,784),dtype=np.float32)
  for rep in range(num):
    if rep % 5 == 0:
      print(rep)
    # x values: i.i.d. uniform points in the 2-d latent space
    zx_mc = np.random.rand(n_max,2)
    x_mc[rep,:,:] = vae.generate(n_max,zx_mc)
    np.savez(path_samples+"vae_xsamples_mc_%s.npz" %(rep),x_mc=x_mc[rep])
    # y values: a second, independent draw
    zy_mc = np.random.rand(n_max,2)
    y_mc[rep,:,:] = vae.generate(n_max,zy_mc)
    np.savez(path_samples+"vae_ysamples_mc_%s.npz"%(rep),y_mc=y_mc[rep])

In [None]:
if not runsim:
  # Load the saved MC samples of all repetitions into (num, max(n), 784) arrays
  x_mc = np.zeros((num,np.max(n),784),dtype=np.float32)
  y_mc = np.zeros((num,np.max(n),784),dtype=np.float32)
  for rep in range(num):
    # np.load keeps an .npz archive open until it is closed, so use it as a
    # context manager to release the file handle after copying the data
    with np.load(path_samples+"vae_xsamples_mc_%s.npz"%(rep)) as xsamples_mc:
      x_mc[rep,:,:] = xsamples_mc['x_mc']
    with np.load(path_samples+"vae_ysamples_mc_%s.npz"%(rep)) as ysamples_mc:
      y_mc[rep,:,:] = ysamples_mc['y_mc']

## Compute divergence

In [None]:
# RQMC

if rundiv:
  # one row per sample size, one column per repetition
  loss_rqmc = np.zeros((len(n),num))
  for rep in range(num):
    if rep % 5 == 0:
      print(rep)
    for i, n_i in enumerate(n):
      # first n_i points of the x- and y-sample of this repetition
      x_sub = x_rqmc[rep,:n_i,:]
      y_sub = y_rqmc[rep,:n_i,:]
      # divergence
      if divergence == 'sink':
        loss_rqmc[i,rep] = sink_loss(x_sub, y_sub)
      elif divergence == 'w':
        loss_rqmc[i,rep] = wasserstein_loss(x_sub, y_sub)
      elif divergence == 'sw':
        loss_rqmc[i,rep] = sliced_wasserstein_loss(x_sub, y_sub)
      elif divergence == 'mmd':
        # Gaussian kernel with lengthscale 0.01
        loss_rqmc[i,rep] = mmd_loss(n_i,n_i,k(x_sub,x_sub,l=0.01),k(x_sub,y_sub,l=0.01),k(y_sub,y_sub,l=0.01))

  # report and store the absolute values (mean over repetitions per sample size)
  print("RQMC:")
  print(np.mean(np.abs(loss_rqmc),axis=1))
  np.savez(path+"vae_loss_%s_rqmc.npz" %(divergence),loss=np.abs(loss_rqmc))

0
5
10
15
20
RQMC:
[5.37293655e-02 1.48359990e-02 3.27543503e-03 6.64881582e-04
 1.11745062e-04 1.68251383e-05 2.81478487e-06 3.57861618e-07
 6.33573016e-08 7.21522113e-09 1.04696720e-09]


In [None]:
# MC

if rundiv:
  # one row per sample size, one column per repetition
  loss_mc = np.zeros((len(n),num))
  for rep in range(num):
    if rep % 5 == 0:
      print(rep)
    for i, n_i in enumerate(n):
      # first n_i points of the x- and y-sample of this repetition
      x_sub = x_mc[rep,:n_i,:]
      y_sub = y_mc[rep,:n_i,:]
      # divergence
      if divergence == 'sink':
        loss_mc[i,rep] = sink_loss(x_sub, y_sub)
      elif divergence == 'w':
        loss_mc[i,rep] = wasserstein_loss(x_sub, y_sub)
      elif divergence == 'sw':
        loss_mc[i,rep] = sliced_wasserstein_loss(x_sub, y_sub)
      elif divergence == 'mmd':
        # Gaussian kernel with lengthscale 0.01
        loss_mc[i,rep] = mmd_loss(n_i,n_i,k(x_sub,x_sub,l=0.01),k(x_sub,y_sub,l=0.01),k(y_sub,y_sub,l=0.01))

  # report and store the absolute values (mean over repetitions per sample size)
  print("MC:")
  print(np.mean(np.abs(loss_mc),axis=1))
  np.savez(path+"vae_loss_%s_mc.npz" %(divergence),loss=np.abs(loss_mc))

0
5
10
15
20
MC:
[1.14176596e-01 5.58370644e-02 2.82380863e-02 1.43520218e-02
 7.15324455e-03 3.60997196e-03 1.69189756e-03 9.14639159e-04
 4.61103698e-04 2.10666427e-04 1.12191447e-04]
