# Variational Autoencoder

The variational autoencoder (VAE) aims to learn the distribution of a dataset $\mathbf{x}_1, \mathbf{x}_2, \dots, \mathbf{x}_n$ under the assumption that each $\mathbf{x}_i$ is generated conditioned on a latent vector $\mathbf{z}$ drawn from a standard Gaussian. Below, we show a basic implementation of a VAE that will serve as the basis for unsupervised generation later on.

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import random
from tqdm import tqdm
import torchvision
import torchvision.datasets as datasets

# Reproducibility: feed the same seed (69420) to every random number
# generator in use -- Python's `random`, NumPy, and PyTorch (CPU and CUDA).
seed = 69420
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed)

## Encoder

The encoder aims to model $q(\mathbf{z}|\mathbf{x};\phi)$. This network produces the mean and covariance from $\mathbf{x}$, from which $\mathbf{z}$ can be sampled. The implementation uses convolutional neural network components to map $\mathbf{x}$ to the means and covariances. A trick used here is to have the network predict the covariance in logarithmic space and exponentiate the result, which ensures positivity.

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder producing the parameters of q(z|x).

    The posterior is parameterised as a mixture of ``components_num``
    diagonal Gaussians over a ``latent_size``-dimensional latent. Three
    convolutional heads on top of a shared feature extractor output the
    per-component means, the per-component variances (predicted in log
    space and exponentiated) and the mixture weights (softmax over the
    component axis).

    The feature extractor halves the spatial resolution four times and each
    head applies a 2x2 convolution, so a 32x32 input ends at a 1x1 spatial
    map. For other input sizes the leftover spatial positions are folded
    into the component axis by the reshapes in ``forward``.

    Args:
        latent_size: dimensionality of the latent vector of each component.
        img_channel: number of channels of the input images.
        components_num: number of mixture components.

    Attributes:
        kl: KL term computed by the most recent ``forward`` call
            (``0`` before the first call).
    """

    def __init__(self, latent_size, img_channel, components_num):
        super().__init__()

        # Remembered so that forward() does not have to be told again.
        self.latent_size = latent_size

        self.encoder = nn.Sequential(
            nn.Conv2d(img_channel, 32, 4, 2, 1, bias=False),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(32, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(64, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(128, 256, 4, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
        )

        # Heads computing the mean, (log-)variance and weight of every component.
        self.mu_fc = nn.Conv2d(256, components_num * latent_size, 2, 1, 0, bias=False)
        # NOTE: open question from the original author -- include / not include
        # the number of components in the variance head?
        self.sigma_fc = nn.Conv2d(256, components_num * latent_size, 2, 1, 0, bias=False)
        self.weights_fc = nn.Conv2d(256, components_num, 2, 1, 0, bias=False)
        self.kl = 0

    def forward(self, x, latent_size=None):
        """Encode ``x`` and draw one latent sample per mixture component.

        Args:
            x: batch of images with ``img_channel`` channels.
            latent_size: latent dimensionality used to reshape the head
                outputs; defaults to the value given to the constructor.

        Returns:
            Tuple ``(z, mu, sigma, weights)``. ``z``, ``mu`` and ``sigma``
            have shape ``(batch, components, latent_size)``; ``weights`` has
            shape ``(batch, components, 1)`` and sums to one over the
            component axis. ``sigma`` holds variances, not standard
            deviations.

        Side effects:
            Stores the KL term of this batch in ``self.kl``.
        """
        if latent_size is None:
            latent_size = self.latent_size

        encoded = self.encoder(x)
        batch = encoded.size(0)

        mu = self.mu_fc(encoded).view(batch, -1, latent_size)  # mean
        # The head predicts log-variance; exp() guarantees positivity.
        sigma = torch.exp(self.sigma_fc(encoded).view(batch, -1, latent_size))  # variance
        # One scalar logit per component: the weights head has no latent axis,
        # so it must not be reshaped with latent_size.
        weights = torch.softmax(self.weights_fc(encoded).view(batch, -1, 1), dim=1)

        z = self.reparameterize(mu, sigma)
        self.kl = self.kl_loss(mu, sigma, weights)  # kl loss term

        return z, mu, sigma, weights

    # reparameterization trick
    def reparameterize(self, mu, sigma):
        """Sample ``mu + sqrt(sigma) * eps`` with ``eps ~ N(0, I)``.

        ``sigma`` is a variance; the small constant keeps the square root
        (and its gradient) finite when the variance underflows to zero.
        """
        sd = torch.sqrt(sigma + 1e-8)
        noise = torch.randn_like(sd)
        z = mu + sd * noise
        return z

    # KL divergence loss
    def kl_loss(self, mu, sigma, weights):
        """Weighted sum of per-component KL terms against N(0, I).

        For each component the closed form
        ``0.5 * sum_d(sigma + mu**2 - log(sigma) - 1)`` is used, where
        ``sigma`` is the variance (matching ``reparameterize``). The
        per-component terms are weighted by the mixture weights, summed over
        components and averaged over the batch.

        Args:
            mu: means, shape ``(batch, components, latent_size)``.
            sigma: variances, same shape as ``mu``.
            weights: mixture weights, shape ``(batch, components, 1)`` or
                ``(batch, components)``.

        Returns:
            Scalar tensor.
        """
        # Same floor as in reparameterize(); avoids log(0) = -inf.
        log_var = torch.log(sigma.clamp_min(1e-8))
        # Reduce over the latent axis only, keeping (batch, components).
        kl_component = 0.5 * torch.sum(sigma + mu**2 - log_var - 1, dim=-1)
        mix = weights.reshape(kl_component.shape)
        kl = torch.mean(torch.sum(mix * kl_component, dim=1))
        return kl

## Decoder

On the other hand, the decoder learns $p(\mathbf{x}|\mathbf{z};\theta)$ by reconstructing the given data $\mathbf{x}$ from the information carried by $\mathbf{z}$. Implementation-wise, this decoder uses transposed-convolution blocks to reverse the convolutions of the encoder.

In [12]:
class Decoder(nn.Module):
    """Transposed-convolution decoder mapping a latent tensor to an image.

    Four ConvTranspose2d -> BatchNorm2d -> LeakyReLU(0.2) stages reduce the
    channel count 256 -> 128 -> 64 -> 32; a final 1x1 transposed convolution
    followed by a sigmoid produces ``img_channel`` channels with values in
    [0, 1]. With a 1x1 spatial input the first stage yields a 4x4 map and the
    next three stages double it each time, giving a 32x32 output.

    Args:
        latent_size: number of channels of the latent input.
        img_channel: number of channels of the generated image.
    """

    def __init__(self, latent_size, img_channel):
        super().__init__()

        def upsample(in_ch, out_ch, stride, padding):
            # One 4x4 transposed-conv stage with batch norm and leaky ReLU.
            return nn.Sequential(
                nn.ConvTranspose2d(in_ch, out_ch, 4, stride, padding, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.2, inplace=True),
            )

        # Attribute names are kept as-is: they determine the state_dict keys.
        self.conv_transpose_block_1 = upsample(latent_size, 256, 1, 0)
        self.conv_transpose_block_2 = upsample(256, 128, 2, 1)
        self.conv_transpose_block_3 = upsample(128, 64, 2, 1)
        self.conv_transpose_block_4 = upsample(64, 32, 2, 1)

        # Output head: 1x1 projection to image channels, squashed to [0, 1].
        self.conv_transpose_block_5 = nn.Sequential(
            nn.ConvTranspose2d(32, img_channel, 1, 1, 0, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run ``x`` through the five stages in order and return the image."""
        stages = (
            self.conv_transpose_block_1,
            self.conv_transpose_block_2,
            self.conv_transpose_block_3,
            self.conv_transpose_block_4,
            self.conv_transpose_block_5,
        )
        out = x
        for stage in stages:
            out = stage(out)
        return out

## VAE training
We optimize the ELBO stochastically using the gradient descent, which boils down the minimizing the KL divergence and 