Commit c78cb3f

Jaan Altosaar committed: add inverse autoregressive flow classes
1 parent 68c8535 commit c78cb3f

File tree: flow.py, train_variational_autoencoder_pytorch.py

2 files changed: 205 additions, 13 deletions

flow.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
"""Credit: mostly based on Ilya's excellent implementation here: https://github.com/ikostrikov/pytorch-flows"""
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F


class InverseAutoregressiveFlow(nn.Module):
    """Inverse Autoregressive Flows with LSTM-type update. One block.

    Eq 11-14 of https://arxiv.org/abs/1606.04934
    """
    def __init__(self, num_input, num_hidden, num_context):
        super().__init__()
        self.made = MADE(num_input=num_input, num_output=num_input * 2,
                         num_hidden=num_hidden, num_context=num_context)
        # init such that sigmoid(s) is close to 1 for stability
        self.sigmoid_arg_bias = nn.Parameter(torch.ones(num_input) * 2)
        self.sigmoid = nn.Sigmoid()
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, input, context=None):
        m, s = torch.chunk(self.made(input, context), chunks=2, dim=-1)
        s = s + self.sigmoid_arg_bias
        sigmoid = self.sigmoid(s)
        z = sigmoid * input + (1 - sigmoid) * m
        return z, -self.log_sigmoid(s)


class FlowSequential(nn.Sequential):
    """Chain of flow blocks; the forward pass also accumulates each block's log-det term."""

    def forward(self, input, context=None):
        total_log_prob = torch.zeros_like(input, device=input.device)
        for block in self._modules.values():
            input, log_prob = block(input, context)
            total_log_prob += log_prob
        return input, total_log_prob


class MaskedLinear(nn.Module):
    """Linear layer with some input-output connections masked."""
    def __init__(self, in_features, out_features, mask, context_features=None, bias=True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)
        self.register_buffer("mask", mask)
        if context_features is not None:
            self.cond_linear = nn.Linear(context_features, out_features, bias=False)

    def forward(self, input, context=None):
        output = F.linear(input, self.mask * self.linear.weight, self.linear.bias)
        if context is None:
            return output
        else:
            return output + self.cond_linear(context)


class MADE(nn.Module):
    """Implements MADE: Masked Autoencoder for Distribution Estimation.

    Follows https://arxiv.org/abs/1502.03509

    This is used to build MAF: Masked Autoregressive Flow (https://arxiv.org/abs/1705.07057).
    """
    def __init__(self, num_input, num_output, num_hidden, num_context):
        super().__init__()
        # m corresponds to m(k), the maximum degree of a node in the MADE paper
        self._m = []
        self._masks = []
        self._build_masks(num_input, num_output, num_hidden, num_layers=3)
        self._check_masks()
        modules = []
        self.input_context_net = MaskedLinear(num_input, num_hidden, self._masks[0], num_context)
        modules.append(nn.ReLU())
        modules.append(MaskedLinear(num_hidden, num_hidden, self._masks[1], context_features=None))
        modules.append(nn.ReLU())
        modules.append(MaskedLinear(num_hidden, num_output, self._masks[2], context_features=None))
        self.net = nn.Sequential(*modules)

    def _build_masks(self, num_input, num_output, num_hidden, num_layers):
        """Build the masks according to Eq 12 and 13 in the MADE paper."""
        rng = np.random.RandomState(0)
        # assign input units a number between 1 and D
        self._m.append(np.arange(1, num_input + 1))
        for i in range(1, num_layers + 1):
            # randomly assign maximum number of input nodes to connect to
            if i == num_layers:
                # assign output layer units a number between 1 and D
                m = np.arange(1, num_input + 1)
                assert num_output % num_input == 0, "num_output must be multiple of num_input"
                self._m.append(np.hstack([m for _ in range(num_output // num_input)]))
            else:
                # assign hidden layer units a number between 1 and D-1
                self._m.append(rng.randint(1, num_input, size=num_hidden))
                # self._m.append(np.arange(1, num_hidden + 1) % (num_input - 1) + 1)
            if i == num_layers:
                mask = self._m[i][None, :] > self._m[i - 1][:, None]
            else:
                # input to hidden & hidden to hidden
                mask = self._m[i][None, :] >= self._m[i - 1][:, None]
            # need to transpose for torch linear layer, shape (num_output, num_input)
            self._masks.append(torch.from_numpy(mask.astype(np.float32).T))

    def _check_masks(self):
        """Check that the connectivity matrix between layers is lower triangular."""
        # (num_input, num_hidden)
        prev = self._masks[0].t()
        for i in range(1, len(self._masks)):
            # num_hidden is second axis
            prev = prev @ self._masks[i].t()
        final = prev.numpy()
        num_input = self._masks[0].shape[1]
        num_output = self._masks[-1].shape[0]
        assert final.shape == (num_input, num_output)
        if num_output == num_input:
            assert np.triu(final).all() == 0
        else:
            for submat in np.split(final,
                                   indices_or_sections=num_output // num_input,
                                   axis=1):
                assert np.triu(submat).all() == 0

    def forward(self, input, context=None):
        # first hidden layer receives input and context
        hidden = self.input_context_net(input, context)
        # rest of the network is conditioned on both input and context
        return self.net(hidden)


class Reverse(nn.Module):
    """An implementation of a reversing layer from
    Density estimation using Real NVP
    (https://arxiv.org/abs/1605.08803).

    From https://github.com/ikostrikov/pytorch-flows/blob/master/main.py
    """

    def __init__(self, num_input):
        super(Reverse, self).__init__()
        self.perm = np.array(np.arange(0, num_input)[::-1])
        self.inv_perm = np.argsort(self.perm)

    def forward(self, inputs, context=None, mode='forward'):
        if mode == "forward":
            return inputs[:, :, self.perm], torch.zeros_like(inputs, device=inputs.device)
        elif mode == "inverse":
            return inputs[:, :, self.inv_perm], torch.zeros_like(inputs, device=inputs.device)
        else:
            raise ValueError("Mode must be one of {forward, inverse}.")
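
Taken together, the classes above form the inverse autoregressive flow used by the training script. Below is a minimal usage sketch (not part of the committed files), assuming flow.py is on the import path and using an arbitrary latent size of 4:

# Usage sketch (hypothetical; not in the commit): compose IAF and Reverse blocks.
import torch
import flow

latent_size, batch_size, n_samples = 4, 8, 1
blocks = []
for _ in range(2):
    blocks.append(flow.InverseAutoregressiveFlow(num_input=latent_size,
                                                 num_hidden=latent_size * 2,
                                                 num_context=latent_size))
    blocks.append(flow.Reverse(latent_size))
q_z_flow = flow.FlowSequential(*blocks)

z_0 = torch.randn(batch_size, n_samples, latent_size)      # sample from the base distribution
context = torch.randn(batch_size, n_samples, latent_size)  # per-datapoint context h
z_T, neg_log_det = q_z_flow(z_0, context=context)
# Each IAF block computes z = sigmoid(s) * input + (1 - sigmoid(s)) * m and returns
# -log sigmoid(s); Reverse blocks return zeros. Summing neg_log_det over the last
# dimension gives -log|det dz_T/dz_0|, the change-of-variables correction for log q(z_T).
print(z_T.shape, neg_log_det.shape)  # both torch.Size([8, 1, 4])

The three-dimensional (batch, samples, latent) layout matches how the training script's VariationalFlow calls the flow, and is what Reverse's inputs[:, :, perm] indexing expects.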

train_variational_autoencoder_pytorch.py

Lines changed: 53 additions & 13 deletions
@@ -15,9 +15,12 @@
 import pathlib
 import h5py
 import random
+import flow

 config = """
 latent_size: 128
+variational: flow
+flow_depth: 2
 data_size: 784
 learning_rate: 0.001
 batch_size: 128
@@ -30,30 +33,29 @@
 seed: 582838
 """

-
 class Model(nn.Module):
     """Bernoulli model parameterized by a generative network with Gaussian latents for MNIST."""
-    def __init__(self, latent_size, data_size, batch_size, device):
+    def __init__(self, latent_size, data_size):
         super().__init__()
-        self.p_z = torch.distributions.Normal(
-            torch.zeros(latent_size, device=device),
-            torch.ones(latent_size, device=device))
+        self.register_buffer('p_z_loc', torch.zeros(latent_size))
+        self.register_buffer('p_z_scale', torch.ones(latent_size))
+        self.log_p_z = NormalLogProb()
         self.log_p_x = BernoulliLogProb()
         self.generative_network = NeuralNetwork(input_size=latent_size,
                                                 output_size=data_size,
                                                 hidden_size=latent_size * 2)

     def forward(self, z, x):
         """Return log probability of model."""
-        log_p_z = self.p_z.log_prob(z).sum(-1)
+        log_p_z = self.log_p_z(self.p_z_loc, self.p_z_scale, z).sum(-1, keepdim=True)
         logits = self.generative_network(z)
         # unsqueeze sample dimension
         logits, x = torch.broadcast_tensors(logits, x.unsqueeze(1))
-        log_p_x = self.log_p_x(logits, x).sum(-1)
+        log_p_x = self.log_p_x(logits, x).sum(-1, keepdim=True)
         return log_p_z + log_p_x


-class Variational(nn.Module):
+class VariationalMeanField(nn.Module):
     """Approximate posterior parameterized by an inference network."""
     def __init__(self, latent_size, data_size):
         super().__init__()
@@ -73,6 +75,38 @@ def forward(self, x, n_samples=1):
         return z, log_q_z


+class VariationalFlow(nn.Module):
+    """Approximate posterior parameterized by a flow (https://arxiv.org/abs/1606.04934)."""
+    def __init__(self, latent_size, data_size, flow_depth):
+        super().__init__()
+        hidden_size = latent_size * 2
+        self.inference_network = NeuralNetwork(input_size=data_size,
+                                               # loc, scale, and context
+                                               output_size=latent_size * 3,
+                                               hidden_size=hidden_size)
+        modules = []
+        for _ in range(flow_depth):
+            modules.append(flow.InverseAutoregressiveFlow(num_input=latent_size,
+                                                          num_hidden=hidden_size,
+                                                          num_context=latent_size))
+            modules.append(flow.Reverse(latent_size))
+        self.q_z_flow = flow.FlowSequential(*modules)
+        self.log_q_z_0 = NormalLogProb()
+        self.softplus = nn.Softplus()
+
+    def forward(self, x, n_samples=1):
+        """Return sample of latent variable and log prob."""
+        loc, scale_arg, h = torch.chunk(self.inference_network(x).unsqueeze(1), chunks=3, dim=-1)
+        scale = self.softplus(scale_arg)
+        eps = torch.randn((loc.shape[0], n_samples, loc.shape[-1]), device=loc.device)
+        z_0 = loc + scale * eps  # reparameterization
+        log_q_z_0 = self.log_q_z_0(loc, scale, z_0)
+        z_T, log_q_z_flow = self.q_z_flow(z_0, context=h)
+        log_q_z = (log_q_z_0 + log_q_z_flow).sum(-1, keepdim=True)
+        return z_T, log_q_z
+
+
 class NeuralNetwork(nn.Module):
     def __init__(self, input_size, output_size, hidden_size):
         super().__init__()
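
Shape-wise, the new posterior can be exercised on a dummy batch. The hedged sketch below assumes it is run inside this script (where NeuralNetwork and NormalLogProb are defined and torch is imported), with the config values latent_size=128, data_size=784, flow_depth=2, and assuming NeuralNetwork maps a (batch, 784) input to (batch, 3 * 128):

# Hypothetical smoke test, not part of the commit.
variational = VariationalFlow(latent_size=128, data_size=784, flow_depth=2)
x = torch.rand(32, 784).round()          # stand-in for a binarized MNIST batch
z, log_q_z = variational(x, n_samples=1)
# z: (32, 1, 128), samples z_T from the flow posterior.
# log_q_z: (32, 1, 1), the base Normal log-density of z_0 plus the accumulated
# -log sigmoid(s) terms, i.e. log q(z_T | x) after the change of variables.
print(z.shape, log_q_z.shape)
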
@@ -155,11 +189,17 @@ def evaluate(n_samples, model, variational, eval_data):
     random.seed(cfg.seed)

     model = Model(latent_size=cfg.latent_size,
-                  data_size=cfg.data_size,
-                  batch_size=cfg.batch_size,
-                  device=device)
-    variational = Variational(latent_size=cfg.latent_size,
-                              data_size=cfg.data_size)
+                  data_size=cfg.data_size)
+    if cfg.variational == 'flow':
+        variational = VariationalFlow(latent_size=cfg.latent_size,
+                                      data_size=cfg.data_size,
+                                      flow_depth=cfg.flow_depth)
+    elif cfg.variational == 'mean-field':
+        variational = VariationalMeanField(latent_size=cfg.latent_size,
+                                           data_size=cfg.data_size)
+    else:
+        raise ValueError('Variational distribution not implemented: %s' % cfg.variational)
+
     model.to(device)
     variational.to(device)
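
The training and evaluation loops themselves are untouched by this commit. For orientation only, here is a hedged sketch of how the two return values fit together in the ELBO; the actual loop lives outside the hunks shown, and the variable names here are placeholders:

# Sketch only, assuming a batch x of binarized MNIST images inside the training loop.
z, log_q_z = variational(x, n_samples=1)   # z: (batch, 1, latent), log_q_z: (batch, 1, 1)
log_p_x_and_z = model(z, x)                # log p(x, z), shape (batch, 1, 1)
elbo = (log_p_x_and_z - log_q_z).mean()    # maximize the ELBO, i.e. minimize -elbo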
