# Deep Markov Model

## a deep probabilistic model for sequential data

* build a latent variable model in which the variability and temporal structure of the observations are controlled by the dynamics of the latent variables
* Markov model, in which we have a chain of latent variables, with each latent variable in the chain conditioned on the previous latent variable
* we allow the transition probabilities governing the dynamics of the latent variables, as well as the emission probabilities that govern how the observations are generated by the latent dynamics, to be parameterized by (non-linear) neural networks

![](./images/dmm/01.png)


* neural network based model
  * The solid black squares represent non-linear functions parameterized by neural networks
  * black squares appear in two different places: in between pairs of latents and in between latents and observations
  * we can freely choose the dimension of the latent space to suit the problem at hand

* For the music example
  * for state transition, we choose (conditional) gaussian distributions with diagonal covariances
  * for observation likelihoods, we choose the bernoulli distribution

## Gaussian State Space Models

![](./images/dmm/02.png)

## Deep Markov Models

![](./images/dmm/03.png)
![](./images/dmm/04.png)
![](./images/dmm/05.png)
![](./images/dmm/06.png)


## The Gated Transition and the Emitter

In [3]:
import numpy as np
import torch
import torch.nn as nn

from torch.autograd import Variable

import pyro
import pyro.distributions as dist
from pyro.distributions import Normal
from pyro.infer import SVI
from pyro.optim import Adam
from pyro.optim import ClippedAdam

In [4]:
class Emitter(nn.Module):
    """
    Neural network behind the bernoulli observation likelihood p(x_t | z_t):
    a three-layer perceptron that turns a latent vector into a vector of
    probabilities, one per observed dimension.
    """
    def __init__(self, input_dim, z_dim, emission_dim):
        super(Emitter, self).__init__()

        # affine layers: z_dim -> emission_dim -> emission_dim -> input_dim
        self.lin_z_to_hidden = nn.Linear(z_dim, emission_dim)
        self.lin_hidden_to_hidden = nn.Linear(emission_dim, emission_dim)
        self.lin_hidden_to_input = nn.Linear(emission_dim, input_dim)

        # activations: ReLU after each hidden layer, sigmoid on the output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, z_t):
        """
        Map the latent z at time step t to the vector of probabilities that
        parameterizes the bernoulli distribution p(x_t|z_t).
        """
        # first hidden layer
        hidden = self.lin_z_to_hidden(z_t)
        hidden = self.relu(hidden)
        # second hidden layer
        hidden = self.lin_hidden_to_hidden(hidden)
        hidden = self.relu(hidden)
        # the sigmoid squashes the output layer into (0, 1)
        return self.sigmoid(self.lin_hidden_to_input(hidden))

In [None]:
class GatedTransition(nn.Module):
    """
    Parameterizes the gaussian latent transition probability p(z_t | z_{t-1})
    See section 5 in the reference for comparison.

    The mean is a gated mix of a linear function of z_{t-1} and a non-linear
    'proposed mean'; the standard deviation is computed from the proposed mean.
    """
    def __init__(self, z_dim, transition_dim):
        super(GatedTransition, self).__init__()

        # initialize the six linear transformations used in the neural network
        self.lin_gate_z_to_hidden = nn.Linear(z_dim, transition_dim)
        self.lin_gate_hidden_to_z = nn.Linear(transition_dim, z_dim)
        self.lin_proposed_mean_z_to_hidden = nn.Linear(z_dim, transition_dim)
        self.lin_proposed_mean_hidden_to_z = nn.Linear(transition_dim, z_dim)
        self.lin_sig = nn.Linear(z_dim, z_dim)
        self.lin_z_to_mu = nn.Linear(z_dim, z_dim)

        # modify the default initialization of lin_z_to_mu
        # so that it starts out as the identity function
        self.lin_z_to_mu.weight.data = torch.eye(z_dim)
        self.lin_z_to_mu.bias.data = torch.zeros(z_dim)

        # initialize the three non-linearities used in the neural network
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.softplus = nn.Softplus()

    def forward(self, z_t_1):
        """
        Given the latent z_{t-1} corresponding to the time step t-1
        we return the mean and sigma vectors that parameterize the
        (diagonal) gaussian distribution p(z_t | z_{t-1})
        """
        # compute the gating function and one minus the gating function;
        # the sigmoid keeps every gate entry in (0, 1)
        gate_intermediate = self.relu(self.lin_gate_z_to_hidden(z_t_1))
        gate = self.sigmoid(self.lin_gate_hidden_to_z(gate_intermediate))
        # scalar broadcasting yields a tensor with the same type and shape as
        # `gate`, so no explicit tensor of ones has to be built
        one_minus_gate = 1.0 - gate

        # compute the 'proposed mean'
        proposed_mean_intermediate = self.relu(self.lin_proposed_mean_z_to_hidden(z_t_1))
        proposed_mean = self.lin_proposed_mean_hidden_to_z(proposed_mean_intermediate)

        # assemble the actual mean used to sample z_t, which mixes a linear transformation
        # of z_{t-1} with the proposed mean modulated by the gating function
        mu = one_minus_gate * self.lin_z_to_mu(z_t_1) + gate * proposed_mean

        # compute the sigma used to sample z_t, using the proposed mean from above as input
        # the softplus ensures that sigma is positive
        sigma = self.softplus(self.lin_sig(self.relu(proposed_mean)))

        # return mu, sigma which can be fed into Normal
        return mu, sigma


## Variational inference on DMM

### Non-sequential 에 대한 VI 

![](./images/dmm/07.png)

### Sequential 에 대한 VI

![](./images/dmm/08.png)

![](./images/dmm/09.png)

![](./images/dmm/10.png)

![](./images/dmm/11.png)

### Lower Bound

![](./images/dmm/12.png)

### Learn by stochastic gradient

![](./images/dmm/13.png)

![](./images/dmm/14.png)

### Structure Inference Network

![](./images/dmm/15.png)

![](./images/dmm/16.png)

![](./images/dmm/17.png)

![](./images/dmm/18.png)


In [6]:
class Combiner(nn.Module):
    """
    Building block of the guide (the variational distribution): computes the
    parameters of q(z_t | z_{t-1}, x_{t:T}). The observations x_{t:T} enter
    only through an RNN hidden state that is passed to `forward`.
    """
    def __init__(self, z_dim, rnn_dim):
        super(Combiner, self).__init__()
        # affine layers: one lifts z into the rnn hidden space, two read out
        # the mean and the (pre-softplus) sigma from the combined hidden state
        self.lin_z_to_hidden = nn.Linear(z_dim, rnn_dim)
        self.lin_hidden_to_mu = nn.Linear(rnn_dim, z_dim)
        self.lin_hidden_to_sigma = nn.Linear(rnn_dim, z_dim)
        # activations: tanh on the lifted z, softplus on the sigma read-out
        self.tanh = nn.Tanh()
        self.softplus = nn.Softplus()

    def forward(self, z_t_1, h_rnn):
        """
        Take the latent z at time step t-1 and the RNN hidden state h(x_{t:T})
        and return the mean and sigma vectors of the (diagonal) gaussian
        distribution q(z_t | z_{t-1}, x_{t:T}).
        """
        # lift z_{t-1} into the rnn hidden space
        z_features = self.tanh(self.lin_z_to_hidden(z_t_1))
        # average the lifted latent with the rnn hidden state
        h_combined = 0.5 * (z_features + h_rnn)
        # mean read-out is unconstrained
        mu = self.lin_hidden_to_mu(h_combined)
        # sigma read-out goes through softplus so that it is positive
        pre_sigma = self.lin_hidden_to_sigma(h_combined)
        sigma = self.softplus(pre_sigma)
        # the pair can be fed into Normal
        return mu, sigma

## Guide 

In [7]:
def guide(self, mini_batch, mini_batch_reversed, mini_batch_mask,
          mini_batch_seq_lengths, annealing_factor=1.0):
    """
    The guide q(z_{1:T} | x_{1:T}) (i.e. the variational distribution).

    Runs the rnn over the time-reversed observations, then samples the latents
    z_1 ... z_T one step at a time, each conditioned on the previous latent
    and on the rnn hidden state for that step.

    mini_batch: observations; size(0) is used as the batch size and size(1)
        as the number of time steps
    mini_batch_reversed: the input that is pushed through the rnn
    mini_batch_mask: sliced per time step as [:, t-1:t] to mask log-densities
    mini_batch_seq_lengths: forwarded to `poly.pad_and_reverse`
    annealing_factor: multiplies the mask applied to each latent sample site
    """
    # this is the number of time steps we need to process in the mini-batch
    T_max = mini_batch.size(1)
    # register all pytorch (sub)modules with pyro
    pyro.module("dmm", self)

    # broadcast the rnn initial state over the batch dimension and make it
    # contiguous; this is done on cpu as well as gpu because nn.RNN expects an
    # initial state whose batch dimension matches the input
    # (assumes self.h_0 has shape (1, 1, rnn_dim) -- TODO confirm against the owning module)
    h_0_contig = self.h_0.expand(1, mini_batch.size(0), self.rnn.hidden_size).contiguous()
    # push the observed x's through the rnn;
    # rnn_output contains the hidden state at each time step
    rnn_output, _ = self.rnn(mini_batch_reversed, h_0_contig)
    # reverse the time-ordering in the hidden state and un-pack it
    # NOTE(review): `poly` is not imported in the visible part of this file;
    # presumably a data-loading helper module -- confirm
    rnn_output = poly.pad_and_reverse(rnn_output, mini_batch_seq_lengths)
    # set z_prev = z_q_0 to setup the recursive conditioning in q(z_t |...)
    z_prev = self.z_q_0

    # sample the latents z one time step at a time
    for t in range(1, T_max + 1):
        # get the parameters for the distribution q(z_t | z_{t-1}, x_{t:T})
        z_mu, z_sigma = self.combiner(z_prev, rnn_output[:, t - 1, :])
        # sample z_t from the distribution q(z_t|...)
        z_t = pyro.sample("z_%d" % t, dist.Normal, z_mu, z_sigma,
                          log_pdf_mask=annealing_factor * mini_batch_mask[:, t - 1:t])
        # the latent sampled at this time step will be conditioned upon in the next time step
        # so keep track of it
        z_prev = z_t

## Pyro Model

In [5]:

def model(self, mini_batch, mini_batch_reversed, mini_batch_mask,
          mini_batch_seq_lengths, annealing_factor=1.0):
    """
    The generative model p(x_{1:T} | z_{1:T}) p(z_{1:T}).

    Walks through the time steps of the mini-batch; at each step it samples
    the latent z_t from the transition distribution and observes x_t under the
    bernoulli emission distribution. `mini_batch_reversed` and
    `mini_batch_seq_lengths` are accepted but not read by this function.
    """
    # number of time steps to unroll
    num_steps = mini_batch.size(1)

    # make every pytorch (sub)module known to pyro
    pyro.module("dmm", self)

    # z_0 seeds the recursion, since step 1 has no previous latent
    latent = self.z_0

    for idx in range(num_steps):
        # sample-site names are 1-based while tensor indexing is 0-based
        t = idx + 1

        # parameters of the diagonal gaussian p(z_t | z_{t-1})
        loc, scale = self.trans(latent)

        # mask column for this time step, kept 2-dimensional by slicing
        step_mask = mini_batch_mask[:, idx:t]

        # draw z_t; the annealing factor scales the mask at the latent site only
        latent = pyro.sample("z_%d" % t, dist.Normal, loc, scale,
                             log_pdf_mask=annealing_factor * step_mask)

        # probabilities of the bernoulli likelihood p(x_t | z_t)
        probs = self.emitter(latent)

        # condition on the observed x_t (mask is not annealed here)
        pyro.observe("obs_x_%d" % t, dist.bernoulli, mini_batch[:, idx, :],
                     probs,
                     log_pdf_mask=step_mask)

## Packaging the Model and Guide as a Pytorch Module

In [9]:
class DMM(nn.Module):
    """
    This pytorch Module encapsulates the model as well as the
    variational distribution (the guide) for the Deep Markov Model

    Constructor arguments:
        input_dim: size of one observation vector x_t
        z_dim: size of the latent vector z_t
        emission_dim: hidden size of the Emitter network
        transition_dim: hidden size of the GatedTransition network
        rnn_dim: hidden size of the rnn used by the guide
        rnn_dropout_rate: dropout rate handed to nn.RNN
        num_iafs, iaf_dim: accepted for interface compatibility; not used
            anywhere in this class as written
        use_cuda: if true, all (sub)modules are moved to the gpu
    """
    def __init__(self, input_dim=88, z_dim=100, emission_dim=100,
                 transition_dim=200, rnn_dim=600, rnn_dropout_rate=0.0,
                 num_iafs=0, iaf_dim=50, use_cuda=False):
        super(DMM, self).__init__()
        # instantiate pytorch modules used in the model and guide below
        self.emitter = Emitter(input_dim, z_dim, emission_dim)
        self.trans = GatedTransition(z_dim, transition_dim)
        self.combiner = Combiner(z_dim, rnn_dim)
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=rnn_dim, nonlinearity='relu',
                          batch_first=True, bidirectional=False, num_layers=1, dropout=rnn_dropout_rate)

        # define (trainable) parameters z_0 and z_q_0 that help define the probability
        # distributions p(z_1) and q(z_1)
        # (since for t = 1 there are no previous latents to condition on)
        self.z_0 = nn.Parameter(torch.zeros(z_dim))
        self.z_q_0 = nn.Parameter(torch.zeros(z_dim))
        # define a (trainable) parameter for the initial hidden state of the rnn;
        # it has batch size 1 and is broadcast over the batch in the guide
        self.h_0 = nn.Parameter(torch.zeros(1, 1, rnn_dim))

        self.use_cuda = use_cuda
        # if on gpu cuda-ize all pytorch (sub)modules
        if use_cuda:
            self.cuda()

    # the model p(x_{1:T} | z_{1:T}) p(z_{1:T})

    def model(self, mini_batch, mini_batch_reversed, mini_batch_mask,
              mini_batch_seq_lengths, annealing_factor=1.0):
        """
        Sample the latents z_t from the transition distribution and observe
        the x_t under the bernoulli emission distribution, one time step at a
        time. `mini_batch_reversed` and `mini_batch_seq_lengths` are accepted
        but not read here.
        """
        # this is the number of time steps we need to process in the mini-batch
        T_max = mini_batch.size(1)

        # register all pytorch (sub)modules with pyro
        pyro.module("dmm", self)

        # set z_prev = z_0 to setup the recursive conditioning
        z_prev = self.z_0

        # sample the latents z and observed x's one time step at a time
        for t in range(1, T_max + 1):
            # the next three lines of code sample z_t ~ p(z_t | z_{t-1})
            # first compute the parameters of the diagonal gaussian distribution p(z_t | z_{t-1})
            z_mu, z_sigma = self.trans(z_prev)

            # then sample z_t according to dist.Normal(z_mu, z_sigma)
            z_t = pyro.sample("z_%d" % t, dist.Normal, z_mu, z_sigma,
                              log_pdf_mask=annealing_factor * mini_batch_mask[:, t - 1:t])

            # compute the probabilities that parameterize the bernoulli likelihood
            emission_probs_t = self.emitter(z_t)

            # the next statement instructs pyro to observe x_t according to the
            # bernoulli distribution p(x_t|z_t)
            pyro.observe("obs_x_%d" % t, dist.bernoulli, mini_batch[:, t - 1, :],
                         emission_probs_t,
                         log_pdf_mask=mini_batch_mask[:, t - 1:t])

            # the latent sampled at this time step will be conditioned upon
            # in the next time step so keep track of it
            z_prev = z_t

    # the guide q(z_{1:T} | x_{1:T}) (i.e. the variational distribution)
    def guide(self, mini_batch, mini_batch_reversed, mini_batch_mask,
              mini_batch_seq_lengths, annealing_factor=1.0):
        """
        Run the rnn over `mini_batch_reversed`, then sample the latents z_t
        one time step at a time, each conditioned on the previous latent and
        on the rnn hidden state for that step.
        """
        # this is the number of time steps we need to process in the mini-batch
        T_max = mini_batch.size(1)
        # register all pytorch (sub)modules with pyro
        pyro.module("dmm", self)

        # broadcast the (1, 1, rnn_dim) initial state over the batch dimension
        # and make it contiguous; this is done on cpu as well as gpu because
        # nn.RNN expects an initial state whose batch dimension matches the input
        h_0_contig = self.h_0.expand(1, mini_batch.size(0), self.rnn.hidden_size).contiguous()
        # push the observed x's through the rnn;
        # rnn_output contains the hidden state at each time step
        rnn_output, _ = self.rnn(mini_batch_reversed, h_0_contig)
        # reverse the time-ordering in the hidden state and un-pack it
        # NOTE(review): `poly` is not imported in the visible part of this file;
        # presumably a data-loading helper module -- confirm
        rnn_output = poly.pad_and_reverse(rnn_output, mini_batch_seq_lengths)
        # set z_prev = z_q_0 to setup the recursive conditioning in q(z_t |...)
        z_prev = self.z_q_0

        # sample the latents z one time step at a time
        for t in range(1, T_max + 1):
            # get the parameters for the distribution q(z_t | z_{t-1}, x_{t:T})
            z_mu, z_sigma = self.combiner(z_prev, rnn_output[:, t - 1, :])
            # sample z_t from the distribution q(z_t|...)
            z_t = pyro.sample("z_%d" % t, dist.Normal, z_mu, z_sigma,
                              log_pdf_mask=annealing_factor * mini_batch_mask[:, t - 1:t])
            # the latent sampled at this time step will be conditioned upon in the next time step
            # so keep track of it
            z_prev = z_t

## Stochastic Variational Inference

In [11]:
# build the Deep Markov Model from the network sizes and the run options
# NOTE(review): `input_dim`, `z_dim`, `emission_dim`, `transition_dim`,
# `rnn_dim` and `args` are not defined in the visible part of this file --
# they must be set before this cell runs
dmm = DMM(
    input_dim, z_dim, emission_dim, transition_dim, rnn_dim,
    args.rnn_dropout_rate, args.num_iafs, args.iaf_dim, args.cuda,
)

# hyperparameters handed to the ClippedAdam optimizer
adam_params = dict(
    lr=args.learning_rate,
    betas=(args.beta1, args.beta2),
    clip_norm=args.clip_norm,
    lrd=args.lr_decay,
    weight_decay=args.weight_decay,
)
optimizer = ClippedAdam(adam_params)

NameError: name 'input_dim' is not defined

In [None]:
# setup inference algorithm: stochastic variational inference over
# dmm.model and dmm.guide, with parameter updates done by `optimizer`
# NOTE(review): the positional "ELBO" loss name and the `trace_graph` keyword
# look like an older Pyro SVI signature -- confirm against the installed version
svi = SVI(dmm.model, dmm.guide, optimizer, "ELBO", trace_graph=False)