In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle

from scipy.stats import multivariate_normal
from sklearn.cluster import KMeans

from utils import color_cycle

# Lecture 7: Mixture models and Expectation Maximization	

* Clustering and Latent variable models 
* Mixture models
* Expectation Maximization

Murphy **Chapter 11** (sections 11.1, 11.2, 11.3, 11.4, 11.4.1, 11.4.2, 11.4.7)

_Recommended reading_:
* Christopher Bishop's [Pattern Recognition and Machine Learning](https://www.microsoft.com/en-us/research/people/cmbishop/prml-book/) **Chapter 9**
* Have a look at the scikit-learn demo on [Comparing different clustering algorithms on toy datasets](https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html)

## A first look at a clustering problem: K-means

#### Load old faithful dataset

In [None]:
# Read the Old Faithful measurements and standardize every column
# (subtract the column mean, divide by the column standard deviation).
points_oldfaith = np.loadtxt("datasets/oldFaith.txt")
col_mean = points_oldfaith.mean(axis=0, keepdims=True)
col_std = points_oldfaith.std(axis=0, keepdims=True)
points_oldfaith = (points_oldfaith - col_mean) / col_std
plt.scatter(points_oldfaith[:, 0], points_oldfaith[:, 1]);

### K-means in scikit-learn

In [None]:
# Fit scikit-learn's K-means on the standardized data
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, n_init='auto', verbose=1, random_state=1)
kmeans.fit(points_oldfaith)
centers_sklearn = kmeans.cluster_centers_
labels_sklearn = kmeans.labels_

# For every cluster draw its centroid (cross) followed by the points assigned to it
for k in range(num_clusters):
    centroid = centers_sklearn[k]
    members = points_oldfaith[labels_sklearn == k]
    plt.plot(centroid[0], centroid[1], 'x', ms=10)
    plt.scatter(members[:, 0], members[:, 1], alpha=0.5)

### K-means: do it yourself

```
initialize K centroid vectors (cluster centers) m_k

for iter from 1 to max_iter or until convergence:

    for mu from 1 to N:
        # Assign each data point to its closest cluster center
        k_mu ← arg min || x_mu − m_k ||

    for k from 1 to K:
        # Update each cluster center m_k by computing the mean of all points assigned to it
```

In [None]:
# K-means settings
num_clusters = 2
num_steps = 100
tol = 1e-1
seed = 20
keep_story = True

# hand-picked initial centroids, one row per cluster
# (presumably passing center=None instead lets k_means choose its own start - verify in your implementation)
center = np.array([[-1,1],
                  [1,-1.1]])

# ...you will write your own implementation of k_means...
# res_kmeans = mixture_models_code.k_means(points_oldfaith, num_clusters, num_steps, tol,
#                                          center=center, seed=seed, verbose=True, keep_story=keep_story)
# pickle.dump(res_kmeans, open("datasets/res_kmeans.p", "wb" ))

# load results obtained with my own kmeans implementation
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# The context manager makes sure the file handle is closed again.
with open("datasets/res_kmeans.p", "rb") as pickle_file:
    res_kmeans = pickle.load(pickle_file)
resp, center, dist_centers, distortions, resps, centers = res_kmeans

#### Visualize trajectory of centroids

In [None]:
# Faded data in the background; on top, for every centroid:
# 'o' = starting position, '.-' = path through the iterations, 'x' = final position.
plt.scatter(points_oldfaith[:,0], points_oldfaith[:,1], alpha=0.2)
for k in range(num_clusters):
    path = centers[:, k]
    colour = color_cycle[k]
    plt.plot(path[0, 0], path[0, 1], 'o', c=colour);
    plt.plot(path[:, 0], path[:, 1], '.-', c=colour);
    plt.plot(path[-1, 0], path[-1, 1], 'x', ms=10, c=colour);

#### Visualize trajectory in detail with cluster assignments through iterations

In [None]:
tot_steps = len(resps)

# squeeze=False keeps the returned axes array two-dimensional even when there is
# only one step, so indexing by step below always works
fig, axs = plt.subplots(tot_steps, 1, squeeze=False)
axs = axs[:, 0]
fig.set_size_inches(10, 3 * tot_steps)
# the loop variables get their own names so the final `resp` / `center`
# loaded from the pickle are not overwritten by this cell
for step, (resp_step, center_step) in enumerate(zip(resps, centers)):
    ax = axs[step]
    ax.set_aspect('equal', 'box')
    # centroids and the points currently assigned to each of them
    for k in range(num_clusters):
        ax.plot(center_step[k,0], center_step[k,1], 'x', ms=8)
        ax.scatter(points_oldfaith[resp_step==k,0], points_oldfaith[resp_step==k,1], alpha=0.1)
    # for every pair of centroids: dotted connecting segment, its midpoint, and a
    # unit-half-length segment through the midpoint orthogonal to the connection
    for k1 in range(num_clusters):
        for k2 in range(k1 + 1, num_clusters):
            fromto = center_step[[k1,k2]]
            middle_point = fromto.mean(0)
            ax.plot(fromto[:,0], fromto[:,1], ':', c="black", alpha=0.5)
            ax.plot(middle_point[0], middle_point[1], 'o', c="black", ms=2, alpha=0.5)
            dk1k2 = fromto[1] - fromto[0]
            orth_dk1k2 = np.array([-dk1k2[1], dk1k2[0]])
            # out-of-place division: also valid if the centroids are stored as integers
            orth_dk1k2 = orth_dk1k2 / np.sqrt((orth_dk1k2**2).sum())
            ax.plot([middle_point[0]-orth_dk1k2[0],middle_point[0]+orth_dk1k2[0]],
                    [middle_point[1]-orth_dk1k2[1],middle_point[1]+orth_dk1k2[1]], color="black", alpha=0.5)

plt.tight_layout();

We can visualize the progression of K-means looking at a _cost function_ representing the **distortion** in a code based on the centroids only:

$$
J=\frac{1}{N}\sum_{\mu=1}^{N}\left\Vert x_{\mu}-m_{k_{\mu}}\right\Vert ^{2}
$$

Have a look at Murphy section 11.4.2.6 and Figure 9.2 in Bishop's book.

In [None]:
plt.figure(figsize=(12,4))

# Left panel: distortion after every half-step; even indices are coloured blue,
# odd indices red (see the legend printed below)
plt.subplot(121)
half_steps = np.arange(len(distortions))
is_even = (half_steps % 2 == 0)[:, None]
colors = np.where(is_even, [0., 0., 1.], [1., 0., 0.])
plt.plot(half_steps, distortions, c=(0,1,0), alpha=0.5);
plt.scatter(half_steps, distortions, c=colors);
plt.xlabel("E/M step")
plt.ylabel("distortion");

# Right panel: centroid displacement per step
plt.subplot(122)
plt.plot(dist_centers, '.-');
plt.xlabel("steps")
plt.ylabel("center displacement");

plt.tight_layout();

for legend_line in ("Left panel",
                    "Blue points: compute assignments to centroids (E step)",
                    "Red points: recompute centroids (M step)"):
    print(legend_line)

#### Run K-means a bunch of times and look at the attractors of the algorithms

In [None]:
num_seeds = 1000

# here's how I produced the results from many different initializations
# allcenters = []
# for seed in range(num_seeds):
#     center = 4 * np.random.rand(num_clusters, 2) - 2
#     _, _, _, _, _, centers = mixture_models_code.k_means(points_oldfaith, num_clusters, num_steps, tol,
#                                                          center=center, seed=seed, keep_story=keep_story)
#     allcenters.append(centers)
# pickle.dump(allcenters, open("datasets/allcenters.p", "wb" ))

# load results obtained with my own kmeans implementation
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("datasets/allcenters.p", "rb") as pickle_file:
    allcenters = pickle.load(pickle_file)

# only the trajectory of the first centroid (k = 0) is drawn for each run:
# a faint dot at its starting position and a faint line along its path
k = 0
# `run_centers` (rather than `centers`) keeps the single-run trajectory loaded
# earlier intact, so the previous plotting cells can still be re-run
for run_centers in allcenters:
    plt.plot(run_centers[0,k,0], run_centers[0,k,1], '.', c=color_cycle[k], alpha=0.05);
    plt.plot(run_centers[:,k,0], run_centers[:,k,1], '-', c=color_cycle[k], alpha=0.1);

## Latent variable models

There are two approaches to model correlations in data:
* Use **observable variables only**, with complex interactions between them
* Use **additional latent variables**  as in **Latent Variable Models** (LVMs)

Latent variable models have fewer parameters and require less data to learn, _but_ they are harder to learn.

<center> <img src="figs/latent_variable_models.png" width=300></center>

Left: $3+8+2+2+2=17$. Right: $3+8+16+32=59$.

## Clustering as a problem with latent variables

Consider a model:
$$p(x,k)=p(k)p(x|k)$$
with $x$ observed (discrete or continuous) and $k=1,\ldots,K$ a discrete latent variable.
The log likelihood is
$$\log L = \sum_\mu \log p(x^\mu)\qquad p(x^\mu)=\sum_k p(x^\mu,k)$$

The sum expresses that different clusters contribute to the probability of the data point $x^\mu$.
In (hard) clustering, the sum is replaced by the single most contributing term (NB: $k^\mu$ depends on the parameters of the model and changes during the algorithm):
$$p(x^\mu)=\sum_k p(x^\mu,k)\approx p(x^\mu,k^\mu)\qquad k^\mu = \text{argmax}_k p(x^\mu,k)$$
$$\log L \approx \sum_\mu \log p(k^\mu) + \sum_\mu \log p(x^\mu|k^\mu)$$

## Mixtures of multinoullis

A distribution over $d$ dimensional binary vectors $x=(x_1,\ldots,x_d)$ with $x_i=0,1$.
$$p(x|\theta)=\sum_{k=1}^K \pi_k p(x|k) \qquad p(x|k)=\prod_{j=1}^d \mu_{jk}^{x_j}(1-\mu_{jk})^{1-x_j}$$
with $\theta=\{\pi_k,\mu_{jk}\}$ and $\sum_k \pi_k=1$ and $0 \le \mu_{jk}\le 1$.

We derive a clustering algorithm by adding Lagrange multipliers:
$$\log L \approx \sum_\mu \log \pi_{k^\mu} +\sum_{\mu,j} \left(x_j^\mu \log \mu_{jk^\mu} +(1-x_j^\mu)\log (1-\mu_{jk^\mu})\right)+ \lambda \left(\sum_k \pi_k -1\right)$$
with $k^\mu = \text{argmax}_k p(x^\mu,k)$.

## Mixtures of multinoullis

Setting the derivatives to zero, we obtain:
$$\frac{\partial \log L}{\partial \pi_k} =\frac{1}{\pi_k}\sum_\mu \delta_{k,k^\mu} +\lambda=0\quad\rightarrow\quad \pi_k=\frac{N_k}{N}$$with $N_k=\sum_\mu \delta_{k,k^\mu}$ the number of data samples that are assigned to cluster $k$ and $N=\sum_k N_k$ the total number of data samples.$$\frac{\partial \log L}{\partial \mu_{jk}}=\frac{1}{\mu_{jk}}\sum_\mu x_j^\mu \delta_{k,k^\mu} -\frac{1}{1-\mu_{jk}}\sum_\mu (1-x_j^\mu) \delta_{k,k^\mu}$$
$$=N_k\left(\frac{m_{jk}}{\mu_{jk}}-\frac{1-m_{jk}}{1-\mu_{jk}}\right)=0 \quad\rightarrow\quad \mu_{jk}=m_{jk}$$with$$m_{jk}=\frac{1}{N_k}\sum_\mu x_j^\mu \delta_{k,k^\mu}=\frac{1}{N_k}\sum_{\mu \in k}x_j^\mu$$
the mean of the data that is assigned to cluster $k$.

## Mixtures of multinoullis: pseudo code

```

initialize pi_k
initialize mu_jk

for iter from 1 to max_iter:
    
    for mu from 1 to N:
        k_mu ← argmax(p_mu_k)
        
    for k from 1 to K:
        compute Nk
        pi_k ← N_k / N
        
    for k from 1 to K:
        
        for j from 1 to d:
            compute m_jk
            mu_jk ← m_jk
```

## Mixture of Bernoullis on MNIST

#### Load MNIST dataset (the lazy way)

In [10]:
# torch
import torch
import torchvision
import torchvision.transforms as transforms

# CONSTRUCT MY OWN MNIST LOADER
kwargs = {'num_workers': 0, 'pin_memory': False}

# side length used to reshape the flattened images back to size x size for display
size = 28

# convert to tensors and normalize with the MNIST params
# (optional extras kept for reference: transforms.RandomAffine(0, translate=(1,1)), transforms.Resize(size))
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# training split read from disk (no download)
mnist_train = torchvision.datasets.MNIST(
    '/home/ai/repos/conv_emerge/data', # change this to your data directory
    train=True,
    download=False,
    transform=mnist_transform,
)

# a single batch of 60000 items, unshuffled: the first batch is the whole dataset
train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=60000, shuffle=False, **kwargs)

X, y = next(iter(train_loader))
X.squeeze_();

In [11]:
# select the first three digits...
X = torch.vstack([X[y==0], X[y==1], X[y==2]])
y = torch.cat([y[y==0], y[y==1], y[y==2]])
# ...shuffle them with a locally seeded generator, so the same subset (and the
# same example images below) is obtained on every run without touching the
# global NumPy random state
perm = np.random.default_rng(0).permutation(len(X))
X = X[perm]
y = y[perm]

# ...and extract a smaller chunk and binarize
# (the 0.5 threshold is applied to the pixel values as produced by the loader cell)
num_data = 1000
data = X[:num_data].flatten(1).numpy()
data = 1. * (data > 0.5)

In [None]:
# display one binarized digit together with its label
i = 16
digit_image = data[i].reshape((size, size))
plt.imshow(digit_image)
print("label:", int(y[i]));

#### Fit mixture to data

In [None]:
# EM settings for the Bernoulli mixture
num_mixture = 3
num_steps = 20
tol = 1e-2
seed = 1
verbose = True
keep_story = True

# uniform mixing coefficients; pixel means drawn uniformly in [0.25, 0.75)
# from a generator seeded with `seed`, so the initialization is reproducible
pi = 1/num_mixture * np.ones(num_mixture)
mu = np.random.default_rng(seed).random((num_mixture, data.shape[-1])) * 0.5 + 0.25

# you will write your own implementation of EM
# res_bernoulli = mixture_models_code.em_bernoulli(data, num_mixture, num_steps, tol, pi=pi, mu=mu,
#                                                   seed=seed, verbose=True, keep_story=keep_story)
# pickle.dump(res_bernoulli, open("datasets/res_bernoulli.p", "wb" ))

# load results obtained with my own EM implementation
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("datasets/res_bernoulli.p", "rb") as pickle_file:
    res_bernoulli = pickle.load(pickle_file)
resp, mu, logliks, resps, mus, pis = res_bernoulli

plt.plot(logliks, '.-')
plt.xlabel('iteration')
plt.ylabel('log likelihood');

#### Visualize mean vectors

In [None]:
# one panel per mixture component, showing its mean vector as a size x size image
plt.figure(figsize=(20,4))
for k in range(num_mixture):
    panel = k + 1
    mean_image = mu[k].reshape((size, size))
    plt.subplot(1, num_mixture, panel)
    plt.imshow(mean_image);
plt.tight_layout();

## Mixtures of multinomials with all 10 digits

<center><img src="figs/multinullis_murphy.png" width=400></center>

Result is not very good:
* Some digits are not represented, others are doubled
* Result is not reproducible: different initializations yield different results
* Some digits might require multiple clusters (different ways to write a 4)

The assumption that all pixels are independent given the cluster is **too simple**.

## Mixtures of Gaussians

A distribution over $d$ dimensional continuous vectors.
$$p(x|\theta)=\sum_{k=1}^K \pi_k \mathcal{N}(x|\mu_k,\Sigma_k)$$
with $\theta=\{\pi_k, \mu_k,\Sigma_k\}$.


#### Let's write a function to draw from a Gaussian Mixture Model

In [15]:
def draw_mgm(mixing_coeff, mus, sigmas, num_samples):
    """Draw samples from a mixture of Gaussians.

    Parameters
    ----------
    mixing_coeff : array-like, shape (K,)
        Mixing coefficients; they must sum to 1.
    mus : array-like, shape (K,) or (K, d)
        Component means. A one-dimensional `mus` selects the scalar case.
    sigmas : array-like
        Scalar case (`mus` one-dimensional): shape (K,), the per-component
        *standard deviations* (they multiply a standard normal draw).
        Vector case (`mus` two-dimensional): shape (K, d, d), the
        per-component *covariance matrices*.
    num_samples : int
        Total number of points to draw.

    Returns
    -------
    points : ndarray, shape (num_samples,) or (num_samples, d)
        The sampled points.
    coeff_draw : ndarray of int, shape (num_samples,)
        Index of the component each point was drawn from.
    num_class : ndarray of float, shape (K,)
        Number of points drawn from each component.

    Raises
    ------
    ValueError
        If the mixing coefficients do not sum to 1.
    """
    # accept plain Python lists as well as arrays
    mixing_coeff = np.asarray(mixing_coeff, dtype=float)
    mus = np.asarray(mus, dtype=float)
    sigmas = np.asarray(sigmas, dtype=float)

    if np.abs(mixing_coeff.sum() - 1.) > 1e-10:
        raise ValueError("mixing coefficients do not sum to 1!")
    num_mixture = len(mixing_coeff)

    # Branch on the rank of `mus`, not on the dimension: means of shape (K, 1)
    # must go through the multivariate sampler, otherwise the scalar formula
    # produces a (1, n) array that cannot be stored in the (n, 1) slots.
    scalar_case = mus.ndim == 1
    if scalar_case:
        points = np.zeros(num_samples)
    else:
        points = np.zeros((num_samples, mus.shape[1]))

    # first pick a component for every sample, then fill in each component's points
    coeff_draw = np.random.choice(num_mixture, p=mixing_coeff, size=num_samples)
    num_class = np.zeros(num_mixture)
    for k in range(num_mixture):
        members = coeff_draw == k
        count = int(members.sum())
        num_class[k] = count
        if scalar_case:
            points[members] = mus[k] + sigmas[k] * np.random.randn(count)
        else:
            points[members] = np.random.multivariate_normal(mus[k], sigmas[k], size=count)
    return points, coeff_draw, num_class

#### Draw data from three Gaussians (Cfr. Figure 9.5 in Bishop's book)

In [16]:
# number of points to draw, mixing coefficients and component means
num_samples = 500

mixing_coeff = np.array([0.5, 0.3, 0.2])

mus = np.array([[0.25, 0.45],
                [0.5, 0.5],
                [0.8, 0.6]])

# covariance matrices are assembled from eigenvectors and eigenvalues:
# the rows of UT are the two orthonormal eigenvectors shared by all components
UT = np.array([[1, 1],
               [-1, 1]]) / np.sqrt(2)

# eigenvalues per component, one row each
var_wide, var_narrow = 0.2**2, 0.05**2
lambdas = np.array([[var_wide, var_narrow],
                    [var_narrow, var_wide],
                    [var_wide, var_narrow]])

# sigmas[m, i, j] = sum_k UT[k, i] * lambdas[m, k] * UT[k, j]
sigmas = np.einsum('ki,mk,kj->mij', UT, lambdas, UT)

In [None]:
# draw points
points, coeff_draw, num_class = draw_mgm(mixing_coeff, mus, sigmas, num_samples)

plt.figure(figsize=(10,4))

# left: points coloured by the component they were drawn from
# (loop over the actual number of components instead of a hard-coded 3)
plt.subplot(121)
for k in range(len(mixing_coeff)):
    ind = coeff_draw==k
    plt.scatter(points[ind,0], points[ind,1], alpha=0.5);

# right: the same points without the component information
plt.subplot(122)
plt.scatter(points[:,0], points[:,1], alpha=0.5);

## Expectation Maximization

Consider a model $p(x,k|\theta)$.
The problem is to find $\theta$ that maximizes the data likelihood on the observed data $x$:
$$L(\theta)=\sum_x D(x)\log p(x|\theta)=\sum_x D(x) \log \sum_k p(x,k|\theta)$$
with $D(x)$ the empirical distribution of $x$ (the empirical distribution is the distribution implied by the data set: $D(x)=\frac{1}{N}\sum_{\mu=1}^N \delta_{x,x^\mu}$, so that $L(\theta)=\frac{1}{N}\sum_\mu \log p(x^\mu|\theta)$). The $\log \sum$ makes optimization hard.

Instead of maximizing $L$, we compute a lower bound and maximize that. For given $x$:
$$L_x(\theta)=\log \sum_k p(x,k|\theta)=\log \sum_k q_x(k)\frac{ p(x,k|\theta)}{q_x(k)}\ge \sum_k q_x(k) \log \frac{p(x,k|\theta)}{q_x(k)}=Q_x(\theta,q)$$This is called a **variational** or **Jensen bound**.

The bound can be understood in terms of KL divergence:$$L_x(\theta)-Q_x(\theta,q)=\log p(x|\theta)-\sum_k q_x(k) \log \frac{p(x,k|\theta)}{q_x(k)}= \sum_k q_x(k) \log \frac{q_x(k)}{p(k|x,\theta)} =KL(q_x|p(\cdot|x,\theta))$$
The bound $L_x(\theta)\ge Q_x(\theta,q)$ thus follows also from $KL\ge 0$.

**E step:** Maximizing $Q_x(\theta_t,q)$ (or minimizing $KL$) with respect to $q_x$ gives
$$q_x^*(k)=p(k|x,\theta_t)\qquad KL(q_x^*(k)|p(k|x,\theta_t))=0$$
with $\theta_t$ the parameters at iteration $t$ of the algorithm. $r^\mu_k= q^*_{x^\mu}(k)$ is called the **responsibility** of cluster $k$ for data point $x^\mu$.
NB: we have a different distribution $q_x$ for each data point $x$.
NB: $KL(q_x^*(k)|p(k|x,\theta_t))=0$ can only be obtained when variational model $q$ is sufficiently expressive.

Substitution gives
$$Q_x(\theta,q_x^*)=\sum_k p(k|x,\theta_t)\log \frac{p(x,k|\theta)}{p(k|x,\theta_t)}$$Note that:$$Q(\theta_t,q^*)=L(\theta_t)$$

**M step:** maximize $Q(\theta)=\sum_x D(x) Q_x(\theta,q_x^*)$ wrt $\theta$.
$$\theta_{t+1} =\text{argmax}_{\theta} \sum_x D(x) Q_x(\theta,q_x^*)=\text{argmax}_{\theta}\sum_x D(x) \sum_k p(k|x,\theta_t)\log p(x,k|\theta)$$
Note that the $\sum_k$ now appears outside the log, whereas in the original log likelihood $L(\theta)=\sum_x D(x) \log \sum_k p(x,k|\theta)$ it appears inside the log.

$\log p(x,k|\theta)$ is concave in $\theta$ because $p(x,k|\theta)$ is an exponential model.
As a result, the maximization in the M step can be easily computed (for exponential models).

## Concavity of $\log p(x|\theta)$ for an exponential model

Consider an exponential model
$$p(x|\theta)=\exp\left(\sum_a\theta_a \phi_a(x) - \log Z(\theta) \right)\qquad Z=\sum_x \exp\left(\sum_a\theta_a \phi_a(x) \right)$$It is easy to show that$$\frac{\partial \log Z}{\partial \theta_a}=\langle\phi_a\rangle\qquad \frac{\partial^2 \log Z}{\partial \theta_a\partial \theta_b}=\langle\phi_a\phi_b\rangle-\langle\phi_a\rangle\langle\phi_b\rangle=C_{ab}$$
The **covariance matrix** $C$ is positive semi-definite ($\sum_{ab}v_a C_{ab}v_b\ge 0$ for all vectors $v_a$). Therefore $\log Z(\theta)$ is convex in $\theta$. Since $\log p(x|\theta) =\sum_a\theta_a \phi_a(x) - \log Z(\theta)$ is a linear function of $\theta$ minus a convex one, we conclude that $\log p(x|\theta)$ is concave in $\theta$.

#### EM increases the data log likelihood

For subsequent EM steps we have:
$$L(\theta_{t})=Q(\theta_{t},q^*_t) \le Q(\theta_{t+1},q^*_t) \le Q(\theta_{t+1},q^*_{t+1}) =L(\theta_{t+1})$$
First and second equality because the model $q^*$ is sufficiently expressive.
First inequality because of M-step/maximization.
Second inequality because of E-step/Jensen's bound.

<center> <img src="figs/em_likelihood.png" width=300></center>

Blue: $Q(\theta,q_t^*)$ with $Q(\theta_t,q_t^*)=L(\theta_t)$ (E step). Maximization yields $\theta_{t+1}$ (M step).
Green: $Q(\theta,q_{t+1}^*)$ with $Q(\theta_{t+1},q_{t+1}^*)=L(\theta_{t+1})$. Maximization yields $\theta_{t+2}$.

## Maximum likelihood via EM for mixture of Gaussians
The model is
$$p(x,k)=p(k)p(x|k)\qquad p(k) =\pi_k \qquad p(x|k)=\mathcal{N}(x|\mu_k,\Sigma_k)$$
with parameters $\pi_k,\mu_k,\Sigma_k$.

**E step:** For each data point, compute the responsibility
$$q_\mu^*(k)=p(k|x^\mu,\theta_t)=\frac{\pi_k \mathcal{N}(x^\mu|\mu_k,\Sigma_k)}{\sum_{k'} \pi_{k'} \mathcal{N}(x^\mu|\mu_{k'},\Sigma_{k'})}=r_{\mu k}\qquad k=1,\ldots, K$$
$p(k|x^\mu,\theta_t)$ does a 'soft assignment' of data point $x^\mu$ to Gaussian component $k$.

## Maximum likelihood via EM for mixture of Gaussians

**M step:**
$$\theta_{t+1} =\text{argmax}_\theta \sum_{x}D(x) \sum_k q^*_x(k) \log p(x,k|\theta)$$
$$= \text{argmax}_\theta \sum_{\mu,k} r_{\mu k} \left(\log \pi_k +\log \mathcal{N}(x^\mu | \mu_k,\Sigma_k)\right)$$with $\theta_{t+1}=\{\pi_k,\mu_k,\Sigma_k\}_{t+1}$. Define $r_k=\sum_\mu r_{\mu k}$. The solution is$$\pi_k = \frac{r_k}{N}\qquad (\mu_k)_i = \frac{\sum_\mu r_{\mu k} x_i^\mu}{r_k}\qquad \left(\Sigma_k\right)_{ij} =\frac{\sum_\mu r_{\mu k}x^\mu_i x^\mu_j}{r_k}-(\mu_k)_i(\mu_k)_j$$
This solution makes sense: the cluster mean and covariance are weighted sums of the points assigned to the cluster.

## Inference in a MGM

In [None]:
# EM settings for the mixture of Gaussians
num_mixture = 2
num_steps = 100
tol = 1e-2
seed = 1
verbose = True
keep_story = True
init_from_kmeans = False

if init_from_kmeans: # init mu from k means
    print("running k-means")
    # no free lunch: you will use your own kmeans code to initialize EM
    # NOTE(review): `mixture_models_code` is not imported in the visible part of
    # this notebook - import your own module before enabling this branch.
    mu, sigma, pi = mixture_models_code.init_from_k_means(points_oldfaith, num_mixture, num_steps, tol,
                                                          center=None, seed=seed, verbose=True, keep_story=False)

else: # fixed, hand-picked initialization (no randomness involved)
    # uniform mixing coefficients, two chosen means, identity covariances
    pi = 1/num_mixture * np.ones(num_mixture)
    mu = np.array([[-1.1,1],
                   [1,-1]])
    sigma = np.tile(np.eye(2), (num_mixture, 1, 1))

# you will write your own implementation of EM for MGM
# res_mgm = mixture_models_code.em_mgm(points_oldfaith, num_mixture, num_steps, tol, pi=pi, mu=mu, sigma=sigma, seed=seed,
#                                      verbose=True, keep_story=keep_story)
# pickle.dump(res_mgm, open("datasets/res_mgm.p", "wb" ))

# load results obtained with my own EM implementation
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("datasets/res_mgm.p", "rb") as pickle_file:
    res_mgm = pickle.load(pickle_file)
resp, mu, sigma, pi, logliks, resps, mus, sigmas, pis = res_mgm


plt.plot(logliks, '.-');
plt.xlabel('step')
plt.ylabel('loglikelihood');

#### Visualize dynamics of EM algorithm

In [None]:
tot_steps = len(resps)

# squeeze=False keeps the returned axes array two-dimensional even when there is
# only one step, so indexing by step below always works
fig, axs = plt.subplots(tot_steps, 1, squeeze=False)
axs = axs[:, 0]
fig.set_size_inches(10, 3 * tot_steps)
# the loop variables get their own names so the final `resp` / `mu` loaded
# from the pickle are not overwritten by this cell
for step, (resp_step, mu_step) in enumerate(zip(resps, mus)):
    ax = axs[step]
    ax.set_aspect('equal', 'box')
    # Soft assignments as colours: red channel = responsibility of component 0,
    # blue channel = responsibility of component 1 (this encoding only covers
    # two components). Drawn once per step - it does not depend on k.
    point_colors = np.column_stack([resp_step[0], np.zeros(resp_step.shape[1]), resp_step[1]])
    ax.scatter(points_oldfaith[:,0], points_oldfaith[:,1], alpha=0.1, color=point_colors)
    # current component means
    for k in range(num_mixture):
        ax.plot(mu_step[k,0], mu_step[k,1], 'x', ms=8, color='black')
plt.tight_layout();

#### Draw from inferred MGM

In [None]:
# sample from the fitted mixture and compare the samples with the data (black)
points, coeff_draw, num_class = draw_mgm(pi, mu, sigma, 20000)

# one colour per fitted component; the count comes from the fitted mixing
# coefficients rather than a hard-coded 3
for k in range(len(pi)):
    ind = coeff_draw==k
    plt.scatter(points[ind,0], points[ind,1], alpha=0.1);
plt.scatter(points_oldfaith[:,0], points_oldfaith[:,1], color='black', alpha=0.3);

# Variational EM

For complex models, finding a 'tabular' solution **$q(z)=p(z|x,\theta_t)$** in the E step may not always be possible.

In Variational EM we  approximate the responsibilities **$p(z|x,\theta)$** with a variational distribution **$q(z|x,\phi)$**, where:

  - **$q(z|x)=q(z|x,\phi)$** is a parametrized distribution with parameters **$\phi$**.
  - In the E step, **$q(z|x,\phi)$** is optimized not for each **$x$** separately, but for all **$x$** simultaneously.

$$Q_x(\theta,q)=\sum_k q_x(k) \log \frac{p(x,k|\theta)}{q_x(k)} \quad\to \quad Q(\theta,\phi)=\frac{1}{N}\sum_\mu \sum_z q(z|x^\mu,\phi) \log \frac{p(x^\mu,z|\theta)}{q(z|x^\mu,\phi)}$$

## Variational EM

In simple tabular EM, the E step yielded the solution that saturated the bound:

$$q_x^*(k) =p(k|x,\theta_t) \qquad Q_x(\theta_t,q^*_x) =L_x(\theta_t)$$

In variational EM this is no longer true:

$$q(z|x,\phi^*) \ne p(z|x,\theta_t)\qquad  Q_x(\theta_t,\phi^*) < L_x(\theta_t)$$

The consequence is that the property that each EM step increases the likelihood is no longer guaranteed (but if it does not, the bound gets better!).

Instead, one performs so-called **coordinate ascent** on $Q(\theta,\phi)$, by alternating a gradient step in $\phi$ for fixed $\theta$ (E) and a gradient step for $\theta$ for fixed $\phi$ (M).

### Variational Auto encoder (VAE)

Suppose we have some high-dimensional data **$x$** (images for instance) that we want to compress into latent variables **$z$**. We assume a model **$p(x,z|\theta)=p(z|\theta)p(x|z,\theta)$**. This is a generative model that generates images **$x$**:

$$z \sim p(z|\theta) \qquad x \sim p(x|z,\theta) \qquad \text{(decoder)}$$

The problem is to estimate **$\theta$** such that **$p$** generates images that are similar to a given data set by maximizing the log likelihood **$\sum_\mu \log p(x^\mu|\theta)$**.

<center>
  <img src="figs/vae_schematic.png" width=200>
    &nbsp; &nbsp; &nbsp; &nbsp;
  <img src="figs/autoenc.jpg" width=500>
</center>

### Variational Auto encoder (VAE)

Model the distribution **$p(x)$** of binary MNIST data.

<center><img src="figs/vae_schematic.png" width=200></center>

**Decoder:** Generate $z\sim \mathcal{N}(z|0,1)$ and $p(x|z,\theta)$ a MLP with one hidden layer:

$$p(x|z,\theta)=\prod_{i=1}^{n_x} \mu_i^{x_i}(1-\mu_i)^{1-x_i}\quad \mu_i=\sigma\left(\sum_{j=1}^{n_h} w^{(1)}_{ij}h_j +w^{(1)}_{i0}\right)\quad h_j= \tanh\left(\sum_{k=1}^{n_z} w^{(0)}_{jk}z_k +w^{(0)}_{j0}\right)$$

with **$n_x,n_z$** the dimensions of the **$x,z$** spaces, **$n_h$** the number of hidden units and $\theta=\{w^{(0)}_{jk},w^{(1)}_{ij}\}$.

### Variational Auto encoder (VAE)

Model the distribution **$p(x)$** of continuous Frey face data.

<center><img src="figs/vae_schematic.png" width=200></center>

**Decoder:** Generate $z\sim \mathcal{N}(z|0,1)$ and $p(x|z,\theta)$ two MLPs with one hidden layer:

$$p(x|z,\theta)=\prod_{i=1}^{n_x} \mathcal{N}(x_i|\mu_i,\sigma^2_i)\qquad 
\mu_i=\sum_{j=1}^{n_h} w^{(1)}_{ij} h_j+w^{(1)}_{i0}$$

$$\log \sigma_i^2 = \sum_{j=1}^{n_h} w^{(2)}_{ij}h_j +w^{(2)}_{i0}\qquad h_j= \tanh \left(\sum_{k=1}^{n_z} w^{(0)}_{jk}z_k +w^{(0)}_{j0}\right)$$

with $\theta=\{w_{jk}^{(0)},w_{ij}^{(1)}, w_{ij}^{(2)}\}$.

### Variational Auto encoder (VAE)

**Encoder:**

$$q(z|x,\phi)=\prod_{i=1}^{n_z} \mathcal{N}(z_i|\mu_i,\sigma^2_i)\qquad 
\mu_i=\sum_{j=1}^{n_h} w^{(3)}_{ij} h_j+w^{(3)}_{i0}$$

$$\log \sigma_i^2 = \sum_{j=1}^{n_h} w^{(4)}_{ij} h_j +w^{(4)}_{i0}\qquad 
h_j = \tanh\left(\sum_{k=1}^{n_x} w^{(5)}_{jk} x_k +w^{(5)}_{j0}\right)$$

with $\phi=\{w^{(3)}_{ij},w^{(4)}_{ij},w^{(5)}_{jk}\}$.

### Variational Auto encoder (VAE)

Training with stochastic gradient descent. **Weight decay** (prior on **$\theta$**). Number of hidden units is 500 (MNIST) and 200 (Frey face). Plot of Variational lower bound $Q(\theta,\phi)$ versus training iteration:

<center><img src="figs/vae_learning.png" width=700></center>

Bound improves with larger latent space dimension. No overfitting.

### Variational Auto encoder (VAE)

<center><img src="figs/vae_manifold.png" width=600></center>

Visualization of latent $z$ space in case of $n_z=2$.
$z\sim \mathcal{N}(z|0,1)$ and $x\sim p(x|z,\theta)$ with $\theta$ the parameters after training.

# <center>Assignments</center>

#### Ex 7.1

Write your own K-means algorithm and compare your results with the ones on the Old Faithful dataset in section **K-means: do it yourself**.

#### Ex 7.2

Write a clustering algorithm based on the multinomial mixture model and apply it to the MNIST data. Compare your results with the ones shown in section **Mixture of Bernoullis on MNIST**.

#### Ex 7.3

Consider the one dimensional Gaussian mixture model
$$
p(x,k)=\pi_k \frac{1}{\sqrt{2\pi \sigma_k^2} }\exp\left(-\frac{(x-a_k)^2}{2\sigma_k^2}\right)
$$
with observable data $x^\mu, \mu =1, \dots, N$ and discrete latent variable $k=1,\dots,K$.
Derive an EM algorithm to estimate the parameters $\pi_k,a_k, \sigma_k^2, k=1,\dots,K$ from the data.
Proceed with the following steps.
* Give an expression for the responsibilities $r^\mu_k$ that result from the E step.
* Give an expression for the variational bound $Q(\theta,q^*)$ in terms of the responsibilities.
* Show that the M-step can be solved in closed form and yields new values of $\pi_k,a_k, \sigma_k^2, k=1,\dots,K$ in terms of the responsibilities and the data. Check that your final result agrees with the multi-dimensional result presented in the lecture.

#### Ex 7.4

Implement EM for a Mixture of Gaussians and compare your results with the ones in section **Inference in a MGM**.