# EM for Binary Data

In [1]:
import os
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

# Print arrays with 6 digits of precision and wrap output lines at 100 characters.
np.set_printoptions(precision=6, linewidth=100)

In [2]:
# Load the data matrix from a text file; its 2-D shape is unpacked into
# N (number of rows) and D (number of columns).
dataset = np.loadtxt('binarydigits.txt')
N, D = dataset.shape

# Bare expression so the notebook displays the shape as the cell output.
N, D

(100, 64)

We have $N = 100$ images, each with $D = 64$ binary pixels laid out as a vector, i.e., $\mathbf{x}^{\left(n\right)} \in \left\{0, 1\right\}^D,\ \forall n\in \left\{1,\ldots,N\right\}$.

## 3 (d)

In [3]:
def smoothing(p, eps=1e-8):
    """Return a smoothed float copy of ``p`` in which zero entries become ``eps``.

    ``eps`` is added to every entry, and the total mass added
    (``eps * p.size``) is subtracted in equal shares from the entries that
    were nonzero, so the sum of the array is preserved.

    Args:
        p: array-like of values (typically probabilities or weights).
        eps: amount added to every entry.

    Returns:
        A new float ``numpy.ndarray`` with the same shape as ``p``.  The input
        is never modified.  If ``p`` has no nonzero entry there is nothing to
        take mass from, so an unchanged copy is returned.
    """
    # Work on a float copy: the caller's array must not be mutated, and
    # integer input cannot hold the fractional adjustments in place.
    smoothed = np.array(p, dtype=float)

    nonzero = np.nonzero(smoothed)
    n_nonzero = nonzero[0].size
    if n_nonzero == 0:
        return smoothed

    # Remove from the nonzero entries exactly what is about to be added overall.
    smoothed[nonzero] -= eps * smoothed.size / n_nonzero
    smoothed += eps

    return smoothed


class EMForMultivariateBernoulli:
    """Expectation-maximisation for a mixture of multivariate Bernoullis.

    Notation used by every method:
        X  -- (N, D) data matrix with entries in [0, 1].
        P  -- (K, D) matrix; ``P[k, d]`` is the Bernoulli parameter of
              dimension ``d`` under component ``k``.
        pi -- (K,) vector of mixing proportions.
        R  -- (N, K) responsibilities; each row sums to one.

    All probabilities are evaluated in log space.  A product of D per-pixel
    probabilities quickly falls below the smallest representable float, which
    would otherwise turn responsibilities into 0/0 and distort the reported
    log-likelihood.
    """

    # Bernoulli parameters are clipped to [_EPS, 1 - _EPS] before taking logs,
    # and _EPS is also the floor for the M-step denominator.
    _EPS = 1e-8

    def _log_joint(self, X, P, pi):
        """Return the (N, K) matrix of ``log(pi[k] * p(x_n | P[k]))``."""
        X = np.asarray(X, dtype=float)
        P = np.clip(np.asarray(P, dtype=float), self._EPS, 1 - self._EPS)
        # A zero mixing proportion legitimately maps to -inf; silence the warning.
        with np.errstate(divide='ignore'):
            log_pi = np.log(np.asarray(pi, dtype=float))
        # sum_d [x_d log P_kd + (1 - x_d) log(1 - P_kd)] as two matrix products.
        return X @ np.log(P).T + (1 - X) @ np.log1p(-P).T + log_pi

    def log_likelihood(self, X, P, pi):
        """Return the total log-likelihood ``sum_n log sum_k pi_k p(x_n | P_k)``.

        Args:
            X: (N, D) data matrix.
            P: (K, D) Bernoulli parameters.
            pi: (K,) mixing proportions.

        Returns:
            The log-likelihood as a Python float.
        """
        log_joint = self._log_joint(X, P, pi)
        # Stable log-sum-exp over components, then sum over samples.
        return float(np.sum(np.logaddexp.reduce(log_joint, axis=1)))

    def expectation(self, X, P, pi):
        """E-step: return the (N, K) posterior responsibilities.

        Args:
            X: (N, D) data matrix.
            P: (K, D) Bernoulli parameters.
            pi: (K,) mixing proportions.

        Returns:
            R with ``R[n, k]`` proportional to ``pi[k] * p(x_n | P[k])`` and
            each row normalised to sum to one.
        """
        log_joint = self._log_joint(X, P, pi)
        log_norm = np.logaddexp.reduce(log_joint, axis=1, keepdims=True)
        # Normalise in log space so tiny joint probabilities do not become 0/0.
        return np.exp(log_joint - log_norm)

    def maximisation(self, X, R):
        """M-step: return the updated ``(P, pi)`` given responsibilities.

        Args:
            X: (N, D) data matrix.
            R: (N, K) responsibilities.

        Returns:
            Tuple ``(P, pi)``: ``P`` is the responsibility-weighted mean of
            the data per component, ``pi`` the mean responsibility per
            component.
        """
        X = np.asarray(X, dtype=float)
        R = np.asarray(R, dtype=float)

        component_mass = R.sum(axis=0)
        # Floor the denominator so a component with no mass yields 0, not NaN.
        P = (R.T @ X) / np.maximum(component_mass, self._EPS)[:, None]
        pi = R.mean(axis=0)

        return P, pi

    def __call__(self, K, X, n_iterations, eps):
        """Run EM from a random initialisation.

        Args:
            K: number of mixture components.
            X: (N, D) data matrix.
            n_iterations: maximum number of EM iterations.
            eps: stop as soon as the log-likelihood improves by at most this
                amount between consecutive iterations.

        Returns:
            Tuple ``(P, pi, R, log_ls)``.  ``R`` comes from the last E-step
            that was run (or from the initial parameters when
            ``n_iterations`` is 0).  ``log_ls`` holds the log-likelihood of
            the initial parameters followed by one value per iteration.
        """
        X = np.asarray(X, dtype=float)

        P = np.random.uniform(size=(K, X.shape[1]))
        pi = np.ones(K) / K

        log_ls = [self.log_likelihood(X=X, P=P, pi=pi)]

        R = None
        for _ in range(n_iterations):
            R = self.expectation(X=X, P=P, pi=pi)
            P, pi = self.maximisation(X=X, R=R)
            log_ls.append(self.log_likelihood(X=X, P=P, pi=pi))
            if log_ls[-1] - log_ls[-2] <= eps:
                break

        if R is None:
            # No iteration ran; still return well-defined responsibilities.
            R = self.expectation(X=X, P=P, pi=pi)

        return P, pi, R, log_ls

In [4]:
def plot_params(params, k, pi_k, fn):
    """Save an annotated heat-map of one component's parameter grid.

    Args:
        params: 2-D array of values to display; every cell is also labelled
            with its value to three decimals.
        k: zero-based component index; the title shows ``k + 1``.
        pi_k: value shown in the title, formatted to six decimals.
        fn: output image path.  Its parent directory is created when missing;
            a bare file name (no directory part) is saved in the current
            working directory.
    """
    params = np.asarray(params)
    n_rows, n_cols = params.shape

    fig, axs = plt.subplots()
    try:
        axs.imshow(params)
        for i in range(n_rows):
            for j in range(n_cols):
                axs.text(j, i, f'{params[i, j]:.3f}',
                         ha='center', va='center', color='black')
        axs.set_xticks(np.arange(n_cols))
        axs.set_yticks(np.arange(n_rows))
        # Doubled braces emit the literal LaTeX subscript group "{k+1}".
        axs.set_title(fr'$\pi_{{{k + 1}}} = {pi_k:.6f}$')
        fig.tight_layout()

        # dirname is '' for a bare file name, and makedirs('') would raise.
        out_dir = os.path.dirname(fn)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(fn)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

In [5]:
# Fit one mixture per candidate number of components, keep each run's
# log-likelihood trace, and save a parameter image for every component.
Ks = (2, 3, 4, 7, 10)
log_likelihoods_lst = []
for K in tqdm(Ks):
    em = EMForMultivariateBernoulli()
    P, pi, R, log_likelihoods = em(K=K, X=dataset, n_iterations=20, eps=1e-6)
    log_likelihoods_lst.append(log_likelihoods)
    # Walk the components' parameter rows and mixing proportions in lockstep.
    for k, (P_k, pi_k) in enumerate(zip(P, pi)):
        plot_params(params=P_k.reshape(8, 8), k=k, pi_k=pi_k, fn=f'assets/em/K={K}/k={k+1}.png')

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:21<00:00,  4.31s/it]


In [14]:
# Draw every run's log-likelihood trace on shared axes and save the figure.
fig, axs = plt.subplots(figsize=(6, 4))
for i, (K, log_ls) in enumerate(zip(Ks, log_likelihoods_lst)):
    # One curve per mixture size, coloured by its position in ``Ks``.
    axs.plot(log_ls, label=f'K = {K}', color=f'C{i}', marker='o')
axs.set(xlabel='Iteration Number', ylabel='Log-likelihood')
axs.grid()
axs.legend()
fig.tight_layout()
fig.savefig('assets/em/log_ls.png')
plt.close(fig)