In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame({
    'Var': [-3.3, -4.4, -1.9, 3.3, 2.5, 3.2, 0.3, 0.1, -0.1, -0.5],
    'partition1': [1, 1, 1, 2, 2, 2, 2, 2, 1, 1],
    'partition2': [1, 3, 2, 1, 3, 2, 1, 3, 2, 1]
})

data_T = data.T

Question 1 : algorithm init

In [3]:
def initEM( x , partition):
    '''return ((π0_K, θ = (μ0_K, var0_K))'''
    x = np.array(x)
    partition = np.array(partition)
    params ={'pi' : [], 'theta':{'mu': [], 'sigma2':[]}}

    K = np.unique(partition)
    for k in K:
        cluster = [x[i] for i in range(len(x)) if partition[i] == k] 

        pi = len(cluster) / len(x)
        mu , sigma2 = np.mean(np.array(cluster)), np.var(np.array(cluster))
        params['pi'].append(pi)
        params['theta']['mu'].append(mu)
        params['theta']['sigma2'].append(sigma2)
    

    return params
    


In [6]:
param = initEM(data['Var'], data['partition2'])
param


{'pi': [0.4, 0.3, 0.3],
 'theta': {'mu': [-0.05, 0.4000000000000001, -0.6000000000000001],
  'sigma2': [5.527499999999999, 4.46, 8.180000000000001]}}

Question 2 : Etape E (guess)

In [5]:
from scipy.stats import lognorm

def Estep(x, params):

    #calculer posterior P_theta(Z_ik = 1|X) 
    pi = params['pi']
    mu = params['theta']['mu']
    sigma2 = params['theta']['sigma2']
    K = len(params['pi']) 
    n=len(x)
    tau = np.zeros((n,K))
    log_tau = np.zeros((n,K))
    for k in range(K):

        log_tau[:, k] = np.log(pi[k]) +  lognorm.pdf(x, s=sigma2[k], scale=np.exp(mu[k]))
    


    #LogSumExptrick cte
    m = np.max(log_tau, axis=1, keepdims=True)
    # print(len(m))
    cte = m + np.log(np.sum(np.exp(log_tau - m), axis=1, keepdims=True))
    # print(len(cte))
    log_tau = log_tau - cte 
    tau = np.exp(log_tau)  
  
    return tau   
    


In [161]:
tau = Estep(data['Var'], initEM(data['Var'], data['partition2']))
tau

array([[0.4       , 0.3       , 0.3       ],
       [0.4       , 0.3       , 0.3       ],
       [0.4       , 0.3       , 0.3       ],
       [0.4001804 , 0.30174701, 0.29807259],
       [0.40024965, 0.30232881, 0.29742154],
       [0.40018697, 0.30180426, 0.29800877],
       [0.40309148, 0.31594836, 0.28096016],
       [0.41071986, 0.3337723 , 0.25550784],
       [0.4       , 0.3       , 0.3       ],
       [0.4       , 0.3       , 0.3       ]])

Question 3 : Etape M (check)

In [162]:
def compute_PI(tau): # pi_k is mean of all posteriori probs of each obs in cluster k
    # print(np.shape(tau))

    # mean column wise of tau ( or log_tau)!
    PI = np.mean(np.array(tau), axis=0)
    return PI
    

def compute_mu(x,tau):
    tau = np.array(tau)
    x = np.array(x)
    
    K = (tau.shape[1])
    mu = [(np.sum(tau[:, k][:, None] * x, axis = 0) / np.sum(tau[:, k])) for k in range(K) ]
    return mu 

def compute_sigma2(tau, mu, x):
    tau = np.array(tau)
    x = np.array(x)
    sigma2 = []

    K = (tau.shape[1])
    # sigma2 = [(np.sum((tau[:, k] * (( np.transpose(x) - mu[k])**2)), axis = 1) / np.sum(tau[:, k])) for k in range(K) ]

    for k in range(K):
        diff = x - mu[k]  # mean to each row of x
        diff2 = diff ** 2
        # weighted_diff2 = tau[:, k] * diff2
        weighted_diff2 = tau[:, k][:, None] * diff2
        cluster_sigma2 = np.sum(weighted_diff2, axis=0) / np.sum(tau[:, k])
        # sigma2.append(np.mean(cluster_sigma2)) 
        sigma2.append(cluster_sigma2.mean())

    return sigma2 
    

def Mstep(x, tau):
    #update mixture weights 
    PI_t = compute_PI(tau)
    #update means
    mu_t = compute_mu(x,tau)
    #update vars
    sigma2_t = compute_sigma2(tau, mu_t, x)
    return PI_t, mu_t, sigma2_t
    

In [163]:
compute_mu(data['Var'],tau) 

[array([-3.3, -4.4, -1.9,  3.3,  2.5,  3.2,  0.3,  0.1, -0.1, -0.5]),
 array([-3.3, -4.4, -1.9,  3.3,  2.5,  3.2,  0.3,  0.1, -0.1, -0.5]),
 array([-3.3, -4.4, -1.9,  3.3,  2.5,  3.2,  0.3,  0.1, -0.1, -0.5])]

question 4:

In [164]:
from scipy.stats import norm

def compute_mixture_llhood(x,param):
    pi = np.array(param['pi'])  #  (K,)
    mu = np.array(param['theta']['mu'])  #(K,)
    sigma2 = np.sqrt(np.array(param['theta']['sigma2']))  #(K,)
    
    #contribution de chaque k dans chaque x_i 
    densities = np.array([norm.pdf(x, loc=mu_k, scale=sigma2_k) for mu_k, sigma2_k in zip(mu, sigma2)]).T  #(n, K) 
    weighted_densities = densities * pi 
    marginal_likelihood = np.sum(np.log(np.sum(weighted_densities, axis=1)))
    return marginal_likelihood



In [165]:

log_likelihood = compute_mixture_llhood(data['Var'], param)
log_likelihood


-23.314947226151602

ELBO question :
$$
\mathcal{L}(q, \theta) = \mathbb{E}_q \left[ \log p_\theta(X, Z) \right] + \mathcal{H}(q)
$$

-> eval expectation term : 
$$
\mathbb{E}_q \left[ \log p_\theta(X, Z) \right] = \sum_{i=1}^n \sum_{k=1}^K q(Z_i = k) \left( \log \pi_k + \log \mathcal{N}(x_i \mid \mu_k, \sigma_k^2) \right)
$$
-> eval entropy term : 
$$
\mathcal{H}(q) = - \sum_{i=1}^n \sum_{k=1}^K q(Z_i = k) \log q(Z_i = k)
$$

In [166]:
pi = param['pi']
mu = param['theta']['mu'] 
sigma2 = param['theta']['sigma2'] 
X = data['Var']
n = len(data['Var']) 
K = len(pi)

eq_log_joint = 0.0
for i in range(n):
    for k in range(K):
        log_pi_k = np.log(pi[k]) 

        log_normale_k = -0.5 * np.log(2 * np.pi * sigma2[k]) - ((X[i] - mu[k])**2) / (2 * sigma2[k])

        eq_log_joint += tau[i, k] * (log_pi_k + log_normale_k)

entropy = 0.0
for i in range(n):
    for k in range(K):
        if tau[i, k] > 0:  
            entropy -= tau[i, k] * np.log(tau[i, k])

elbo = eq_log_joint + entropy

elbo , log_likelihood

(-23.540883360976366, -23.314947226151602)

almost??

Question 5:

In [None]:
def EMgauss1D(X, K, partition_init, max_iter, rtol):

    #init 
    n = len(X)  
    param = initEM( X , partition_init)

    logliks = []  
    loglik_t_1 = None  # prev log-likelihood for convergence check


    for t in range(max_iter):
        #E
        tau = Estep(X, param) 

        # M
        param['pi'], param['theta']['mu'], param['theta']['sigma2'] = Mstep(X, tau)

        loglik_t = compute_mixture_llhood(X, param)
        logliks.append(loglik_t)

        if loglik_t_1 is not None:
            rel_diff = np.abs((loglik_t - loglik_t_1) / loglik_t_1)
            if rel_diff < rtol:
                print(f"Converged at iteration {t}")
                break

        loglik_t_1 = loglik_t  



    return logliks, param, tau

In [172]:

logliks, param, tau =EMgauss1D(X, 2 ,data['partition1'], 20, 1e-6)

Converged at iteration 7


In [180]:
df_logliks = pd.DataFrame({
        'Iteration': range(1, len(logliks) + 1),
        'Log-Likelihood': logliks
    })
df_params = pd.DataFrame({
    'Cluster': range(1, len(param['pi']) + 1),
    'Pi': param['pi'],
    'Mu': param['theta']['mu'],
    'Sigma^2': param['theta']['sigma2']
})

df_tau = pd.DataFrame(tau, columns=[f'Cluster_{k+1}' for k in range(tau.shape[1])])



In [181]:
df_logliks

Unnamed: 0,Iteration,Log-Likelihood
0,1,355.453724
1,2,336.810197
2,3,338.319962
3,4,339.162459
4,5,338.903708
5,6,343.212446
6,7,347.718631
7,8,347.718631
