In [5]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

In [8]:
def load_file(filename):
    """Read a numeric text file into a NumPy array.

    Thin wrapper around ``np.loadtxt`` called with its default settings.

    Parameters
    ----------
    filename : str or path-like
        Location of the text file to read.

    Returns
    -------
    numpy.ndarray
        The parsed contents of the file.
    """
    data = np.loadtxt(filename)
    return data

def initialize_params(dataset, gaussians, rng=None):
    """Build the starting parameters for a Gaussian mixture model.

    Parameters
    ----------
    dataset : numpy.ndarray of shape (n_points, n_dimensions)
        Data the mixture will be fitted to; only its per-feature minimum
        and maximum are used here.
    gaussians : int
        Number of mixture components.
    rng : numpy.random.Generator, optional
        Source of randomness for the initial means. When omitted, the
        global NumPy stream (``np.random.rand``) is used, so existing
        ``np.random.seed(...)`` calls keep controlling the draw.

    Returns
    -------
    means : numpy.ndarray of shape (gaussians, n_dimensions)
        Uniform random points inside the bounding box of ``dataset``.
    covariances : numpy.ndarray of shape (gaussians, n_dimensions, n_dimensions)
        One identity matrix per component.
    weights : numpy.ndarray of shape (gaussians,)
        Equal mixing weights ``1 / gaussians``.
    """
    _, dimensions = dataset.shape
    lower = dataset.min(axis=0)
    upper = dataset.max(axis=0)

    # Draw in the unit cube, then stretch to the data's bounding box.
    if rng is None:
        unit_draws = np.random.rand(gaussians, dimensions)
    else:
        unit_draws = rng.random((gaussians, dimensions))
    means = unit_draws * (upper - lower) + lower

    covariances = np.array([np.eye(dimensions) for _ in range(gaussians)])
    weights = np.ones(gaussians) / gaussians
    return means, covariances, weights

def multivariate_normal_pdf(x, mean, cov):
    """Evaluate the multivariate normal density N(mean, cov) at ``x``.

    Parameters
    ----------
    x : array-like of shape (d,) or (n, d)
        A single point, or a batch of ``n`` points stored row-wise.
    mean : array-like of shape (d,)
        Mean vector of the distribution.
    cov : array-like of shape (d, d)
        Covariance matrix of the distribution.

    Returns
    -------
    numpy.float64 or numpy.ndarray of shape (n,)
        The density at the point (scalar) or at each row of the batch.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``cov`` is singular or not square.
    """
    d = len(mean)
    norm_factor = (2 * np.pi) ** (d / 2) * np.sqrt(np.linalg.det(cov))

    diff = np.asarray(x) - mean
    single_point = diff.ndim == 1
    rows = np.atleast_2d(diff)  # (n, d), one centred point per row

    # Solve cov @ z = diff for every point instead of forming cov^-1
    # explicitly: same quadratic form, better conditioned, one call.
    solved = np.linalg.solve(cov, rows.T)  # (d, n)
    mahalanobis_sq = np.sum(rows * solved.T, axis=1)

    density = np.exp(-0.5 * mahalanobis_sq) / norm_factor
    return density[0] if single_point else density


def assigning_points_to_cluster(dataset, means, covariances, weights, gaussians):
    """E-step: compute each component's responsibility for each point.

    Parameters
    ----------
    dataset : numpy.ndarray of shape (n_points, n_dimensions)
        Points to assign.
    means, covariances, weights : array-like
        Current mixture parameters, indexed by component along axis 0.
    gaussians : int
        Number of mixture components.

    Returns
    -------
    numpy.ndarray of shape (n_points, gaussians)
        Row ``i`` holds the normalised values
        ``weights[k] * pdf(dataset[i]; means[k], covariances[k])``,
        so every row sums to 1.
    """
    datapoints = dataset.shape[0]
    probabilities = np.zeros((datapoints, gaussians))

    for k in range(gaussians):
        for i in range(datapoints):
            probabilities[i, k] = weights[k] * multivariate_normal_pdf(dataset[i], means[k], covariances[k])

    row_totals = probabilities.sum(axis=1, keepdims=True)

    # A point far from every component can have all of its densities
    # underflow to exactly 0. Dividing by that zero total would give NaN
    # responsibilities, which then poison every parameter in the M-step.
    # Treat such a point as equally (un)explained by all components.
    unexplained = row_totals[:, 0] == 0
    if unexplained.any():
        probabilities[unexplained] = 1.0 / gaussians
        row_totals[unexplained] = 1.0

    probabilities /= row_totals
    return probabilities

def parameter_update(dataset, probabilities, gaussians, reg_covar=1e-6):
    """M-step: re-estimate mixture parameters from responsibilities.

    Parameters
    ----------
    dataset : numpy.ndarray of shape (n_points, n_dimensions)
        The data being fitted.
    probabilities : numpy.ndarray of shape (n_points, gaussians)
        Responsibility of each component for each point.
    gaussians : int
        Number of mixture components.
    reg_covar : float, optional
        Non-negative value added to the diagonal of every covariance
        matrix so it stays invertible even when a component collapses
        onto very few points. Pass ``0.0`` for the plain
        maximum-likelihood estimate.

    Returns
    -------
    means : numpy.ndarray of shape (gaussians, n_dimensions)
    covariances : numpy.ndarray of shape (gaussians, n_dimensions, n_dimensions)
    weights : numpy.ndarray of shape (gaussians,)
    """
    n, d = dataset.shape
    component_mass = probabilities.sum(axis=0)
    weights = component_mass / n

    # A component that received no responsibility at all would turn the
    # divisions below into 0/0 (NaN). Flooring the divisor leaves every
    # non-degenerate component untouched and keeps the dead one finite.
    safe_mass = np.maximum(component_mass, np.finfo(float).tiny)

    means = np.dot(probabilities.T, dataset) / safe_mass[:, None]

    ridge = reg_covar * np.eye(d)
    covariances = np.zeros((gaussians, d, d))
    for k in range(gaussians):
        diff = dataset - means[k]
        # Responsibility-weighted scatter matrix around the new mean.
        scatter = np.dot(probabilities[:, k] * diff.T, diff)
        covariances[k] = scatter / safe_mass[k] + ridge
    return means, covariances, weights

def e_m_min_max(dataset, gaussians, max_iters=100, diff=1e-4):
    """Fit a Gaussian mixture to ``dataset`` with the EM algorithm.

    Parameters
    ----------
    dataset : numpy.ndarray of shape (n_points, n_dimensions)
        Data to fit.
    gaussians : int
        Number of mixture components.
    max_iters : int, optional
        Upper bound on the number of EM iterations.
    diff : float, optional
        Convergence tolerance: iteration stops once the norm of the
        change in the means between two consecutive iterations drops
        below this value.

    Returns
    -------
    means, covariances, weights
        The most recently estimated mixture parameters.
    """
    means, covariances, weights = initialize_params(dataset, gaussians)
    for _ in range(max_iters):
        probabilities = assigning_points_to_cluster(dataset, means, covariances, weights, gaussians)
        new_means, new_covariances, new_weights = parameter_update(dataset, probabilities, gaussians)
        mean_shift = np.linalg.norm(new_means - means)
        # Adopt the refreshed estimates before testing for convergence so
        # the final M-step result is what gets returned, not the
        # parameters from one iteration earlier.
        means, covariances, weights = new_means, new_covariances, new_weights
        if mean_shift < diff:
            break
    return means, covariances, weights


# Load the two sample datasets (paths are relative to the working directory).
dataset_2_gaussians, dataset_3_gaussians = [
    load_file(name) for name in ('2gaussian.txt', '3gaussian.txt')
]

In [9]:
# Fit a two-component mixture to the first dataset, then print every
# estimated parameter as a labelled table.
means, covariances, weights = e_m_min_max(dataset_2_gaussians, 2)

print("\nEstimated Means:")
mean_columns = [f"Feature {i+1}" for i in range(means.shape[1])]
means_table = pd.DataFrame(means, columns=mean_columns)
print(means_table)

print("\nEstimated Covariances:")
for i, cov in enumerate(covariances):
    print(f"\nCluster {i+1} Covariance Matrix:")
    cov_columns = [f"Feature {j+1}" for j in range(cov.shape[1])]
    print(pd.DataFrame(cov, columns=cov_columns))

print("\nEstimated Weights:")
weight_columns = [f"Cluster {i+1}" for i in range(len(weights))]
weights_row = weights.reshape(1, -1)  # single row, one column per cluster
print(pd.DataFrame(weights_row, columns=weight_columns))



Estimated Means:
   Feature 1  Feature 2
0   2.994346   3.052071
1   7.013259   3.983198

Estimated Covariances:

Cluster 1 Covariance Matrix:
   Feature 1  Feature 2
0   1.010588   0.027153
1   0.027153   2.937657

Cluster 2 Covariance Matrix:
   Feature 1  Feature 2
0   0.974566   0.497361
1   0.497361   1.001050

Estimated Weights:
   Cluster 1  Cluster 2
0   0.334832   0.665168


In [10]:
# Fit a three-component mixture to the second dataset, then print every
# estimated parameter as a labelled table.
means, covariances, weights = e_m_min_max(dataset_3_gaussians, 3)

print("\nEstimated Means:")
mean_columns = [f"Feature {i+1}" for i in range(means.shape[1])]
means_table = pd.DataFrame(means, columns=mean_columns)
print(means_table)

print("\nEstimated Covariances:")
for i, cov in enumerate(covariances):
    print(f"\nCluster {i+1} Covariance Matrix:")
    cov_columns = [f"Feature {j+1}" for j in range(cov.shape[1])]
    print(pd.DataFrame(cov, columns=cov_columns))

print("\nEstimated Weights:")
weight_columns = [f"Cluster {i+1}" for i in range(len(weights))]
weights_row = weights.reshape(1, -1)  # single row, one column per cluster
print(pd.DataFrame(weights_row, columns=weight_columns))


Estimated Means:
   Feature 1  Feature 2
0   3.039438   3.047763
1   5.011599   7.001357
2   7.021508   4.015432

Estimated Covariances:

Cluster 1 Covariance Matrix:
   Feature 1  Feature 2
0   1.028306   0.026349
1   0.026349   3.383231

Cluster 2 Covariance Matrix:
   Feature 1  Feature 2
0   0.979872   0.185257
1   0.185257   0.974699

Cluster 3 Covariance Matrix:
   Feature 1  Feature 2
0   0.990504   0.500994
1   0.500994   0.995653

Estimated Weights:
   Cluster 1  Cluster 2  Cluster 3
0   0.205537    0.49602   0.298444
