<a href="https://colab.research.google.com/github/jiya2107/Machine-Learning---Practical/blob/main/ML_Experiment_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load and preprocess data (X only).
# The CSV's first column is used as the row index.
try:
    df = pd.read_csv('HepatitisCdata.csv', index_col=0)
except FileNotFoundError:
    # Raise SystemExit directly instead of calling the interactive-only
    # `exit()` helper: this stops the script/cell with the message and a
    # non-zero status, and does not depend on the `site` module.
    raise SystemExit("Please ensure 'HepatitisCdata.csv' is uploaded.")

# Treat '?' placeholders as missing values, force the lab measurements to a
# numeric dtype (non-numeric text still raises), then drop incomplete rows.
df = df.replace('?', np.nan)
numeric_cols = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])
df = df.dropna()

# Encode sex as a binary feature (m -> 1, f -> 0).
df['Sex'] = df['Sex'].map({'m': 1, 'f': 0})

# Keep only the code before '=' in each category label; it is retained purely
# for comparing against the clusters later, never used as a model input.
category_codes = df['Category'].str.split('=').str[0]
try:
    df['Category'] = category_codes.astype(int)
except ValueError:
    # Some codes are not plain integers (the public HCV dataset is reported to
    # contain '0s=suspect Blood Donor' -- verify against the actual file).
    # Fall back to the string codes rather than crashing.
    df['Category'] = category_codes

X = df.drop('Category', axis=1)
y_true = df['Category']

# Rescale every feature with a StandardScaler fitted on the full feature
# matrix, so that no single column dominates the mixture model's covariances.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# --- EM Algorithm (via GaussianMixture Model) ---
# One Gaussian component per category assumed to exist in the target (5).
n_components = 5
gmm = GaussianMixture(
    n_components=n_components,
    max_iter=100,
    random_state=42,  # fixed seed so the initialisation is reproducible
).fit(X_scaled)

# Soft assignments: per-sample posterior probability of each component
# (the "responsibilities" from the E-step).
responsibilities = gmm.predict_proba(X_scaled)

# Hard assignments: the component label predicted for each sample.
gmm_clusters = gmm.predict(X_scaled)

# Report the fitted mixture: convergence diagnostics, estimated parameters,
# and how the hard cluster assignments line up with the known categories.
print("--- Expectation-Maximization (Gaussian Mixture Model) Results ---")
print(f"Number of Iterations to Converge: {gmm.n_iter_}")
if not gmm.converged_:
    # The iteration count above only means "iterations to converge" when EM
    # actually converged; flag the case where it did not.
    print("Warning: EM did not converge within max_iter; results may be unreliable.")
# lower_bound_ is the lower bound on the training log-likelihood from the best
# EM fit. EM maximises this quantity, so higher values indicate a better fit.
print(f"Final Log-Likelihood Lower Bound (higher is better): {gmm.lower_bound_:.4f}")

print("\nModel Parameters (Estimated by EM):")
# Only the first five feature columns of each component mean are shown to
# keep the output compact.
print(f"Means of the {n_components} Gaussian Components (First 5 features):\n{gmm.means_[:, :5].round(2)}")
print(f"Weights (Priors) of the Components: {gmm.weights_.round(2)}")

print("\nCluster Assignment Comparison (True Categories vs GMM Clusters):")
print("Note: Cluster labels are arbitrary and may not map directly to true categories.")
# Contingency table: rows are the true categories, columns the GMM clusters.
print(pd.crosstab(y_true, gmm_clusters, rownames=['True Category'], colnames=['GMM Cluster']))

print(f"\nSample Responsibilities (First 3 samples): \n{responsibilities[:3].round(4)}")