In [3]:
# 📦 Imports
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# 📥 Load Dataset
# Pull the Iris bunch once, then split it into the feature matrix and targets.
iris = load_iris()
X, y = iris.data, iris.target

# ⚙️ Preprocess Variants
# Each scaler keeps its own name so the fitted object stays available afterwards.
scaler_std, scaler_minmax = StandardScaler(), MinMaxScaler()

# Two independently scaled copies of the raw features.
X_std = scaler_std.fit_transform(X)
X_norm = scaler_minmax.fit_transform(X)

# Reduce each scaled copy to two principal components; a separate PCA object
# is fitted per copy so the projections do not share a basis.
pca_on_std = PCA(n_components=2)
pca_on_norm = PCA(n_components=2)
X_pca = pca_on_std.fit_transform(X_std)
X_norm_pca = pca_on_norm.fit_transform(X_norm)

# 🧪 Evaluation Function for Clustering
def evaluate_clustering(model_type, X_data, label):
    """Cluster ``X_data`` with one algorithm and tabulate internal quality scores.

    ``"KMeans"`` and ``"Agglomerative"`` are fitted once for each of 3, 4 and 5
    clusters. ``"MeanShift"`` is constructed without a cluster count, so it is
    fitted once and the number of distinct labels it produced is reported.

    Parameters
    ----------
    model_type : str
        ``"KMeans"``, ``"Agglomerative"`` or ``"MeanShift"``. Any other value
        yields an empty result table.
    X_data : array-like
        Feature matrix handed to the estimator's ``fit_predict`` and to the
        three scoring functions.
    label : str
        Name of the preprocessing variant; copied verbatim into the
        ``Preprocessing`` column.

    Returns
    -------
    pandas.DataFrame
        One row per scored clustering with the columns ``Clustering``,
        ``Preprocessing``, ``Clusters`` (formatted as ``"c=<k>"``),
        ``Silhouette`` (3 decimals), ``Calinski-Harabasz`` (1 decimal) and
        ``Davies-Bouldin`` (3 decimals). Clusterings for which the scores are
        undefined are left out.
    """
    results = {
        'Clustering': [],
        'Preprocessing': [],
        'Clusters': [],
        'Silhouette': [],
        'Calinski-Harabasz': [],
        'Davies-Bouldin': []
    }

    # MeanShift takes no cluster count here, so a single None placeholder
    # stands in for the 3/4/5 sweep used by the other algorithms.
    cluster_range = [3, 4, 5] if model_type != "MeanShift" else [None]

    for c in cluster_range:
        if model_type == "KMeans":
            model = KMeans(n_clusters=c, random_state=42, n_init='auto')
        elif model_type == "Agglomerative":
            model = AgglomerativeClustering(n_clusters=c)
        elif model_type == "MeanShift":
            model = MeanShift()
        else:
            # model_type never changes inside the loop, so no later iteration
            # can match either; fall through to the empty table.
            break

        labels = model.fit_predict(X_data)
        n_found = len(np.unique(labels))

        # All three scores are only defined for 2 <= n_clusters <= n_samples - 1
        # and scikit-learn raises ValueError outside that range. Skip such
        # results (a single cluster, or every sample in its own cluster)
        # instead of aborting the whole sweep.
        if not 2 <= n_found <= len(labels) - 1:
            continue

        sil = silhouette_score(X_data, labels)
        ch = calinski_harabasz_score(X_data, labels)
        db = davies_bouldin_score(X_data, labels)

        results['Clustering'].append(model_type)
        results['Preprocessing'].append(label)
        # Report the requested count when there is one, otherwise the count
        # the algorithm discovered.
        results['Clusters'].append(f"c={c}" if c is not None else f"c={n_found}")
        results['Silhouette'].append(round(sil, 3))
        results['Calinski-Harabasz'].append(round(ch, 1))
        results['Davies-Bouldin'].append(round(db, 3))

    return pd.DataFrame(results)

# 🧪 Run All Combinations
# Each key is the preprocessing label that ends up in the results table;
# each value is the matching feature matrix.
datasets = {
    'Standardized': X_std,
    'Normalized': X_norm,
    'PCA': X_pca,
    'Normalized + PCA': X_norm_pca,
}

model_names = ["KMeans", "Agglomerative", "MeanShift"]
all_results = []

# Datasets form the outer loop so rows stay grouped by preprocessing variant.
for name, data in datasets.items():
    for model in model_names:
        df = evaluate_clustering(model_type=model, X_data=data, label=name)
        all_results += [df]

# 📋 Final Results Table
# Stack the per-run frames into a single table with a fresh running index.
final_df = pd.concat(objs=all_results, ignore_index=True)
final_df  # bare expression so the notebook renders the table


Unnamed: 0,Clustering,Preprocessing,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,KMeans,Standardized,c=3,0.48,157.4,0.789
1,KMeans,Standardized,c=4,0.385,206.7,0.87
2,KMeans,Standardized,c=5,0.345,202.6,0.944
3,Agglomerative,Standardized,c=3,0.447,222.7,0.803
4,Agglomerative,Standardized,c=4,0.401,201.3,0.979
5,Agglomerative,Standardized,c=5,0.331,192.7,0.974
6,MeanShift,Standardized,c=2,0.582,251.3,0.593
7,KMeans,Normalized,c=3,0.483,351.3,0.787
8,KMeans,Normalized,c=4,0.444,313.9,0.908
9,KMeans,Normalized,c=5,0.423,263.2,0.993
