<a href="https://colab.research.google.com/github/jatin1bagga/CLUSTERING_1/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset via scikit-learn. The full object returned by
# load_iris() stays available as `data`; its "data" entry is bound to `X`,
# which is what the preprocessing and clustering steps below consume.
data = load_iris()
X = data["data"]

# Evaluation function
def evaluate_clustering(X, labels):
    """Score a clustering of ``X`` with three scikit-learn metrics.

    Args:
        X: The samples that were clustered.
        labels: The cluster label assigned to each sample in ``X``.

    Returns:
        A dict with the keys ``"Silhouette"``, ``"Calinski-Harabasz"`` and
        ``"Davies-Bouldin"`` (in that order), each mapped to the value
        returned by the corresponding scikit-learn scoring function.
    """
    # (column name, scoring function) pairs; the order here fixes both the
    # evaluation order and the key order of the returned dict.
    scorers = (
        ("Silhouette", silhouette_score),
        ("Calinski-Harabasz", calinski_harabasz_score),
        ("Davies-Bouldin", davies_bouldin_score),
    )
    return {metric: score_fn(X, labels) for metric, score_fn in scorers}

# Preprocessing function
def preprocess_data(X, method):
    """Apply the named preprocessing pipeline to ``X``.

    Args:
        X: The samples to transform.
        method: One of
            ``"none"``        - return ``X`` unchanged (the same object, no copy);
            ``"normalize"``   - ``MinMaxScaler``;
            ``"standardize"`` - ``StandardScaler``;
            ``"pca"``         - ``PCA`` with 2 components;
            ``"t+n"``         - ``StandardScaler`` followed by ``MinMaxScaler``;
            ``"t+n+pca"``     - ``"t+n"`` followed by ``PCA`` with 2 components.

    Returns:
        The transformed data (or ``X`` itself for ``"none"``).

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
            Previously an unknown name silently returned ``None``, which
            only failed later inside the clustering code.
    """
    if method == "none":
        return X
    if method == "normalize":
        return MinMaxScaler().fit_transform(X)
    if method == "standardize":
        return StandardScaler().fit_transform(X)
    if method == "pca":
        return PCA(n_components=2).fit_transform(X)
    if method in ("t+n", "t+n+pca"):
        # Both variants share the standardize -> normalize prefix.
        X_std = StandardScaler().fit_transform(X)
        X_norm = MinMaxScaler().fit_transform(X_std)
        if method == "t+n":
            return X_norm
        return PCA(n_components=2).fit_transform(X_norm)
    raise ValueError(f"Unknown preprocessing method: {method!r}")

# Configuration
cluster_range = [3, 4, 5]
preprocess_methods = ["none", "normalize", "standardize", "pca", "t+n", "t+n+pca"]
algorithms = ["kmeans", "hierarchical", "meanshift"]
results = []

# Preprocess once per method: the transformed data depends only on the
# method, not on the algorithm, so it is computed up front and reused.
preprocessed = {method: preprocess_data(X, method) for method in preprocess_methods}

# Run clustering for each (algorithm, preprocessing, k) setting
for algo in algorithms:
    for method in preprocess_methods:
        X_proc = preprocessed[method]
        # MeanShift does not take a cluster count, so it is run exactly once
        # per preprocessing method (k is None) instead of being tied to a
        # particular value happening to be present in cluster_range.
        k_values = [None] if algo == "meanshift" else cluster_range
        for k in k_values:
            try:
                if algo == "kmeans":
                    model = KMeans(n_clusters=k, random_state=0)
                elif algo == "hierarchical":
                    model = AgglomerativeClustering(n_clusters=k)
                elif algo == "meanshift":
                    bandwidth = estimate_bandwidth(X_proc, quantile=0.2, n_samples=50)
                    model = MeanShift(bandwidth=bandwidth)
                else:
                    # Fail explicitly rather than reusing a stale `model`
                    # left over from a previous iteration.
                    raise ValueError(f"Unknown algorithm: {algo!r}")

                labels = model.fit_predict(X_proc)
                scores = evaluate_clustering(X_proc, labels)
                results.append({
                    "Algorithm": algo,
                    "Preprocessing": method,
                    # For MeanShift report the number of clusters actually
                    # found; for the other algorithms report the requested k.
                    "Clusters": len(np.unique(labels)) if k is None else k,
                    **scores
                })
            except Exception as e:
                # Best-effort sweep: report the failing configuration and
                # continue with the remaining ones.
                print(f"Error with {algo}, {method}, k={k}: {e}")

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.round(3)  # Round for clean table view
results_df


Unnamed: 0,Algorithm,Preprocessing,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,kmeans,none,3,0.551,561.594,0.666
1,kmeans,none,4,0.498,530.766,0.78
2,kmeans,none,5,0.461,459.451,0.915
3,kmeans,normalize,3,0.483,351.295,0.787
4,kmeans,normalize,4,0.445,314.473,0.9
5,kmeans,normalize,5,0.436,269.943,0.932
6,kmeans,standardize,3,0.46,241.904,0.834
7,kmeans,standardize,4,0.387,207.266,0.87
8,kmeans,standardize,5,0.346,203.267,0.945
9,kmeans,pca,3,0.598,693.708,0.565
