<a href="https://colab.research.google.com/github/ishvin712/Deepseek_Clustering/blob/main/Deepseek_clustering_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:

# Load the Iris dataset object returned by scikit-learn's loader
iris = load_iris()

# Pull out the `data` and `target` attributes in one step; only X is consumed
# by the clustering cells shown in this notebook.
X, y = iris.data, iris.target

In [None]:
# Preprocessing methods
# Each entry maps a display label to the feature matrix handed to the
# clustering step. StandardScaler is fitted once and its output is shared by
# the two variants that need scaled features, instead of being refitted.
X_scaled = StandardScaler().fit_transform(X)

preprocess_methods = {
    'No Data Processing': X,  # features exactly as loaded
    'Using Normalization': X_scaled,  # StandardScaler output
    'Using PCA': PCA(n_components=2).fit_transform(X),  # 2-component PCA of the unscaled features
    'Using T+N': PCA(n_components=2).fit_transform(X_scaled),  # 2-component PCA of the scaled features
}


In [None]:

# Clustering algorithms and parameters

# Cluster counts tried for every algorithm / preprocessing combination.
n_clusters_list = list(range(3, 6))

# Display name -> estimator class. The classes are instantiated later with
# an `n_clusters` keyword, so only the class objects are stored here.
algorithms = {'K-Means': KMeans, 'Hierarchical': AgglomerativeClustering}


In [None]:

# Evaluate clustering performance
# Every (preprocessing, algorithm, cluster count) combination is fitted once
# and scored with three internal validity indices. One record per combination
# is collected in `results` and turned into a DataFrame at the end.

# Fixed seed handed to estimators that accept one, so repeated runs of this
# cell produce the same labels and therefore the same scores.
RANDOM_STATE = 42

results = []
for preprocess_name, X_processed in preprocess_methods.items():
    n_samples = len(X_processed)
    for algo_name, algo in algorithms.items():
        # Not every estimator takes `random_state`; probe a default-constructed
        # instance rather than hard-coding which classes do.
        if 'random_state' in algo().get_params():
            seed_kwargs = {'random_state': RANDOM_STATE}
        else:
            seed_kwargs = {}

        for c in n_clusters_list:
            model = algo(n_clusters=c, **seed_kwargs)
            labels = model.fit_predict(X_processed)

            # The three metrics require 2 <= n_labels <= n_samples - 1;
            # outside that range they raise, so record NaN instead.
            n_labels = len(np.unique(labels))
            if 2 <= n_labels < n_samples:
                scores = {
                    'Silhouette': silhouette_score(X_processed, labels),
                    'Calinski-Harabasz': calinski_harabasz_score(X_processed, labels),
                    'Davies-Bouldins': davies_bouldin_score(X_processed, labels),
                }
            else:
                scores = {'Silhouette': np.nan, 'Calinski-Harabasz': np.nan, 'Davies-Bouldins': np.nan}

            results.append({
                'Algorithm': algo_name,
                'Preprocessing': preprocess_name,
                'Clusters': c,
                **scores
            })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [None]:

# Generate pivot tables
# One table per metric: rows are (Algorithm, Preprocessing) pairs, columns are
# the cluster counts. The generator yields the tables in the same order as the
# names on the left-hand side.
silhouette_table, calinski_table, davies_table = (
    results_df.pivot_table(
        index=['Algorithm', 'Preprocessing'],
        columns='Clusters',
        values=metric,
    )
    for metric in ('Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins')
)

In [None]:

# Display results
# Print each pivot table under its heading, in the order: Silhouette,
# Calinski-Harabasz, Davies-Bouldin.
for heading, table in (
    ("Silhouette Scores:\n", silhouette_table),
    ("\nCalinski-Harabasz Scores:\n", calinski_table),
    ("\nDavies-Bouldin Scores:\n", davies_table),
):
    print(heading, table)


Silhouette Scores:
 Clusters                                 3         4         5
Algorithm    Preprocessing                                    
Hierarchical No Data Processing   0.554324  0.488967  0.484383
             Using Normalization  0.446689  0.400636  0.330587
             Using PCA            0.598475  0.540977  0.548784
             Using T+N            0.511060  0.448735  0.404169
K-Means      No Data Processing   0.552819  0.496251  0.450033
             Using Normalization  0.456535  0.400798  0.341947
             Using PCA            0.597676  0.559111  0.508971
             Using T+N            0.505196  0.474424  0.412758

Calinski-Harabasz Scores:
 Clusters                                   3           4           5
Algorithm    Preprocessing                                          
Hierarchical No Data Processing   558.058041  515.078906  488.484904
             Using Normalization  222.719164  201.251454  192.681283
             Using PCA            688.617548  

In [None]:

# Best configuration
# The winner is the row of results_df with the largest Silhouette value.
best_index = results_df['Silhouette'].idxmax()
best_row = results_df.loc[best_index]
print("\nBest Configuration:\n", best_row)


Best Configuration:
 Algorithm            Hierarchical
Preprocessing           Using PCA
Clusters                        3
Silhouette               0.598475
Calinski-Harabasz      688.617548
Davies-Bouldins          0.560496
Name: 15, dtype: object
