In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import silhouette_score, adjusted_rand_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer

In [2]:
# Loading the Wisconsin Breast Cancer Dataset
data = load_breast_cancer()
X = data.data
Y = data.target

# Preprocess Data - Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
# Train the K-means model with default parameters
kmeans_default = KMeans(n_clusters=2, random_state=42)
kmeans_default.fit(X_scaled)

# Predictions
y_pred_default = kmeans_default.predict(X_scaled)

# Evaluate Performance (Silhouette Score, Inertia, and Adjusted Rand Index)
sil_score_default = silhouette_score(X_scaled, y_pred_default)
ari_score_default = adjusted_rand_score(Y, y_pred_default)
inertia_default = kmeans_default.inertia_

# Display Results for Default Model
print("Default K-means Clustering Results:")
print(f"Silhouette Score: {sil_score_default:.4f}")
print(f"Adjusted Rand Index: {ari_score_default:.4f}")
print(f"Inertia: {inertia_default:.4f}")
print("\n")

Default K-means Clustering Results:
Silhouette Score: 0.3447
Adjusted Rand Index: 0.6765
Inertia: 11595.6833






In [4]:
param_grid = {
    'n_clusters': [2, 3, 4, 5, 6],      # Number of clusters
    'init': ['k-means++', 'random'],    # Initialization methods
    'max_iter': [300, 500, 700]         # Maximum iterations
}

# Create KMeans model for GridSearch
kmeans = KMeans(random_state=42)

# Set up the GridSearchCV
grid_search = GridSearchCV(kmeans, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the model with hyperparameter tuning
grid_search.fit(X_scaled)

# Best Parameters from Grid Search
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Using the best model from the grid search
best_model = grid_search.best_estimator_

# Predictions with the best model
y_pred_best = best_model.predict(X_scaled)

# Evaluate Performance with Silhouette Score, Inertia, and Adjusted Rand Index
sil_score_best = silhouette_score(X_scaled, y_pred_best)
ari_score_best = adjusted_rand_score(Y, y_pred_best)
inertia_best = best_model.inertia_

# Display Results for Tuned Model
print("Tuned K-means Clustering Results:")
print(f"Silhouette Score: {sil_score_best:.4f}")
print(f"Adjusted Rand Index: {ari_score_best:.4f}")
print(f"Inertia: {inertia_best:.4f}")
print("\n")

# --- 3. Compare Results ---
print("Comparison of Performance Metrics:")
print(f"Silhouette Score (Default vs Tuned): {sil_score_default:.4f} vs {sil_score_best:.4f}")
print(f"Adjusted Rand Index (Default vs Tuned): {ari_score_default:.4f} vs {ari_score_best:.4f}")
print(f"Inertia (Default vs Tuned): {inertia_default:.4f} vs {inertia_best:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'init': 'random', 'max_iter': 300, 'n_clusters': 6}
Tuned K-means Clustering Results:
Silhouette Score: 0.1696
Adjusted Rand Index: 0.3304
Inertia: 7974.6167


Comparison of Performance Metrics:
Silhouette Score (Default vs Tuned): 0.3447 vs 0.1696
Adjusted Rand Index (Default vs Tuned): 0.6765 vs 0.3304
Inertia (Default vs Tuned): 11595.6833 vs 7974.6167




In [10]:
# Summary of both models
print("\nPerformance Comparison:")
print(f"{'Metric':<30} {'Basic Model':<15} {'Tuned Model':<15}")
print(f"{'-'*60}")
print(f"{'Silhouette Score':<30} {sil_score_best:.3f} {' ' * 6} {sil_score_default:.3f}")
print(f"{'Adjusted Rand Index':<30} {ari_score_best:.3f} {' ' * 6} {ari_score_default:.3f}")
print(f"{'Inertia':<30} {inertia_best:.3f} {' ' * 3} {inertia_default:.3f}")


Performance Comparison:
Metric                         Basic Model     Tuned Model    
------------------------------------------------------------
Silhouette Score               0.170        0.345
Adjusted Rand Index            0.330        0.677
Inertia                        7974.617     11595.683


In [11]:
print("Tuned K-means Clustering Results:")
print(f"Silhouette Score: {sil_score_default:.4f}")
print(f"Adjusted Rand Index: {ari_score_default:.4f}")
print(f"Inertia: {inertia_default:.4f}")
print("\n")

Tuned K-means Clustering Results:
Silhouette Score: 0.3447
Adjusted Rand Index: 0.6765
Inertia: 11595.6833




In [13]:
print("Default K-means Clustering Results:")
print(f"Silhouette Score: {sil_score_best:.4f}")
print(f"Adjusted Rand Index: {ari_score_best:.4f}")
print(f"Inertia: {inertia_best:.4f}")
print("\n")

print("Tuned K-means Clustering Results:")
print(f"Silhouette Score: {sil_score_default:.4f}")
print(f"Adjusted Rand Index: {ari_score_default:.4f}")
print(f"Inertia: {inertia_default:.4f}")
print("\n")

# Summary of both models
print("\nPerformance Comparison:")
print(f"{'Metric':<30} {'Basic Model':<15} {'Tuned Model':<15}")
print(f"{'-'*60}")
print(f"{'Silhouette Score':<30} {sil_score_best:.3f} {' ' * 6} {sil_score_default:.3f}")
print(f"{'Adjusted Rand Index':<30} {ari_score_best:.3f} {' ' * 6} {ari_score_default:.3f}")
print(f"{'Inertia':<30} {inertia_best:.3f} {' ' * 3} {inertia_default:.3f}")

Default K-means Clustering Results:
Silhouette Score: 0.1696
Adjusted Rand Index: 0.3304
Inertia: 7974.6167


Tuned K-means Clustering Results:
Silhouette Score: 0.3447
Adjusted Rand Index: 0.6765
Inertia: 11595.6833



Performance Comparison:
Metric                         Basic Model     Tuned Model    
------------------------------------------------------------
Silhouette Score               0.170        0.345
Adjusted Rand Index            0.330        0.677
Inertia                        7974.617     11595.683
