In [None]:
# Generate a synthetic dataset and visualize its distribution before clustering it.

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Draw 300 two-feature points around 3 centers (fixed seed, so reruns match).
# `y` holds the generating blob index of each point; the plot below ignores it.
X, y = make_blobs(
    n_samples=300,
    centers=3,
    cluster_std=1.0,
    random_state=42,
)

# Show the raw, uncoloured points on a single axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], s=50, alpha=0.7)
ax.set(title="Data for Clustering", xlabel="Feature 1", ylabel="Feature 2")
plt.show()

In [None]:
# Use K-Means to group data points into clusters based on centroids.

from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means model and keep each point's assigned cluster index.
kmeans = KMeans(n_clusters=3, random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Colour every point by its assigned cluster, then overlay the fitted centroids.
centroids = kmeans.cluster_centers_
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', s=50, alpha=0.7)
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    s=200,
    c='red',
    marker='X',
    label='Centroids',
)
ax.set(title="K-Means Clustering", xlabel="Feature 1", ylabel="Feature 2")
ax.legend()
plt.show()

In [None]:
# Use the Elbow Method to determine the optimal number of clusters for K-Means.

# Candidate cluster counts to evaluate (1 through 9).
K_range = range(1, 10)

# Fit one throwaway model per K and record its inertia (within-cluster sum of
# squared distances). The models are deliberately NOT bound to the name
# `kmeans`, so the fitted 3-cluster model from the K-Means cell is not
# overwritten by the last (K=9) model of this sweep.
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X).inertia_
    for k in K_range
]

# Plot the Elbow curve: look for the K after which inertia stops dropping sharply.
plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker='o')
plt.xticks(list(K_range))  # K is an integer; avoid fractional tick labels
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Use DBSCAN to group data points based on density, capturing non-linear cluster shapes and identifying noise.

from sklearn.cluster import DBSCAN

# Initialize and fit DBSCAN model.
# eps is the neighbourhood radius; min_samples is the number of neighbours a
# point needs to count as a core point. Both usually need tuning per dataset.
dbscan = DBSCAN(eps=0.5, min_samples=5)
y_dbscan = dbscan.fit_predict(X)

# DBSCAN labels noise points with -1. Split them out so they are drawn as
# their own series instead of being coloured like just another cluster.
is_noise = y_dbscan == -1
is_clustered = ~is_noise

# Plot DBSCAN clustering results
plt.figure(figsize=(8, 6))
plt.scatter(
    X[is_clustered, 0],
    X[is_clustered, 1],
    c=y_dbscan[is_clustered],
    cmap='plasma',
    s=50,
    alpha=0.7,
)
if is_noise.any():
    # Only add the noise series (and its legend entry) when noise exists.
    plt.scatter(
        X[is_noise, 0],
        X[is_noise, 1],
        c='gray',
        marker='x',
        s=50,
        label='Noise',
    )
    plt.legend()
plt.title("DBSCAN Clustering")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

In [None]:
# Summarize how many clusters (and noise points) K-Means and DBSCAN each found.

# K-Means assigns every point to a cluster and never emits a noise label,
# so its distinct labels are exactly its clusters.
n_clusters_kmeans = len(set(y_kmeans))

# DBSCAN uses the label -1 for noise; it is not a cluster, so drop it
# before counting and report the noise points separately.
n_clusters_dbscan = len(set(y_dbscan) - {-1})
n_noise_dbscan = int((y_dbscan == -1).sum())

print("K-Means Summary:")
print(f"Number of Clusters: {n_clusters_kmeans}")
print("\nDBSCAN Summary:")
# The count excludes noise, so the label says so.
print(f"Number of Clusters (excluding noise): {n_clusters_dbscan}")
print(f"Noise Points: {n_noise_dbscan}")

# K-Means performs well for spherical clusters but struggles with non-linear shapes.
# DBSCAN excels at non-linear clusters and detects outliers but requires tuning (eps and min_samples).