## Evaluating Clustering Algorithms - Intrinsic Metrics

### Silhouette Score

In [1]:
import numpy as np

# Sample data: three small 2-D clusters, three (x, y) points each
cluster_1 = np.array([[2, 5], [3, 4], [4, 6]])
cluster_2 = np.array([[8, 3], [9, 2], [10, 5]])
cluster_3 = np.array([[6, 10], [7, 8], [8, 9]])

# Stack the clusters row-wise (cluster 1 first, cluster 3 last) into one dataset
all_points = np.vstack([cluster_1, cluster_2, cluster_3])

# Function to calculate Euclidean distance
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between ``point1`` and ``point2``.

    The arguments must support elementwise subtraction with each other,
    e.g. NumPy arrays of the same shape.
    """
    offset = point1 - point2
    squared_length = np.sum(offset ** 2)
    return np.sqrt(squared_length)

# Function to calculate the silhouette score for each point
def silhouette_score(all_points, labels):
    """Compute the silhouette coefficient of every point.

    For a point ``i`` let ``a`` be the mean distance to the *other* members
    of its own cluster and ``b`` the smallest mean distance to the members of
    any other cluster. Its coefficient is ``(b - a) / max(a, b)``.

    Args:
        all_points: Array-like of shape (n_points, n_features). A flat
            sequence of scalars is treated as one feature per point.
        labels: Array-like of length n_points; ``labels[i]`` is the cluster
            of ``all_points[i]``.

    Returns:
        np.ndarray of n_points floats. A point scores 0.0 when the
        coefficient is undefined: its cluster has no other member, fewer
        than two clusters are present, or ``a`` and ``b`` are both zero.

    Raises:
        ValueError: If ``labels`` and ``all_points`` differ in length.
    """
    points = np.asarray(all_points, dtype=float)
    labels = np.asarray(labels)
    n = len(points)
    if len(labels) != n:
        raise ValueError(
            f"labels has length {len(labels)} but all_points has {n} points"
        )

    scores = np.zeros(n)
    if n == 0:
        return scores
    points = points.reshape(n, -1)

    cluster_ids = np.unique(labels)
    if len(cluster_ids) < 2:
        # No "nearest other cluster" exists, so b is undefined for every point.
        return scores

    for i in range(n):
        own_mask = labels == labels[i]
        own_size = own_mask.sum()
        if own_size < 2:
            # Singleton cluster: a is undefined; the conventional score is 0,
            # not the 1.0 that treating a as 0 would produce.
            continue

        # Distances from point i to every point, computed in one pass.
        distances = np.sqrt(np.sum((points - points[i]) ** 2, axis=1))

        # The point's distance to itself is 0, so summing over the whole
        # cluster and dividing by (size - 1) averages over the other members.
        a = distances[own_mask].sum() / (own_size - 1)
        b = min(
            distances[labels == other].mean()
            for other in cluster_ids
            if other != labels[i]
        )

        widest = max(a, b)
        if widest > 0:
            scores[i] = (b - a) / widest

    return scores

# Cluster label of each row of all_points, in row order:
# rows 0-2 -> cluster 1, rows 3-5 -> cluster 2, rows 6-8 -> cluster 3
labels = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])

# Per-point silhouette coefficients
scores = silhouette_score(all_points, labels)

# Report each point's coefficient (numbered from 1), then the overall mean
for i, score in enumerate(scores):
    print(f"Point {i + 1}: Silhouette Score = {score:.4f}")

average_score = scores.mean()
print(f"\nAverage Silhouette Score: {average_score:.4f}")


Point 1: Silhouette Score = 0.7184
Point 2: Silhouette Score = 0.7039
Point 3: Silhouette Score = 0.4870
Point 4: Silhouette Score = 0.6125
Point 5: Silhouette Score = 0.6626
Point 6: Silhouette Score = 0.4056
Point 7: Silhouette Score = 0.6185
Point 8: Silhouette Score = 0.6372
Point 9: Silhouette Score = 0.6879

Average Silhouette Score: 0.6149


### Davies-Bouldin Index

In [8]:
import numpy as np

# Function to calculate Euclidean distance
def euclidean_distance(a, b):
    """Return the L2 norm of the difference ``a - b``.

    ``a`` and ``b`` must support subtraction with each other, e.g. NumPy
    arrays of the same shape.
    """
    difference = a - b
    return np.linalg.norm(difference)

# Function to calculate the centroid of a cluster
def calculate_centroid(cluster):
    """Return the centroid of ``cluster``: its mean taken along axis 0.

    For a (n_points, n_features) array this is the per-feature average
    over all points.
    """
    centroid = np.mean(cluster, axis=0)
    return centroid

# Function to calculate the intra-cluster distance of a single cluster (mean distance from its points to its centroid)
def calculate_intra_distance(cluster, centroid):
    """Return the mean distance from the points of ``cluster`` to ``centroid``.

    ``cluster`` is iterated point by point; each distance comes from
    ``euclidean_distance``.
    """
    distances_to_centroid = []
    for point in cluster:
        distances_to_centroid.append(euclidean_distance(point, centroid))
    return np.mean(distances_to_centroid)

# Function to calculate the Davies-Bouldin Index
def davies_bouldin_index(clusters):
    """Return the Davies-Bouldin Index of ``clusters``.

    For every cluster ``i`` the worst-case similarity
    ``max over j != i of (S[i] + S[j]) / M_ij`` is taken, where ``S`` is the
    intra-cluster distance from ``calculate_intra_distance`` and ``M_ij`` the
    distance between the centroids of clusters ``i`` and ``j``. The index is
    the average of those per-cluster values.

    Side effect: prints each cluster's 1-based number and its worst-case
    similarity.

    Args:
        clusters: Sequence of clusters, each a collection of points.

    Returns:
        The sum of the per-cluster worst-case similarities divided by the
        number of clusters.
    """
    cluster_count = len(clusters)
    centers = [calculate_centroid(members) for members in clusters]

    # Scatter of each cluster around its own centroid
    scatter = [
        calculate_intra_distance(members, center)
        for members, center in zip(clusters, centers)
    ]

    total = 0
    for i, center_i in enumerate(centers):
        # Similarity of cluster i to every other cluster
        similarities = []
        for j in range(cluster_count):
            if j == i:
                continue
            separation = euclidean_distance(center_i, centers[j])
            similarities.append((scatter[i] + scatter[j]) / separation)

        # Leading 0 is the floor used when there is no other cluster
        worst = max([0, *similarities])
        print(i+1, worst)
        total += worst

    # Average the worst-case similarities over all clusters
    return total / cluster_count

# Example data: three clusters, each an array of (x, y) points
clusters = [
    np.array([(2, 5), (3, 4), (4, 6)]),    # Cluster 1
    np.array([(8, 3), (9, 2), (10, 5)]),   # Cluster 2
    np.array([(6, 10), (7, 8), (8, 9)]),   # Cluster 3
]

# Score the clustering (this also prints one line per cluster)
db_index = davies_bouldin_index(clusters)

print(f"Davies-Bouldin Index: {db_index:.4f}")


1 0.41459587266912035
2 0.4296313927591706
3 0.4296313927591706
Davies-Bouldin Index: 0.4246


### Within-Cluster Sum of Squares

In [9]:
import numpy as np

# Function to calculate centroid of a cluster
def calculate_centroid(cluster):
    """Return the mean of ``cluster`` along axis 0, i.e. its centroid.

    For a (n_points, n_features) array the result has one entry per feature.
    """
    center = np.mean(cluster, axis=0)
    return center

# Function to calculate WCSS for a cluster
def calculate_wcss(cluster, centroid):
    """Return the within-cluster sum of squares of one cluster.

    This is the sum, over all points, of the squared Euclidean distance
    between the point and ``centroid``.

    Args:
        cluster: Array-like of points, shape (n_points, n_features).
        centroid: Array-like broadcastable against ``cluster``, normally of
            shape (n_features,).

    Returns:
        The sum of squared distances as a float (0.0 for an empty cluster).
    """
    offsets = np.asarray(cluster) - np.asarray(centroid)
    # Sum the squared components directly. Taking the norm and squaring it
    # again would round-trip through a square root and lose precision.
    return np.sum(np.abs(offsets) ** 2, dtype=float)

# Function to calculate WCSS for multiple clusters
def calculate_wcss_for_clusters(clusters):
    """Compute the WCSS of every cluster and their grand total.

    Each cluster's centroid comes from ``calculate_centroid`` and its WCSS
    from ``calculate_wcss``.

    Args:
        clusters: Iterable of clusters.

    Returns:
        A ``(wcss_per_cluster, total_wcss)`` tuple. ``wcss_per_cluster`` maps
        ``"Cluster <k>"`` (k counted from 1, in iteration order) to that
        cluster's WCSS; ``total_wcss`` is the sum of those values.
    """
    per_cluster = {}
    running_total = 0

    for number, members in enumerate(clusters, start=1):
        value = calculate_wcss(members, calculate_centroid(members))
        per_cluster[f"Cluster {number}"] = value
        running_total += value

    return per_cluster, running_total

# Example usage
# Any number of clusters may be listed here; each is an array of (x, y) points
clusters = [
    np.array([(2, 5), (3, 4), (4, 6)]),    # Cluster 1
    np.array([(8, 3), (9, 2), (10, 5)]),   # Cluster 2
    np.array([(6, 10), (7, 8), (8, 9)]),   # Cluster 3
]

# Per-cluster WCSS plus the overall total
wcss_per_cluster, total_wcss = calculate_wcss_for_clusters(clusters)

# Report each cluster's WCSS, then the total
for cluster, wcss in wcss_per_cluster.items():
    print(f"{cluster} WCSS: {wcss:.4f}")

print(f"Total WCSS: {total_wcss:.4f}")


Cluster 1 WCSS: 4.0000
Cluster 2 WCSS: 6.6667
Cluster 3 WCSS: 4.0000
Total WCSS: 14.6667


### Dunn's Index

In [10]:
import numpy as np
from itertools import combinations

def euclidean_distance(point1, point2):
    """Return the L2 norm of ``point1 - point2``.

    Both points must support subtraction with each other, e.g. NumPy arrays
    of the same shape.
    """
    separation = point1 - point2
    return np.linalg.norm(separation)

def cluster_diameter(cluster):
    """Return the largest distance between any two points of ``cluster``.

    Every unordered pair of points is measured with ``euclidean_distance``.
    The result is 0 when the cluster has fewer than two points.
    """
    widest = 0
    for first, second in combinations(cluster, 2):
        widest = max(widest, euclidean_distance(first, second))
    return widest

def min_inter_cluster_distance(clusters):
    """Return the smallest distance between points of different clusters.

    Every unordered pair of clusters is examined, and for each pair every
    point of one cluster is compared with every point of the other.

    Args:
        clusters: Sequence of clusters, each an array-like of points with
            shape (n_points, n_features). A flat sequence of scalars is
            treated as one feature per point.

    Returns:
        The minimum Euclidean distance found, or ``float('inf')`` when there
        is no pair of points from two different clusters (fewer than two
        clusters, or the clusters involved are empty).
    """
    smallest = float('inf')
    for cluster_a, cluster_b in combinations(clusters, 2):
        points_a = np.asarray(cluster_a, dtype=float)
        points_b = np.asarray(cluster_b, dtype=float)
        if points_a.size == 0 or points_b.size == 0:
            # An empty cluster contributes no point pairs.
            continue
        points_a = points_a.reshape(len(points_a), -1)
        points_b = points_b.reshape(len(points_b), -1)

        # Broadcasting builds the full (len_a, len_b) distance matrix at once
        # instead of looping over point pairs in Python.
        gaps = np.linalg.norm(
            points_a[:, None, :] - points_b[None, :, :], axis=-1
        )
        smallest = min(smallest, gaps.min())
    return smallest

def dunn_index(clusters):
    """Return Dunn's index of ``clusters``.

    The index is the smallest inter-cluster distance (from
    ``min_inter_cluster_distance``) divided by the largest cluster diameter
    (from ``cluster_diameter``).
    """
    separation = min_inter_cluster_distance(clusters)
    diameters = [cluster_diameter(members) for members in clusters]
    return separation / max(diameters)

# Example data: three clusters, each a NumPy array of (x, y) points
clusters = [
    np.array([(2, 5), (3, 4), (4, 6)]),    # Cluster 1
    np.array([(8, 3), (9, 2), (10, 5)]),   # Cluster 2
    np.array([(6, 10), (7, 8), (8, 9)]),   # Cluster 3
]

# Compute and report Dunn's index for the example clustering
dunn_index_value = dunn_index(clusters)
print(f"Dunn's Index: {dunn_index_value:.4f}")

Dunn's Index: 1.1402
