In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

In [None]:
from birch import birch_cluster, k_means, CFLeaf, CFBranch, collect_leaf_centroids

In [None]:
# Read the header-less sample spreadsheet and label its two columns in a single chained expression.
dataset = pd.read_excel('sample_data.xlsx', header=None).set_axis(
    ['Feature 1', 'Feature 2'], axis='columns'
)
# Preview the first rows as the cell output.
dataset.head()

In [None]:
# Display pandas' summary statistics for the loaded columns as the cell output.
dataset.describe()

In [None]:
# Scatter plot of the raw data, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(dataset['Feature 1'], dataset['Feature 2'], c='blue', label='Data Points')
ax.set(xlabel="Feature 1", ylabel="Feature 2", title='Scatter Plot of Data')
ax.legend()
# Equal scaling on both axes so distances between points are not visually distorted.
ax.axis('equal')
plt.show()

In [None]:
# Convert the DataFrame rows into a list of plain tuples (index dropped, no namedtuple wrapper).
# This list is what the later cells pass to birch_cluster and k_means.
data = list(dataset.itertuples(index=False, name=None))

In [None]:
# Parameters handed to birch_cluster in the next cell.
# NOTE(review): their exact semantics are defined in the local `birch` module — presumably the
# maximum number of children per CF node and the maximum leaf radius; confirm against that module.
branching_factor = 2
threshold = 1.5

In [None]:
# Run BIRCH over the point list; the returned object is used as the CF tree root by the cells below.
cf_tree = birch_cluster(data, branching_factor, threshold)

In [None]:
# CF tree visualisation: indented outline of branches and leaves with their attributes
def print_cf_tree(node, level=0):
    """Print the CF (sub)tree rooted at ``node`` as an indented outline.

    A ``CFLeaf`` is printed on one line with its ``points`` and ``radius``
    attributes. A ``CFBranch`` prints a ``Branch:`` header and then recurses
    into every entry of its ``children``, one level deeper. A node that is
    neither type prints nothing.

    Args:
        node: Root of the (sub)tree to print.
        level: Depth of ``node``; each level adds two spaces of indentation.
    """
    pad = " " * (2 * level)

    if isinstance(node, CFLeaf):
        print(f"{pad}Leaf: points={node.points}, radius={node.radius}")
        return

    if not isinstance(node, CFBranch):
        # Unknown node type: silently ignored.
        return

    print(f"{pad}Branch:")
    for sub_node in node.children:
        print_cf_tree(sub_node, level + 1)

In [None]:
# Print the CF tree structure, starting at the root with no indentation (level defaults to 0)
print_cf_tree(cf_tree)

In [None]:
# Tree traversal: print a numbered report for every leaf (cluster) of the BIRCH CF tree
def traverse_tree(node, cluster_number=1):
    """Walk the CF tree depth-first and print one report per leaf.

    For each ``CFLeaf`` the report contains its number, its ``points``, the
    ``N``, ``LS`` and ``SS`` attributes of its ``cluster_features``, and its
    ``radius``, followed by a blank line. Any other node is treated as a
    branch and its ``children`` are visited in order.

    Args:
        node: Root of the (sub)tree to report on.
        cluster_number: Number to give the first leaf encountered.

    Returns:
        The number the next leaf would receive, i.e. ``cluster_number`` plus
        the count of leaves visited under ``node``.
    """
    if not isinstance(node, CFLeaf):
        # Branch: thread the running counter through each child in turn.
        next_number = cluster_number
        for sub_node in node.children:
            next_number = traverse_tree(sub_node, next_number)
        return next_number

    print(f"Cluster {cluster_number}:")
    print("Points:", node.points)
    features = node.cluster_features
    print("Cluster Features (N, LS, SS):", features.N, features.LS, features.SS)
    print("Radius:", node.radius)
    print()
    return cluster_number + 1

In [None]:
# Analyze the Birch clustering result.
# The trailing semicolon stops the notebook from echoing traverse_tree's integer return value
# (the next unused cluster number) as a stray cell output beneath the printed report.
traverse_tree(cf_tree);

In [None]:
def plot_clusters(root):
    """Scatter-plot the points held by each leaf of the CF tree, one series per leaf.

    Leaves are collected depth-first (children in stored order) and labelled
    ``Cluster 1``, ``Cluster 2``, ... in that order. No colour is passed to
    ``plt.scatter``, so colouring is left to matplotlib.

    Args:
        root: Root node of the CF tree; anything that is not a ``CFLeaf`` is
            expected to expose a ``children`` iterable.
    """
    leaf_point_sets = []

    def gather(node):
        # Depth-first walk that records the point list of every leaf it reaches.
        if isinstance(node, CFLeaf):
            leaf_point_sets.append(node.points)
            return
        for sub_node in node.children:
            gather(sub_node)

    gather(root)

    plt.figure(figsize=(12, 10))
    # One scatter series per leaf; the first two coordinates of each point are plotted.
    for number, leaf_points in enumerate(leaf_point_sets, start=1):
        xs = [p[0] for p in leaf_points]
        ys = [p[1] for p in leaf_points]
        plt.scatter(xs, ys, label=f"Cluster {number}")

    plt.title('BIRCH Clustering Results')
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.show()

# Assuming cf_tree is the root of your BIRCH clustering result
# (the value returned by birch_cluster earlier in the notebook), draw one scatter series per leaf.
plot_clusters(cf_tree)

In [None]:
# Collecting Centroids: Extract Leaf Node Centroids from the BIRCH CF Tree
centroids = collect_leaf_centroids(cf_tree)
# Array form so the plotting and K-Means cells can slice coordinate columns with [:, 0] / [:, 1].
array_centroids = np.array(centroids)
# Transpose the list of point tuples into one sequence per coordinate for plotting.
x, y = zip(*data)

In [None]:
# Overlay the BIRCH leaf centroids (red X markers) on the raw data points.
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c="blue", label='Data Points')
plt.scatter(array_centroids[:, 0], array_centroids[:, 1], c="red", label='Centroids', marker='X')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Data and Centroids")
plt.legend()
# Render explicitly, like every other plotting cell; without this the cell's last expression
# (the Legend object) is echoed as text output next to the figure.
plt.show()

In [None]:
K = len(array_centroids)  # The number of clusters K is set to the number of centroids from BIRCH
# Passed to k_means as its last argument.
# NOTE(review): presumably an upper bound on K-Means iterations — confirm in the `birch` module.
max_iterations = 100

In [None]:
# Final plot showing clusters
# Run K-Means seeded with the BIRCH leaf centroids. `clusters[j]` is compared against the
# cluster index for data[j] below, and the second return value is sliced as a 2-D array.
clusters, k_array_centriods = k_means(data, K, array_centroids, max_iterations)

plt.figure(figsize=(12, 10))
for i in range(K):
    members = [data[j] for j in range(len(data)) if clusters[j] == i]
    # Skip clusters that received no points. Test the member count, not `points.any()`:
    # `.any()` is also False for a non-empty cluster whose coordinates are all zero,
    # which would silently drop that cluster from the figure.
    if not members:
        continue
    points = np.array(members)
    # No explicit colour: matplotlib's colour cycle gives each scatter call its own colour and
    # is identical on every run, unlike the previous unseeded random hex string.
    plt.scatter(points[:, 0], points[:, 1], s=50, label=f'Cluster {i+1}')
plt.scatter(k_array_centriods[:, 0], k_array_centriods[:, 1], s=100, c='red', label='Centroids', marker='X')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Clusters and Centroids after K-Means")
plt.legend()
# Equal scaling on both axes so cluster shapes are not visually distorted.
plt.axis('equal')
plt.show()