# Clustering

In [None]:
!pip install scikit-learn
!pip install -U kaleido
import os

import kaleido
import numpy as np
import plotly.graph_objects as go
from plotly.colors import sample_colorscale
from sklearn.cluster import Birch, OPTICS, MeanShift
from sklearn.metrics import silhouette_score, silhouette_samples
from tqdm import tqdm

## Load metrics and combine them




In [None]:
# Entries of `dimensions` are used further down as per-layer feature counts.
dimensions = np.load("/content/dimensions.npy")

# Read the three per-feature metrics (same files, same order as before).
# NOTE(review): the betweenness file is named "estimated_betweenness"; whether
# its values are already normalized is not visible here — confirm.
normalized_gradient, normalized_degree, normalized_betweenness = (
    np.load(metric_file)
    for metric_file in (
        "/content/normalized_gradient.npy",
        "/content/normalized_degree.npy",
        "/content/estimated_betweenness.npy",
    )
)

# Feature matrix with one column per metric; column_stack needs the three
# arrays to have matching lengths.
data = np.column_stack(
    (normalized_gradient, normalized_degree, normalized_betweenness)
)

## Perform Clustering

### BIRCH

In [None]:
# Build the BIRCH model.
# n_clusters=None: the final global clustering step is skipped and the
#   subclusters of the CF tree are returned as the clusters, so the number of
#   clusters is driven entirely by `threshold` and `branching_factor`.
# threshold: a sample is merged into its closest subcluster only if the radius
#   of the merged subcluster stays below this value; a smaller threshold
#   yields more, smaller clusters.
# branching_factor: maximum number of CF subclusters held by one tree node; a
#   node that would exceed it is split.
brc = Birch(n_clusters=None, threshold=0.006, branching_factor=50)  # AlexNet
# For VGG use: Birch(n_clusters=None, threshold=0.0055, branching_factor=50)

# Fit the model and read back one label per row of `data`.
brc.fit(data)
brc_labels = brc.labels_

# Number of distinct labels actually assigned.
brc_n_clusters_ = len(set(brc_labels))

print(f"Number of clusters found: {brc_n_clusters_}")
print("Cluster labels:", brc_labels)

# exist_ok=True replaces the separate isdir() check (no check-then-create gap).
os.makedirs('./output', exist_ok=True)

# The silhouette score is only defined for at least two distinct labels.
# NOTE: silhouette_score also raises ValueError when every sample ends up in
# its own cluster (n_labels == n_samples); that case is not handled here.
if brc_n_clusters_ > 1:
    brc_silhouette_score = silhouette_score(data, brc_labels)
    birch_report = f"Birch Silhouette Score: {brc_silhouette_score}"
else:
    birch_report = "No clusters found."

# The same line goes to the notebook output and to the results file.
print(birch_report)
with open("./output/birch_silhouette.txt", "w") as f:
    f.write(birch_report)


In [None]:
# VGG
# threshold=0.003, branching_factor=50 -> Birch Silhouette Score: 0.3829386426838363
# threshold=0.005, branching_factor=50 -> Birch Silhouette Score: 0.441202207247106
# threshold=0.0055, branching_factor=40 -> Birch Silhouette Score: 0.6105407601787337
# threshold=0.0055, branching_factor=45 -> Birch Silhouette Score: 0.6105407601787337
# threshold=0.0055, branching_factor=50 -> Birch Silhouette Score: 0.6105407601787337     !!
# threshold=0.0055, branching_factor=55 -> Birch Silhouette Score: 0.5439725211155807
# threshold=0.00555, branching_factor=50 -> Birch Silhouette Score: 0.5439725211155807

In [None]:
# AlexNet
# threshold=0.0009, branching_factor=50 -> Birch Silhouette Score: 0.3145571051208082
# threshold=0.001, branching_factor=50 -> Birch Silhouette Score: 0.33439062996942615
# threshold=0.002, branching_factor=50 -> Birch Silhouette Score: 0.3317107114269574
# threshold=0.003, branching_factor=50 -> Birch Silhouette Score: 0.3450148471350093
# threshold=0.004, branching_factor=50 -> Birch Silhouette Score: 0.35959601886342196
# threshold=0.005, branching_factor=50 -> Birch Silhouette Score: 0.4015941252813832
# threshold=0.006, branching_factor=50 -> Birch Silhouette Score: 0.4108755805633992      !!
# threshold=0.007, branching_factor=50 -> Birch Silhouette Score: 0.3817190935016746

In [None]:
# Sorted distinct BIRCH labels; list position k below corresponds to
# brc_unique_clusters[k].
brc_unique_clusters = np.unique(brc_labels)

# For each metric, pick out the values of every cluster with a boolean mask
# over the label vector (one progress bar per metric).
brc_clustered_normalized_gradient = [
    normalized_gradient[brc_labels == cluster_id]
    for cluster_id in tqdm(brc_unique_clusters)
]
brc_clustered_normalized_degree = [
    normalized_degree[brc_labels == cluster_id]
    for cluster_id in tqdm(brc_unique_clusters)
]
brc_clustered_normalized_betweenness = [
    normalized_betweenness[brc_labels == cluster_id]
    for cluster_id in tqdm(brc_unique_clusters)
]

### MeanShift

In [None]:
# Build the MeanShift model.
# bandwidth=None: the kernel bandwidth is not fixed by hand; scikit-learn
#   estimates it from the data (sklearn.cluster.estimate_bandwidth).
# bin_seeding=True: initial kernel positions come from a coarse grid binning
#   of the points instead of from every point, which reduces the number of
#   seeds and speeds up fitting.
ms = MeanShift(bandwidth=None, bin_seeding=True)

# Fit the model and read back one label per row of `data`.
ms.fit(data)
ms_labels = ms.labels_

# Number of distinct labels actually assigned.
ms_n_clusters_ = len(set(ms_labels))

print(f"MeanShift: Number of clusters found: {ms_n_clusters_}")
print("MeanShift: Cluster labels:", ms_labels)

# exist_ok=True replaces the separate isdir() check (no check-then-create gap).
os.makedirs('./output', exist_ok=True)

# The silhouette score is only defined for at least two distinct labels.
if ms_n_clusters_ > 1:
    ms_silhouette_score = silhouette_score(data, ms_labels)
    meanshift_report = f"Mean Shift Silhouette Score: {ms_silhouette_score}"
else:
    meanshift_report = "No clusters found."

# The same line goes to the notebook output and to the results file.
print(meanshift_report)
with open("./output/meanshift_silhouette.txt", "w") as f:
    f.write(meanshift_report)

In [None]:
# Sorted distinct MeanShift labels; list position k below corresponds to
# ms_unique_clusters[k].
ms_unique_clusters = np.unique(ms_labels)

# For each metric, pick out the values of every cluster with a boolean mask
# over the label vector (one progress bar per metric).
ms_clustered_normalized_gradient = [
    normalized_gradient[ms_labels == cluster_id]
    for cluster_id in tqdm(ms_unique_clusters)
]
ms_clustered_normalized_degree = [
    normalized_degree[ms_labels == cluster_id]
    for cluster_id in tqdm(ms_unique_clusters)
]
ms_clustered_normalized_betweenness = [
    normalized_betweenness[ms_labels == cluster_id]
    for cluster_id in tqdm(ms_unique_clusters)
]

## Visualize Clusters in a 3D graph

### BIRCH

In [None]:
# 3D scatter of the BIRCH clusters: one trace (and legend entry) per cluster.
fig = go.Figure()

# Pick one explicit Viridis colour per cluster. marker.colorscale only takes
# effect when marker.color is a numerical array, so the previous scalar
# `color=i` never used Viridis and the traces fell back to the default colour
# cycle (which repeats once there are more clusters than cycle entries).
brc_trace_count = len(brc_clustered_normalized_gradient)
brc_trace_colors = sample_colorscale(
    'Viridis',
    # Evenly spaced positions in [0, 1]; max(..., 1) guards the 1-cluster case.
    [k / max(brc_trace_count - 1, 1) for k in range(brc_trace_count)],
)

brc_cluster_points = zip(
    brc_clustered_normalized_gradient,
    brc_clustered_normalized_degree,
    brc_clustered_normalized_betweenness,
)
for i, (gradient_vals, degree_vals, betweenness_vals) in enumerate(brc_cluster_points):
    fig.add_trace(go.Scatter3d(
        x=gradient_vals,
        y=degree_vals,
        z=betweenness_vals,
        mode='markers',
        marker=dict(
            size=2,
            color=brc_trace_colors[i],  # Distinct Viridis colour per cluster
            opacity=0.8
        ),
        name="Cluster "+str(i+1),  # Legend numbering is 1-based
        showlegend=True
    ))

# Axis titles and a legend pinned to the top-left corner
fig.update_layout(
    title='3D Clustering Visualization with Birch',
    scene=dict(
        xaxis_title='Normalized Gradient',
        yaxis_title='Normalized Degree',
        zaxis_title='Normalized Betweenness'
    ),
    legend=dict(
        x=0,
        y=1,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=13,
            color="black"
        )
    )
)

# exist_ok=True replaces the separate isdir() check.
os.makedirs('./output', exist_ok=True)

# Interactive HTML export
fig.write_html("./output/3D_Graph_Clustering_Birch.html")

# Static PNG export (plotly delegates image rendering to kaleido)
fig.write_image("./output/3D_Graph_Clustering_Birch.png")

# Show the interactive plot
fig.show()

### MeanShift

In [None]:
# 3D scatter of the MeanShift clusters: one trace (and legend entry) per cluster.
fig = go.Figure()

# Pick one explicit Viridis colour per cluster. marker.colorscale only takes
# effect when marker.color is a numerical array, so the previous scalar
# `color=i` never used Viridis and the traces fell back to the default colour
# cycle (which repeats once there are more clusters than cycle entries).
ms_trace_count = len(ms_clustered_normalized_gradient)
ms_trace_colors = sample_colorscale(
    'Viridis',
    # Evenly spaced positions in [0, 1]; max(..., 1) guards the 1-cluster case.
    [k / max(ms_trace_count - 1, 1) for k in range(ms_trace_count)],
)

ms_cluster_points = zip(
    ms_clustered_normalized_gradient,
    ms_clustered_normalized_degree,
    ms_clustered_normalized_betweenness,
)
for i, (gradient_vals, degree_vals, betweenness_vals) in enumerate(ms_cluster_points):
    fig.add_trace(go.Scatter3d(
        x=gradient_vals,
        y=degree_vals,
        z=betweenness_vals,
        mode='markers',
        marker=dict(
            size=2,
            color=ms_trace_colors[i],  # Distinct Viridis colour per cluster
            opacity=0.8
        ),
        name="Cluster "+str(i+1),  # Legend numbering is 1-based
        showlegend=True
    ))

# Axis titles and a legend pinned to the top-left corner
fig.update_layout(
    title='3D Clustering Visualization with Mean Shift',
    scene=dict(
        xaxis_title='Normalized Gradient',
        yaxis_title='Normalized Degree',
        zaxis_title='Normalized Betweenness'
    ),
    legend=dict(
        x=0,
        y=1,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=13,
            color="black"
        )
    )
)

# exist_ok=True replaces the separate isdir() check.
os.makedirs('./output', exist_ok=True)

# Interactive HTML export
fig.write_html("./output/3D_Graph_Clustering_MeanShift.html")

# Static PNG export (plotly delegates image rendering to kaleido)
fig.write_image("./output/3D_Graph_Clustering_MeanShift.png")

# Show the interactive plot
fig.show()

## Plot features dividing them by layer

In [None]:
# Tag every feature with the index (0-3) of the layer it belongs to: layer k
# contributes as many labels as the k-th listed entry of `dimensions`.
# NOTE(review): only entries 0, 2, 4 and 5 of `dimensions` are used; why
# entries 1 and 3 are skipped is not visible here — confirm.
layer_labels = np.concatenate([
    np.full(dimensions[size_idx], layer_id)
    for layer_id, size_idx in enumerate((0, 2, 4, 5))
])
unique_layer_labels = np.unique(layer_labels)

# For each metric, pick out the values of every layer with a boolean mask
# over the label vector (one progress bar per metric).
layer_normalized_gradient = [
    normalized_gradient[layer_labels == layer_id]
    for layer_id in tqdm(unique_layer_labels)
]
layer_normalized_degree = [
    normalized_degree[layer_labels == layer_id]
    for layer_id in tqdm(unique_layer_labels)
]
layer_normalized_betweenness = [
    normalized_betweenness[layer_labels == layer_id]
    for layer_id in tqdm(unique_layer_labels)
]

In [None]:
# 3D scatter of the features grouped by layer: one trace (and legend entry)
# per layer.
fig = go.Figure()

# Pick one explicit Viridis colour per layer. marker.colorscale only takes
# effect when marker.color is a numerical array, so the previous scalar
# `color=i` never used Viridis and the traces fell back to the default colour
# cycle.
layer_trace_count = len(layer_normalized_gradient)
layer_trace_colors = sample_colorscale(
    'Viridis',
    # Evenly spaced positions in [0, 1]; max(..., 1) guards the 1-layer case.
    [k / max(layer_trace_count - 1, 1) for k in range(layer_trace_count)],
)

layer_points = zip(
    layer_normalized_gradient,
    layer_normalized_degree,
    layer_normalized_betweenness,
)
for i, (gradient_vals, degree_vals, betweenness_vals) in enumerate(layer_points):
    fig.add_trace(go.Scatter3d(
        x=gradient_vals,
        y=degree_vals,
        z=betweenness_vals,
        mode='markers',
        marker=dict(
            size=2,
            color=layer_trace_colors[i],  # Distinct Viridis colour per layer
            opacity=0.8
        ),
        name="Layer "+str(i),  # Layer numbering is 0-based
        showlegend=True
    ))

# Axis titles and a legend pinned to the top-left corner
fig.update_layout(
    title='3D Visualization of features divided by layer',
    scene=dict(
        xaxis_title='Normalized Gradient',
        yaxis_title='Normalized Degree',
        zaxis_title='Normalized Betweenness'
    ),
    legend=dict(
        x=0,
        y=1,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=13,
            color="black"
        )
    )
)

# exist_ok=True replaces the separate isdir() check.
os.makedirs('./output', exist_ok=True)

# Interactive HTML export
fig.write_html("./output/3D_Graph_Features_Layers.html")

# Static PNG export (plotly delegates image rendering to kaleido)
fig.write_image("./output/3D_Graph_Features_Layers.png")

# Show the interactive plot
fig.show()

## Compress Output Folder

In [None]:
# Bundle everything under /content/output into a single archive for download.
# NOTE(review): the cells above write to the relative path ./output, so this
# assumes the notebook's working directory is /content — confirm.
!zip -r /content/Clustering_Results.zip /content/output