In [12]:
from dotenv import load_dotenv
# Read key/value pairs from a .env file (if one is found) into the process
# environment; the call returns a boolean, which is what the cell echoes.
load_dotenv()

True

In [13]:
import os
import shutil
import torch
import numpy as np
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import logging
import json

# Configure the root logger: records at ERROR level and above are written to
# 'clustering_errors.log' (relative to the working directory), each prefixed
# with a timestamp and the level name.
logging.basicConfig(
    filename='clustering_errors.log',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

import plotly.graph_objects as go

def plot_elbow_and_wcss(wcss, wcss_scores, max_clusters):
    """
    Show two interactive line charts over the evaluated cluster counts.

    The first chart plots ``wcss`` (labelled "Inertia") and the second plots
    ``wcss_scores``; both use k = 2 .. ``max_clusters`` on the x-axis.
    Nothing is drawn unless ``max_clusters`` is exactly 56.

    Args:
        wcss (list): One inertia value per tested cluster count.
        wcss_scores (list): One score per tested cluster count.
        max_clusters (int): Largest cluster count that was tested.
    """
    # Charts are only produced for the 56-cluster sweep; any other value is a no-op.
    if max_clusters != 56:
        return

    # x-axis: every cluster count from 2 up to and including max_clusters
    k_values = list(range(2, max_clusters + 1))

    # (figure title, y-axis title / trace name, marker colour, plotted series)
    chart_specs = (
        ("Elbow Method for Optimal Clusters", "Inertia", 'blue', wcss),
        ("WCSS Scores for Optimal Clusters", "WCSS Scores", 'green', wcss_scores),
    )

    for chart_title, series_label, colour, series in chart_specs:
        figure = go.Figure()
        figure.add_trace(go.Scatter(
            x=k_values, y=series, mode='lines+markers',
            name=series_label, marker=dict(color=colour)
        ))
        figure.update_layout(
            title=chart_title,
            xaxis_title="Number of Clusters (k)",
            yaxis_title=series_label,
            xaxis=dict(tickmode='linear')
        )
        figure.show()


def plot_interactive_clusters(embeddings, labels, filenames, centroids=None, method="umap"):
    """
    Render an interactive Plotly scatter of 2D points coloured by cluster.

    Args:
        embeddings (np.ndarray): Reduced embeddings; columns 0 and 1 are used as x and y.
        labels (np.ndarray): Cluster label per point (converted to strings for colouring).
        filenames (list): Image filename per point, shown on hover as ``image_id``.
        centroids (np.ndarray, optional): Centroid coordinates; columns 0 and 1 are
            drawn as red 'x' markers when provided.
        method (str): Name of the reduction method; only used in the figure title.
    """
    # One row per point; string labels make Plotly treat clusters as categories.
    points = pd.DataFrame({
        'x': embeddings[:, 0],
        'y': embeddings[:, 1],
        'cluster': labels.astype(str),
        'image_id': filenames
    })

    scatter_options = dict(
        x='x',
        y='y',
        color='cluster',
        hover_data=['cluster', 'image_id'],  # shown when hovering a point
        title=f"Cluster Visualization ({method.upper()})",
        color_discrete_sequence=px.colors.qualitative.G10,
    )
    fig = px.scatter(points, **scatter_options)

    # Overlay the centroids as a separate trace when they were supplied.
    if centroids is not None:
        centroid_points = pd.DataFrame({
            'x': centroids[:, 0],
            'y': centroids[:, 1],
            'cluster': ['Centroid'] * len(centroids)
        })
        centroid_marker = dict(size=15, symbol='x', color='red')
        fig.add_scatter(
            x=centroid_points['x'],
            y=centroid_points['y'],
            mode='markers',
            marker=centroid_marker,
            name='Centroid'
        )

    fig.update_layout(showlegend=True)
    fig.show()

def save_clustered_images(cluster_mapping, folder_path, destination_folder):
    """
    Copy every image into a ``Cluster_<id>`` subfolder of the destination.

    A file that cannot be copied is logged as an error and skipped; the
    remaining files are still processed.

    Args:
        cluster_mapping (dict): Cluster ID -> list of image filenames.
        folder_path (str): Directory holding the original images.
        destination_folder (str): Directory under which the cluster folders are created.
    """
    os.makedirs(destination_folder, exist_ok=True)

    for cluster_id in cluster_mapping:
        target_dir = os.path.join(destination_folder, f"Cluster_{cluster_id}")
        os.makedirs(target_dir, exist_ok=True)

        for image_name in cluster_mapping[cluster_id]:
            src = os.path.join(folder_path, image_name)
            dst = os.path.join(target_dir, image_name)
            try:
                shutil.copy(src, dst)
            except Exception as err:
                # Best effort: record the failure and carry on with the next file.
                logging.error(f"Error copying {image_name} to {target_dir}: {err}")

def process_and_cluster_embeddings(
    file_path,
    n_clusters,
    folder_path,
    destination_folder,
    max_images=10,
    method="umap",
    max_clusters=56,
    ground_truth_path=r"C:\Users\leonc\Thesis 2024\Toxic-Symbology\tools\result_list.json",
):
    """
    Processes embeddings, performs clustering, visualizes results, and organizes images by cluster.

    Steps, in order:
      1. Load the embeddings and filenames from ``file_path``.
      2. Fit KMeans for every k in 2..``max_clusters`` and hand the inertia and
         silhouette values to ``plot_elbow_and_wcss``.
      3. Fit KMeans with ``n_clusters`` and project points and centroids to 2D with UMAP.
      4. Plot the projection and copy the images into per-cluster folders.
      5. Score the clustering against reference labels.

    Args:
        file_path (str): Path to the embeddings file (a sequence of dicts with
            'embedding' and 'filename' keys, read via ``torch.load``).
        n_clusters (int): Number of clusters for the final KMeans fit.
        folder_path (str): Path to the folder containing original images.
        destination_folder (str): Path to save clustered images.
        max_images (int): Currently unused; kept for backward compatibility.
        method (str): Label shown in the plot title. The reduction itself is
            always UMAP regardless of this value.
        max_clusters (int): Largest k evaluated in the elbow/silhouette sweep.
        ground_truth_path (str): JSON file holding the reference labels used
            when ``n_clusters`` > 3. Defaults to the original hard-coded location.

    Returns:
        dict: 'Inertia', 'Silhouette Score', 'ARI' and 'NMI'. The silhouette is
        None when the final fit yields a single distinct label; 'ARI' and 'NMI'
        are None when no reference labelling is defined (``n_clusters`` < 2).
    """
    # NOTE(security): torch.load unpickles the file and can therefore execute
    # arbitrary code. Only load embedding files you created or fully trust.
    data = torch.load(file_path)
    embeddings = np.array([item['embedding'] for item in data])
    filenames = [item['filename'] for item in data]

    # Sweep k = 2..max_clusters to collect the elbow and silhouette curves.
    wcss = []
    wcss_scores = []
    for k in range(2, max_clusters + 1):
        sweep_model = KMeans(n_clusters=k, random_state=172391)
        sweep_labels = sweep_model.fit_predict(embeddings)

        # WCSS is equivalent to inertia
        wcss.append(sweep_model.inertia_)
        # Despite the list name, these are silhouette scores.
        wcss_scores.append(silhouette_score(embeddings, sweep_labels))

    # Plot Elbow and Silhouette scores
    plot_elbow_and_wcss(wcss, wcss_scores, max_clusters)

    # Perform clustering with user-defined n_clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=172391)
    labels = kmeans.fit_predict(embeddings)

    # Dimensionality reduction: fit on the points, then project the centroids
    # with the same fitted reducer so both share one 2D space.
    reducer = umap.UMAP(n_components=2, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)
    reduced_centroids = reducer.transform(kmeans.cluster_centers_)

    # Interactive cluster visualization
    plot_interactive_clusters(reduced_embeddings, labels, filenames, reduced_centroids, method=method)

    # Organize and save images by cluster (plain int keys give clean folder names)
    cluster_mapping = {i: [] for i in range(n_clusters)}
    for label, filename in zip(labels, filenames):
        cluster_mapping[int(label)].append(filename)

    save_clustered_images(cluster_mapping, folder_path, destination_folder)

    # Pick the reference labelling for the external metrics.
    # NOTE(review): the 3-cluster run is scored against a 2-class reference and
    # the 2-cluster run against a 3-class reference. This is kept exactly as in
    # the original code; confirm that the pairing is intended and not swapped.
    if n_clusters > 3:
        with open(ground_truth_path, "r") as file:
            reference_labels = json.load(file)
    elif n_clusters == 3:
        reference_labels = [0] * 1029 + [1] * 500
    elif n_clusters == 2:
        reference_labels = [0] * 500 + [1] * 500 + [2] * 529
    else:
        # No reference is defined below two clusters; previously this path left
        # `results` unbound and the function failed on return.
        reference_labels = None

    # Evaluate clustering performance
    has_multiple_clusters = len(np.unique(labels)) > 1
    has_reference = reference_labels is not None
    results = {
        'Inertia': kmeans.inertia_,
        'Silhouette Score': silhouette_score(embeddings, labels) if has_multiple_clusters else None,
        'ARI': adjusted_rand_score(reference_labels, labels) if has_reference else None,
        'NMI': (
            normalized_mutual_info_score(reference_labels, labels, average_method='arithmetic')
            if has_reference else None
        ),
    }

    return results

In [14]:
# Pipeline run: toxic-symbol embeddings, 56 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/toxic_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/toxics_symbol_images_filtered_renamed"
    destination_folder = "56_clusters"
    n_clusters = 56  # cluster count for this run

    # Sweep k up to 56, cluster, visualise, and copy images into per-cluster folders
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=56,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



KeyboardInterrupt: 

In [None]:
# Pipeline run: toxic-symbol embeddings, 40 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/toxic_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/toxics_symbol_images_filtered_renamed"
    destination_folder = "40_clusters"
    n_clusters = 40  # cluster count for this run

    # Sweep k up to 40, cluster, visualise, and copy images into per-cluster folders
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=40,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



{'Inertia': 21855.2421875, 'Silhouette Score': 0.024774827, 'ARI': 0.08674229061010263, 'NMI': 0.49263859703766044}


In [None]:
# Pipeline run: toxic-symbol embeddings, 30 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/toxic_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/toxics_symbol_images_filtered_renamed"
    destination_folder = "30_clusters"
    n_clusters = 30  # cluster count for this run

    # Sweep k up to 30, cluster, visualise, and copy images into per-cluster folders
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=30,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



{'Inertia': 22602.9296875, 'Silhouette Score': 0.028449774, 'ARI': 0.08707846550310154, 'NMI': 0.4631526249092337}


In [11]:
# Pipeline run: toxic-symbol embeddings, 20 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/toxic_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/toxics_symbol_images_filtered_renamed"
    destination_folder = "20_clusters"
    n_clusters = 20  # cluster count for this run

    # Sweep k up to 20, cluster, visualise, and copy images into per-cluster folders
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=20,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



{'Inertia': 23912.984375, 'Silhouette Score': 0.032086268, 'ARI': 0.0708116274358607, 'NMI': 0.4023533245655003}


In [7]:
# Pipeline run: combined-image embeddings (all_embeddings.pt), 2 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/all_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/combined_images_filtered_renamed"
    destination_folder = "2_clusters"
    n_clusters = 2  # cluster count for this run

    # With max_clusters=2 the sweep only evaluates k=2 before the final fit
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=2,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



{'Inertia': 73479.0859375, 'Silhouette Score': 0.07849856, 'ARI': 0.38282856319390224, 'NMI': 0.4132777048564306}


In [8]:
# Pipeline run: combined-image embeddings (all_embeddings.pt), 3 clusters
if __name__ == "__main__":
    embeddings_file = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/Image Clustering/all_embeddings.pt"
    folder_path = "C:/Users/leonc/Thesis 2024/Toxic-Symbology/complete expansion/combined_images_filtered_renamed"
    destination_folder = "3_clusters"
    n_clusters = 3  # cluster count for this run

    # With max_clusters=3 the sweep evaluates k=2 and k=3 before the final fit
    results = process_and_cluster_embeddings(
        file_path=embeddings_file,
        n_clusters=n_clusters,
        folder_path=folder_path,
        destination_folder=destination_folder,
        max_images=50,
        method="umap",
        max_clusters=3,
    )

    # Show the evaluation metrics
    print(results)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



{'Inertia': 71160.03125, 'Silhouette Score': 0.029954152, 'ARI': 0.10900330141176524, 'NMI': 0.17422777194931632}


In [11]:
import os

# Count the files sitting directly inside Cluster_1 of the 2-cluster run.
# Only the first os.walk() entry is taken, so sub-directories are not descended into.
_, _, files = next(os.walk(r"C:\Users\leonc\Thesis 2024\Toxic-Symbology\Image Clustering\2_clusters\Cluster_1"))
print(len(files))

988
