In [1]:
# @title
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import precision_score
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from matplotlib.patches import Patch
from matplotlib.lines import Line2D


def read_csv_to_array(file_path, has_header=True):
    """
    Read a CSV file into a pandas DataFrame.

    Despite the historical name, no NumPy conversion happens here; callers
    receive the DataFrame itself.

    Parameters:
    file_path: str, path object or file-like object accepted by ``pd.read_csv``
    has_header: bool, whether the first row of the file holds column names.
        When False the first row is kept as data and pandas assigns integer
        column labels (0, 1, 2, ...).

    Returns:
    (df, header): the loaded DataFrame and its column index, or ``None`` in
        place of the column index when the file has no header row.
    """
    # header=0 -> the first row names the columns; header=None -> every row is data.
    # Without this, a headerless file would silently lose its first observation.
    df = pd.read_csv(file_path, header=0 if has_header else None)

    header = df.columns if has_header else None

    return df, header

def _majority_label_per_cluster(true_labels, cluster_ids):
    """
    Rename every cluster to the ground-truth class most frequent among its members.

    Cluster ids produced by a clustering model are arbitrary (cluster 0 is not
    "class 0"), so they must be aligned with the encoded class ids before any
    classification metric is meaningful.

    Parameters:
    true_labels: 1-D array of non-negative integer class ids
    cluster_ids: 1-D array of cluster ids, same length as ``true_labels``

    Returns:
    1-D array, same shape as ``true_labels``, holding the majority class id of
    the cluster each sample was assigned to.
    """
    aligned = np.empty_like(true_labels)
    for cluster in np.unique(cluster_ids):
        members = cluster_ids == cluster
        aligned[members] = np.bincount(true_labels[members]).argmax()
    return aligned


def _cluster_and_visualize(banner, model, file_path, k, data_columns, label_column, has_header):
    """
    Shared pipeline behind the k-means and GMM entry points: load the CSV, fit
    ``model`` on the selected feature columns, draw the pair plots and report
    the precision of the clustering against the ground-truth labels.

    Parameters:
    banner: str, separator line printed before anything else
    model: estimator exposing ``fit_predict(data)`` that returns integer cluster ids
    file_path: str, the path to the CSV file
    k: int, the number of clusters (used for the cluster legend)
    data_columns: column selector passed to ``df.loc[:, ...]`` for the features
    label_column: name of the column holding the ground-truth labels
    has_header: bool, whether the CSV file has a header or not

    Returns:
    precision: float, micro-averaged precision after each cluster has been
        renamed to its majority ground-truth class.
    """
    print(banner)

    # Load the dataset
    df, _ = read_csv_to_array(file_path, has_header)

    # Select the features and discard incomplete rows: the estimators reject NaN
    feature_frame = df.loc[:, data_columns]
    complete = feature_frame.notna().all(axis=1) & df[label_column].notna()
    dropped = int((~complete).sum())
    if dropped:
        print(f"Dropped {dropped} rows with missing values")
    data = feature_frame.loc[complete].values

    # Encode the ground-truth labels as integers 0..n_classes-1
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df.loc[complete, label_column])
    class_names = label_encoder.classes_

    # Fit the clustering model
    predicted_labels = model.fit_predict(data)

    # Axis names come from the *selected* feature columns, not from the full CSV
    # header, so they stay correct when the file has extra columns (e.g. an Id).
    axis_names = [
        str(name) if has_header else f"Feature {idx}"
        for idx, name in enumerate(feature_frame.columns)
    ]

    features = data.shape[1]
    # squeeze=False keeps axs two-dimensional even for a single feature
    fig, axs = plt.subplots(features, features, figsize=(15, 15), squeeze=False)
    marker_styles = ['o', 's', '^', 'x', '<', '>', 'd', 'h', '*', '+']  # Different markers for species
    colors = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']  # Colors for clusters

    # Only the upper triangle of the grid is populated; hide everything first so
    # the diagonal and lower triangle do not show up as empty frames.
    for ax in axs.flat:
        ax.set_visible(False)

    for (x_idx, y_idx) in combinations(range(features), 2):
        ax = axs[x_idx, y_idx]
        ax.set_visible(True)
        ax.set_xlabel(axis_names[x_idx])
        ax.set_ylabel(axis_names[y_idx])

        # Marker shape encodes the ground truth, color encodes the computed cluster.
        # Both palettes are cycled so more than ten classes/clusters cannot overflow them.
        for j, class_name in enumerate(class_names):
            label_mask = labels == j
            ax.scatter(data[label_mask, x_idx], data[label_mask, y_idx],
                       c=[colors[p % len(colors)] for p in predicted_labels[label_mask]],
                       marker=marker_styles[j % len(marker_styles)],
                       label=f"Species: {class_name}", alpha=0.7)

    # Legend entries: markers for species (ground truth) ...
    species_legend = [
        Line2D([0], [0], marker=marker_styles[j % len(marker_styles)], color='w',
               markerfacecolor='k', markersize=10, label=f"Species: {class_name}")
        for j, class_name in enumerate(class_names)
    ]

    # ... and colors for clusters
    cluster_legend = [
        Patch(facecolor=colors[i % len(colors)], edgecolor='w', label=f"Cluster {i}")
        for i in range(k)
    ]

    # Place the combined legend outside the plot
    fig.legend(handles=species_legend + cluster_legend, loc='center right', borderaxespad=0.1,
               title="Legend", fontsize='large', title_fontsize='x-large')

    plt.tight_layout()
    plt.subplots_adjust(right=0.85)  # Adjust plot to make room for the legend
    plt.show()

    # Compute precision score on cluster ids aligned with the class ids; comparing
    # the raw ids would make the score depend on the arbitrary cluster numbering.
    aligned_labels = _majority_label_per_cluster(labels, predicted_labels)
    precision = precision_score(labels, aligned_labels, average='micro')
    print("Precision Score:", precision)

    return precision


def kmeans_clustering_and_visualization(file_path, k, data_columns, label_column, has_header=True,
                                        random_state=None):
    """
    Perform k-means clustering on the dataset from a CSV file, visualize the result using pair plots
    where the ground truth is represented by point shapes (encoded string labels), and the computed clusters
    are represented by colors. Also, compute precision score, with an external legend.

    Parameters:
    file_path: str, the path to the CSV file
    k: int, the number of clusters to compute
    data_columns: column selector (typically a list of column names) for the features to cluster on
    label_column: name of the column holding the ground-truth labels
    has_header: bool, whether the CSV file has a header or not
    random_state: optional seed forwarded to ``KMeans`` for reproducible runs

    Returns:
    precision: float, the precision score of the clustering.
    """
    return _cluster_and_visualize(
        '---------------------------------------------------------------k-Means---------------------------------------------------------------',
        KMeans(n_clusters=k, random_state=random_state),
        file_path, k, data_columns, label_column, has_header,
    )


def gmm_clustering_and_visualization(file_path, k, data_columns, label_column, has_header=True,
                                     random_state=None):
    """
    Perform Gaussian Mixture Model (GMM) clustering on the dataset from a CSV file, visualize the result using pair plots
    where the ground truth is represented by point shapes (encoded string labels), and the computed clusters
    are represented by colors. Also, compute precision score, with an external legend.

    Parameters:
    file_path: str, the path to the CSV file
    k: int, the number of mixture components (clusters) to compute
    data_columns: column selector (typically a list of column names) for the features to cluster on
    label_column: name of the column holding the ground-truth labels
    has_header: bool, whether the CSV file has a header or not
    random_state: optional seed forwarded to ``GaussianMixture`` for reproducible runs

    Returns:
    precision: float, the precision score of the clustering.
    """
    return _cluster_and_visualize(
        '---------------------------------------------------------------GMM---------------------------------------------------------------',
        GaussianMixture(n_components=k, random_state=random_state),
        file_path, k, data_columns, label_column, has_header,
    )

In [None]:
# Dataset to analyse (alternative: 'penguins.csv')
file_path = 'iris.csv'

# Run both clustering methods, k-means first and then GMM, with identical settings:
# three clusters on the four measurement columns, scored against 'Species'.
for run_clustering in (kmeans_clustering_and_visualization, gmm_clustering_and_visualization):
    run_clustering(
        file_path,
        3,
        ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
        'Species',
    )