In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import silhouette_score, davies_bouldin_score
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
import umap.umap_ as umap
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
X_train = pd.read_csv('data/umist_cropped.csv')

In [3]:
def apply_scaling(data, method='standard',):
    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()
    else:
        raise ValueError("Invalid scaling method. Choose 'standard', 'minmax', or 'robust'.")
    
    return scaler.fit_transform(data)


def apply_reduction(data, method='pca', random_state=42):
    if method == 'pca':
        reducer = PCA(n_components=0.99,random_state=random_state)
    elif method == 'umap':
        reducer = Pipeline([
            ('pca', PCA(n_components=0.99,random_state=random_state)),
            ('umap', umap.UMAP(n_components=10,random_state=random_state))
        ])
    else:
        raise ValueError("Invalid reduction method. Choose 'pca', or 'umap'.")
    
    return reducer.fit_transform(data)

In [4]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

def cluster_kmeans(data, n_clusters=20, random_state=42, init='k-means++',n_init=50):
    model = KMeans(n_clusters=n_clusters, random_state=random_state, init=init, n_init=n_init)
    labels = model.fit_predict(data)
    score = silhouette_score(data, labels, metric='euclidean')
    return score


In [5]:
scaling_methods = ['standard', 'minmax', 'robust']
reduction_methods = ['pca', 'umap']

results = []

for scale in scaling_methods:
    data_scaled = apply_scaling(X_train, method=scale)
    for reduction in reduction_methods:
        data_scaled = apply_reduction(data_scaled, method=reduction)
        score = cluster_kmeans(data_scaled)
        print(f'kmeans {scale}, {reduction}, {score}')
        results.append({
            'Scaling': scale,
            'Reduction': reduction,
            'Clustering': 'kmeans',
            'Silhouette Score': score
        })


kmeans standard, pca, 0.11159319350371254
kmeans standard, umap, 0.484418123960495
kmeans minmax, pca, 0.11887469269726746
kmeans minmax, umap, 0.4843291640281677
kmeans robust, pca, 0.11138358232030555
kmeans robust, umap, 0.4910973608493805


In [6]:
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, squareform, euclidean, cityblock, cosine, minkowski
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster


def cluster_hierarchical(data, method='average', metric='euclidean'):
    # Compute the distance matrix
    distance_matrix = pdist(data, metric=metric)
    # Perform hierarchical clustering
    distances = linkage(distance_matrix, method=method)
    # Form clusters with a max of 20 clusters
    clusters = fcluster(distances, 20, criterion='maxclust')
    # Compute the silhouette score
    score = silhouette_score(data, clusters, metric=metric)
    return score

In [10]:
scaling_methods = ['standard', 'minmax']
reduction_methods = ['pca', 'umap']
metrics = ['minkowski', 'cosine', 'cityblock', 'euclidean']
methods = ['centroid', 'single', 'complete', 'average']

results = []

# Collect results
for scale in scaling_methods:
    data_scaled = apply_scaling(X_train, method=scale)
    for reduction in reduction_methods:
        data_scaled = apply_reduction(data_scaled, method=reduction)
        for metric in metrics:
            for method in methods:
                score = cluster_hierarchical(data_scaled, method=method, metric=metric)
                results.append({
                    'Scaling': scale,
                    'Reduction': reduction,
                    'Clustering': f'hierarchical ({metric}, {method})',
                    'Silhouette Score': score,
                })

# Sort the results by Silhouette Score or DBCV in descending order
results_sorted = sorted(results, key=lambda x: x['Silhouette Score'], reverse=True)

# Print sorted results
for result in results_sorted:
    print(f"hierarchical {result['Clustering']}, {result['Scaling']}, {result['Reduction']}, Silhouette Score: {result['Silhouette Score']}")

hierarchical hierarchical (cosine, centroid), minmax, umap, Silhouette Score: 0.642193615436554
hierarchical hierarchical (cosine, average), standard, umap, Silhouette Score: 0.6352740526199341
hierarchical hierarchical (cosine, centroid), standard, umap, Silhouette Score: 0.6342313885688782
hierarchical hierarchical (cosine, average), minmax, umap, Silhouette Score: 0.6174703240394592
hierarchical hierarchical (cosine, complete), standard, umap, Silhouette Score: 0.6076905131340027
hierarchical hierarchical (cosine, complete), minmax, umap, Silhouette Score: 0.5709967017173767
hierarchical hierarchical (minkowski, centroid), standard, umap, Silhouette Score: 0.4780019744916986
hierarchical hierarchical (euclidean, centroid), standard, umap, Silhouette Score: 0.4780019521713257
hierarchical hierarchical (euclidean, average), standard, umap, Silhouette Score: 0.47655507922172546
hierarchical hierarchical (minkowski, average), standard, umap, Silhouette Score: 0.4765550534770971
hierarch

In [8]:
from sklearn.mixture import GaussianMixture

def cluster_gmm(data, cov_type = 'full'):
    # Create and fit the Gaussian Mixture Model
    model = GaussianMixture(n_components=20, random_state=42, covariance_type=cov_type)
    model.fit(data)
    # Predict the labels
    labels = model.predict(data)
    # Calculate the silhouette score
    score = silhouette_score(data, labels)
    return score

In [9]:
scaling_methods = ['standard', 'minmax']
covariance_types = ['full', 'tied', 'diag', 'spherical']

for scale in scaling_methods:
    data_scaled = apply_scaling(X_train, method=scale)
    for reduction in reduction_methods:
        for cov in covariance_types:
            data_reduced = apply_reduction(data_scaled, method=reduction)
            score = cluster_gmm(data_reduced, cov_type=cov)
            print(f'gmm {scale}, {reduction}, {score}')
            results.append({
                'Scaling': scale,
                'Reduction': reduction,
                'Clustering': 'GMM',
                'Silhouette Score': score
            })

gmm standard, pca, 0.10458112179464027
gmm standard, pca, 0.10458112179464027
gmm standard, pca, 0.09438252706175396
gmm standard, pca, 0.1013205287034697
gmm standard, umap, 0.4409905970096588
gmm standard, umap, 0.4289001524448395
gmm standard, umap, 0.4467089772224426
gmm standard, umap, 0.4419021010398865
gmm minmax, pca, 0.11067264190642666
gmm minmax, pca, 0.11088471544958428
gmm minmax, pca, 0.1091609557721799
gmm minmax, pca, 0.11084793008315155
gmm minmax, umap, 0.4899003207683563
gmm minmax, umap, 0.47197702527046204
gmm minmax, umap, 0.49154043197631836
gmm minmax, umap, 0.4908941686153412
