In [20]:
# minimize(Cost = percent of dataset with < 5% cluster label confidence)
# Subject #to: 30 < num_clusters < 100

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load embeddings
embeddings = np.load(r'..\bertopic\preprocessed_data\embeddings.csv.npy')

In [22]:
from umap import UMAP
from hdbscan import HDBSCAN

def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      min_samples,
                      cluster_selection_epsilon,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                min_dist=0,
                                random_state=random_state)
                            .fit_transform(message_embeddings))

    clusters = HDBSCAN(min_cluster_size = min_cluster_size,
                               min_samples=min_samples,
                               cluster_selection_epsilon=cluster_selection_epsilon,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters

In [23]:

def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost

In [28]:
from tqdm import tqdm
import random

def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in tqdm(range(num_evals)):
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        min_samples = random.choice(space['min_samples'])
        cluster_selection_epsilon = random.choice(space['cluster_selection_epsilon'])
        
        clusters = generate_clusters(embeddings, 
                                     n_neighbors = n_neighbors, 
                                     n_components = n_components, 
                                     min_cluster_size = min_cluster_size,
                                     min_samples = min_samples,
                                     cluster_selection_epsilon = cluster_selection_epsilon,
                                     random_state = 42)
    
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, min_samples, cluster_selection_epsilon,
                        label_count, cost])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'min_samples', 'cluster_selection_epsilon', 'label_count', 'cost'])
    
    return result_df.sort_values(by='cost')

In [29]:
space = {
    'n_neighbors': range(5,30),
    'n_components': range(3,20),
    'min_cluster_size': range(5,30),
    'min_samples': range(1,10),
    'cluster_selection_epsilon': [i / 100 for i in range(1, 20)],
    'random_state': 42
}
random_use = random_search(embeddings, space, 100)

100%|██████████| 100/100 [38:08<00:00, 22.89s/it]


In [31]:
# See top models
random_use.head(25)

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,min_samples,cluster_selection_epsilon,label_count,cost
79,79,12,4,26,8,0.01,5,0.0
26,26,23,18,27,8,0.1,2,0.0
56,56,25,15,28,5,0.17,2,0.0
94,94,18,8,26,8,0.12,2,0.0
95,95,25,12,26,6,0.02,2,0.0
48,48,25,17,23,4,0.1,2,0.0
90,90,11,7,27,7,0.12,6,0.000546
58,58,10,18,23,7,0.09,7,0.003495
21,21,8,14,25,8,0.07,7,0.003604
65,65,13,12,27,8,0.08,6,0.00415
