In [20]:
# minimize(Cost = percent of dataset with < 5% cluster label confidence)
# Subject #to: 30 < num_clusters < 100

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load embeddings
embeddings = np.load(r'..\bertopic\preprocessed_data\embeddings.csv.npy')

In [2]:
from umap import UMAP
from hdbscan import HDBSCAN

def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      min_samples,
                      cluster_selection_epsilon,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                min_dist=0,
                                random_state=random_state)
                            .fit_transform(message_embeddings))

    clusters = HDBSCAN(min_cluster_size = min_cluster_size,
                               min_samples=min_samples,
                               cluster_selection_epsilon=cluster_selection_epsilon,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:

def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost

In [4]:
from tqdm import tqdm
import random

def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in tqdm(range(num_evals)):
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        min_samples = random.choice(space['min_samples'])
        cluster_selection_epsilon = random.choice(space['cluster_selection_epsilon'])
        
        clusters = generate_clusters(embeddings, 
                                     n_neighbors = n_neighbors, 
                                     n_components = n_components, 
                                     min_cluster_size = min_cluster_size,
                                     min_samples = min_samples,
                                     cluster_selection_epsilon = cluster_selection_epsilon,
                                     random_state = 42)
    
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, min_samples, cluster_selection_epsilon,
                        label_count, cost])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'min_samples', 'cluster_selection_epsilon', 'label_count', 'cost'])
    
    return result_df.sort_values(by='cost')

In [5]:
space = {
    'n_neighbors': range(5,30),
    'n_components': range(3,20),
    'min_cluster_size': range(5,30),
    'min_samples': range(1,10),
    'cluster_selection_epsilon': [i / 100 for i in range(1, 20)],
    'random_state': 42
}
random_use = random_search(embeddings, space, 100)

100%|██████████| 100/100 [46:41<00:00, 28.02s/it]


In [6]:
# See top models
random_use.head(50)

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,min_samples,cluster_selection_epsilon,label_count,cost
61,61,9,8,23,8,0.1,6,0.00372
28,28,7,17,16,4,0.19,72,0.179122
83,83,24,12,22,1,0.19,25,0.182843
17,17,19,15,10,1,0.16,104,0.185031
66,66,5,6,16,1,0.13,157,0.189627
78,78,28,17,8,1,0.16,110,0.195098
6,6,26,4,20,3,0.18,28,0.218733
16,16,9,4,9,5,0.18,96,0.225079
59,59,28,14,27,2,0.18,29,0.230003
14,14,27,6,17,3,0.19,37,0.242587
