In [3]:
import pandas as pd
import numpy as np
from permetrics import ClusteringMetric

# Load the embedding matrix that is later passed to BERTopic alongside the docs.
# Forward slashes are used so the relative path resolves on POSIX systems as
# well as on Windows (a backslash-separated path is one literal file name
# outside Windows).
embeddings = np.load('../bertopic/preprocessed_data/embeddings.csv.npy')

In [None]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Backend selection: try the cuml (GPU) implementations first and remember the
# outcome in USE_GPU, which the rest of the notebook reads to pick a code path.
USE_GPU = True
try:
    from cuml.manifold import UMAP as cuUMAP
    from cuml.cluster import HDBSCAN as cuHDBSCAN
    from cuml.metrics import silhouette_score as gpu_silhouette_score
except Exception:
    # Any failure while importing cuml switches to the CPU packages. Only the
    # names of the selected backend exist afterwards (cu* or cpu*, never both).
    # NOTE(review): the broad `except Exception` also hides non-import errors —
    # presumably intentional so a broken GPU setup still falls back; confirm.
    USE_GPU = False
    from umap import UMAP as cpuUMAP
    from hdbscan import HDBSCAN as cpuHDBSCAN
# Imported on both paths: the DBCV helper calls hdbscan's validity_index.
from hdbscan import validity as hdbscan_validity

In [None]:
def _to_numpy(x):
    """Convert cuDF/CuPy/torch tensors to NumPy; pass-through for NumPy."""
    try:
        import cupy as cp
        if isinstance(x, cp.ndarray):
            return cp.asnumpy(x)
    except Exception:
        pass
    # cuDF DataFrame/Series
    try:
        import cudf
        if isinstance(x, cudf.DataFrame):
            return x.to_pandas().to_numpy()
        if isinstance(x, cudf.Series):
            return x.to_pandas().to_numpy()
    except Exception:
        pass
    # torch tensor
    try:
        import torch
        if isinstance(x, torch.Tensor):
            return x.detach().cpu().numpy()
    except Exception:
        pass
    return np.asarray(x)

def _gpu_silhouette_or_cpu(X, labels, metric="euclidean"):
    """Try GPU silhouette; fall back to sklearn if needed. Filters to clustered points."""
    mask = labels != -1
    if mask.sum() < 2 or len(np.unique(labels[mask])) < 2:
        return np.nan
    Xm = X[mask]
    lm = labels[mask]
    if USE_GPU:
        try:
            # Expect CuPy on GPU; if we have NumPy, just use CPU below
            import cupy as cp
            if isinstance(X, cp.ndarray):
                return float(gpu_silhouette_score(Xm, lm, metric=metric))
        except Exception:
            pass
    # CPU fallback
    from sklearn.metrics import silhouette_score as cpu_silhouette_score
    return float(cpu_silhouette_score(_to_numpy(Xm), _to_numpy(lm), metric=metric))

In [None]:
def build_bertopic(n_neighbors, n_components, min_cluster_size, min_samples,
                   cluster_selection_epsilon, seed=42,
                   vectorizer=None, embedding_model=None,
                   calculate_probabilities=False, verbose=False):
    """Assemble an unfitted BERTopic model for one hyper-parameter setting.

    The UMAP and HDBSCAN classes come from the GPU backend when ``USE_GPU`` is
    true and from the CPU packages otherwise; both receive the same keyword
    arguments.

    Parameters
    ----------
    n_neighbors, n_components : int
        Passed to UMAP (cosine metric, ``min_dist=0.0``, ``random_state=seed``).
    min_cluster_size, min_samples : int
        Passed to HDBSCAN (euclidean metric, "eom" selection).
    cluster_selection_epsilon : float
        Passed to HDBSCAN.
    seed : int, default 42
        UMAP ``random_state``.
    vectorizer : optional
        Vectorizer for topic terms; when ``None`` a ``CountVectorizer`` with
        1-2 grams and English stop words is created.
    embedding_model : optional
        Forwarded to BERTopic (e.g. a SentenceTransformer).
    calculate_probabilities, verbose : bool
        Forwarded to BERTopic; ``verbose`` is also passed to UMAP.

    Returns
    -------
    BERTopic
        The configured, not yet fitted, topic model.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english")

    # Only the selected branch is evaluated, so the names of the backend that
    # was not imported are never touched.
    reducer_cls, clusterer_cls = (
        (cuUMAP, cuHDBSCAN) if USE_GPU else (cpuUMAP, cpuHDBSCAN)
    )

    reducer_kwargs = dict(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric="cosine",
        min_dist=0.0,
        random_state=seed,
        verbose=verbose,
    )
    clusterer_kwargs = dict(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        cluster_selection_epsilon=cluster_selection_epsilon,
        prediction_data=True,
    )

    umap_model = reducer_cls(**reducer_kwargs)
    hdbscan_model = clusterer_cls(**clusterer_kwargs)

    # nr_topics=None: no topic reduction/merging after clustering.
    return BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer,
        embedding_model=embedding_model,
        calculate_probabilities=calculate_probabilities,
        verbose=verbose,
        nr_topics=None,
    )

In [None]:
def dbcv_filtered(umap_embedding, topics_array):
    """DBCV validity index over the non-noise points of a clustering.

    Points labelled ``-1`` are dropped before scoring. Returns ``nan`` when
    fewer than two points remain, when they fall into fewer than two distinct
    clusters, or when the validity computation raises.
    """
    labels = np.asarray(topics_array)
    keep = labels != -1

    # Guard: the index needs at least two points spread over two clusters.
    if keep.sum() < 2 or len(np.unique(labels[keep])) < 2:
        return np.nan

    # Cast to float64 before handing the points to hdbscan's validity_index.
    points = _to_numpy(umap_embedding)[keep].astype("float64", copy=False)
    clustered_labels = labels[keep]
    try:
        score = hdbscan_validity.validity_index(points, clustered_labels)
        return float(score)
    except Exception:
        return np.nan


In [None]:
def random_search_bertopic(docs, embeddings, space, num_evals=50, seed=42,
                           vectorizer=None, embedding_model=None, verbose=False):
    """Random search over UMAP/HDBSCAN settings for BERTopic.

    For each of ``num_evals`` runs one value per hyper-parameter is drawn
    uniformly from ``space``, a BERTopic model is built and fitted on
    ``docs``/``embeddings``, and the clustering is scored on the UMAP
    embedding produced by that fit.

    Parameters
    ----------
    docs : list
        Documents passed to ``fit_transform``.
    embeddings : array-like
        Embeddings passed to ``fit_transform`` together with ``docs``.
    space : mapping
        Must provide an iterable of candidates under each of ``n_neighbors``,
        ``n_components``, ``min_cluster_size``, ``min_samples`` and
        ``cluster_selection_epsilon``.
    num_evals : int, default 50
        Number of sampled configurations.
    seed : int, default 42
        Seeds ``random`` and ``np.random`` and is forwarded to the model.
    vectorizer, embedding_model, verbose
        Forwarded to ``build_bertopic``.

    Returns
    -------
    pandas.DataFrame
        One row per run with the sampled parameters, ``n_topics``,
        ``noise_frac``, ``dbcv`` and ``silhouette``; sorted by DBCV
        (descending), then silhouette (descending), then noise fraction
        (ascending). The columns are present even when ``num_evals`` is 0.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Sampling order is fixed so a given seed always yields the same draws.
    param_names = (
        "n_neighbors",
        "n_components",
        "min_cluster_size",
        "min_samples",
        "cluster_selection_epsilon",
    )
    # Materialise each candidate list once instead of on every iteration.
    candidates = {name: list(space[name]) for name in param_names}

    # Declared up front so an empty search still returns a frame that has the
    # sort columns (a column-less frame would make sort_values raise KeyError).
    columns = [
        "run_id",
        "n_components",
        "n_neighbors",
        "min_cluster_size",
        "min_samples",
        "cluster_selection_epsilon",
        "n_topics",
        "noise_frac",
        "dbcv",
        "silhouette",
    ]

    rows = []
    for i in tqdm(range(num_evals)):
        params = {name: random.choice(candidates[name]) for name in param_names}

        tm = build_bertopic(
            seed=seed,
            vectorizer=vectorizer,
            embedding_model=embedding_model,
            calculate_probabilities=False,
            verbose=verbose,
            **params,
        )

        # Fit exactly like you will analyze later
        topics, _ = tm.fit_transform(docs, embeddings)
        topics_arr = np.asarray(topics)

        # Embedding stored on the UMAP model by the fit above; converted to
        # NumPy once and shared by both metrics.
        X_np = _to_numpy(tm.umap_model.embedding_)

        # 1) DBCV on clustered points only
        dbcv = dbcv_filtered(X_np, topics_arr)
        # 2) Silhouette (backup/secondary), also filtered
        sil = _gpu_silhouette_or_cpu(X_np, topics_arr, metric="euclidean")

        is_noise = topics_arr == -1
        rows.append({
            "run_id": i,
            **params,
            "n_topics": len(np.unique(topics_arr[~is_noise])),
            "noise_frac": float(np.mean(is_noise)),
            "dbcv": dbcv,
            "silhouette": sil,
        })

    df = pd.DataFrame(rows, columns=columns)
    # Primary sort by DBCV (desc), tie-breaker by silhouette (desc), then fewer noise
    return df.sort_values(
        by=["dbcv", "silhouette", "noise_frac"],
        ascending=[False, False, True],
    )

In [None]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("allenai-specter")
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

data = pd.read_csv(r'../get_data/alldata.csv')
docs = data['Abstract'].tolist()

# docs and embeddings are passed to fit_transform together, so fail early with
# a clear message if the two inputs do not have the same number of entries.
assert len(docs) == len(embeddings), (
    f"{len(docs)} documents but {len(embeddings)} embedding rows"
)

space = {
    "n_neighbors": range(5, 30),
    "n_components": range(3, 30),
    "min_cluster_size": range(10, 50),   # allow some smaller clusters to form
    "min_samples": range(1, 20),
    "cluster_selection_epsilon": [i / 100 for i in range(1, 25)],
}

res = random_search_bertopic(
    docs, embeddings, space, num_evals=100, seed=42,
    vectorizer=vectorizer_model,
    embedding_model=sentence_model,
    verbose=False
)
# Forward slashes keep the output path valid on POSIX as well as on Windows
# and match the separator style of the read_csv path above.
res.to_csv('../bertopic/preprocessed_data/tuning_results.csv', index=False)
res.head(20)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Unnamed: 0,run_id,n_components,n_neighbors,min_cluster_size,min_samples,cluster_selection_epsilon,label_count,cost,davies_bouldin
27,27,11,9,96,9,0.17,3,0.003065,1.642143
50,50,15,8,78,12,0.02,3,0.003065,1.642266
0,0,8,25,99,8,0.06,3,0.003065,1.644685
86,86,9,23,61,9,0.15,3,0.003065,1.644685
32,32,19,15,69,17,0.17,3,0.003065,1.644685
33,33,12,25,75,5,0.15,3,0.003065,1.644685
78,78,19,21,65,9,0.09,3,0.003065,1.644685
69,69,17,9,72,9,0.07,3,0.003065,1.644685
94,94,12,18,70,19,0.19,3,0.003065,1.65078
28,28,16,29,62,11,0.12,3,0.003065,1.65078


In [14]:
# Show the 50 rows with the smallest `cost` value.
# NOTE(review): `random_use_with_davies_bouldin` is not assigned in any cell
# visible in this notebook; unless it is defined elsewhere this line fails on
# a fresh kernel (Restart & Run All) — confirm where it comes from.
random_use_with_davies_bouldin.sort_values(by='cost').head(50)

Unnamed: 0,run_id,n_components,n_neighbors,min_cluster_size,min_samples,cluster_selection_epsilon,label_count,cost,davies_bouldin
18,18,5,26,33,13,0.04,4,0.0,1.986719
43,43,11,26,63,14,0.07,4,0.003065,1.987555
26,26,3,10,95,6,0.14,4,0.003065,1.985266
79,79,3,18,88,17,0.19,4,0.003065,1.975303
59,59,5,29,89,13,0.12,3,0.003065,1.652785
28,28,16,29,62,11,0.12,3,0.003065,1.65078
94,94,12,18,70,19,0.19,3,0.003065,1.65078
27,27,11,9,96,9,0.17,3,0.003065,1.642143
78,78,19,21,65,9,0.09,3,0.003065,1.644685
33,33,12,25,75,5,0.15,3,0.003065,1.644685
