Grid search code to optimise clustering_analysis.py hyperparameters

In [None]:
import itertools
import os
import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
import hdbscan
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import sys
sys.path.append("../src")
from utils.stopwords import CUSTOM_STOPWORDS

In [None]:
# Load the pre-chunked corpus. JSON is UTF-8 by specification, so the encoding
# is given explicitly rather than relying on the platform's locale default.
with open("../data/chunks.json", encoding="utf-8") as f:
    data = json.load(f)

# Each entry is read through its "text" key; only that text is clustered.
texts = [entry["text"] for entry in data]

# Make sure the output folder exists (makedirs also creates the parent
# "../models" directory when it is missing).
os.makedirs("../models/hp_optimisation", exist_ok=True)

In [None]:
# Only keep tokens made up entirely of ASCII letters: purely numeric tokens and
# mixed alphanumeric tokens such as "abc123" are dropped, while single letters
# are still allowed. `[a-zA-Z]+` matches exactly the same tokens as the longer
# `[a-zA-Z]*[a-zA-Z][a-zA-Z]*` spelling, without its redundant backtracking.
token_pattern = r"(?u)\b[a-zA-Z]+\b"

custom_vectorizer = CountVectorizer(
    stop_words=CUSTOM_STOPWORDS,  # Project stopwords imported from utils.stopwords
    ngram_range=(1, 2),           # Include unigrams and bigrams
    token_pattern=token_pattern,
)

In [None]:
# Sentence encoder used to turn each text chunk into a dense vector.
# "all-MiniLM-L6-v2" is a lightweight sentence-transformers model hosted on
# Hugging Face: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# Background reading on dense vectors:
# https://medium.com/@yasindusanjeewa8/dense-vectors-in-natural-language-processing-06818dff5cd7
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# ------------------------------------------------------------
# Define Grid Search Parameters
# ------------------------------------------------------------
# Seed handed to UMAP so repeated runs are comparable.
random_state = 42

# Candidate values; every (n_neighbors, min_cluster_size) pair is trained.
n_neighbors_grid = [5, 10, 15]                 # UMAP n_neighbors
min_cluster_size_grid = [10, 15, 20, 25, 30]   # HDBSCAN min_cluster_size

# Folder that receives one saved model per combination.
model_dir = "../models/hp_optimisation/"
os.makedirs(model_dir, exist_ok=True)

In [None]:
# ------------------------------------------------------------
# Grid Search Loop
# ------------------------------------------------------------
# Trains and saves one BERTopic model per (n_neighbors, min_cluster_size)
# combination and records how many topics each one finds.

# Sentence embeddings depend only on the texts and the embedding model, not on
# the UMAP/HDBSCAN settings being searched, so encode the corpus once and reuse
# the vectors for every combination instead of re-encoding inside the loop.
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Single date stamp shared by every model saved in this run, so the file names
# stay consistent even if the search runs past midnight.
timestamp = datetime.datetime.now().strftime("%Y%m%d")

results = []

for n_neighbors, min_cluster_size in itertools.product(n_neighbors_grid, min_cluster_size_grid):
    print(f"Training model with n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}")

    # Dimensionality reduction: only n_neighbors varies across the grid.
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=random_state,
    )

    # Clustering on the reduced vectors: only min_cluster_size varies.
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    # The embedding model is still attached so the saved model can embed new
    # documents later, even though fitting uses the precomputed vectors.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=custom_vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
        verbose=True,
    )

    topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)

    # BERTopic assigns outlier documents to topic -1; it is not a real topic,
    # so it is excluded from the count.
    n_topics = len({t for t in topics if t != -1})

    model_name = f"bertopic_neighbors{n_neighbors}_cluster{min_cluster_size}_{timestamp}"
    topic_model.save(os.path.join(model_dir, model_name))

    results.append({
        "model_name": model_name,
        "n_neighbors": n_neighbors,
        "min_cluster_size": min_cluster_size,
        "n_topics": n_topics,
    })

    print(f"Saved {model_name} with {n_topics} topics.")

In [None]:
# ------------------------------------------------------------
# Display Grid Search Results
# ------------------------------------------------------------
# One row per trained model, ordered from most to fewest topics and
# renumbered from 0 after sorting.
results_df = (
    pd.DataFrame(results)
    .sort_values("n_topics", ascending=False)
    .reset_index(drop=True)
)

# Bare expression so the notebook renders the table.
results_df

In [None]:
# ------------------------------------------------------------
# Plot Results as Heatmap
# ------------------------------------------------------------
# Reshape to a grid: rows are n_neighbors, columns are min_cluster_size,
# and each cell holds the number of topics found for that pair.
pivot_table = results_df.pivot(
    index="n_neighbors",
    columns="min_cluster_size",
    values="n_topics",
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Number of Topics Found for Each Hyperparameter Combination")
ax.set_xlabel("min_cluster_size")
ax.set_ylabel("n_neighbors")
fig.tight_layout()
plt.show()



In [None]:
# ------------------------------------------------
# More fine-tuned scan around promising min_cluster_size
# values and exploring min_dist settings
# ------------------------------------------------
from itertools import product

# Parameters to scan; n_neighbors is held fixed for this pass.
min_cluster_sizes = [25, 28, 30]
umap_min_dists = [0.0, 0.1, 0.2]
n_neighbors = 10

# Sentence embeddings do not depend on the UMAP/HDBSCAN settings, so encode
# the corpus once here and reuse the vectors for every combination rather
# than re-encoding inside the loop.
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Single date stamp shared by every model saved in this scan.
timestamp = datetime.datetime.now().strftime("%Y%m%d")

# Create an empty list to collect results
results = []

# Loop through combinations
for min_cluster_size, umap_min_dist in product(min_cluster_sizes, umap_min_dists):
    print(f"Training model with min_cluster_size={min_cluster_size}, umap_min_dist={umap_min_dist}")

    # Define UMAP and HDBSCAN models. The seed comes from the shared
    # `random_state` setting so both scans use the same value.
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=umap_min_dist,
        metric="cosine",
        random_state=random_state,
    )

    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    # Train BERTopic. The embedding model stays attached so the saved model
    # can embed new documents later; fitting uses the precomputed vectors.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=custom_vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
        verbose=True,
    )

    topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)

    # BERTopic assigns outlier documents to topic -1; exclude it from the count.
    n_topics = len({t for t in topics if t != -1})

    model_name = f"bertopic_neighbors{n_neighbors}_cluster{min_cluster_size}_dist{umap_min_dist}_{timestamp}"
    topic_model.save(os.path.join(model_dir, model_name))

    # Record results, including the saved model's name so each row can be
    # traced back to its file on disk.
    results.append({
        "model_name": model_name,
        "min_cluster_size": min_cluster_size,
        "umap_min_dist": umap_min_dist,
        "n_neighbors": n_neighbors,
        "n_topics": n_topics,
    })

    print(f"Saved {model_name} with {n_topics} topics.")


In [None]:
# Collect the scan results into a table, one row per trained model.
results_df = pd.DataFrame(results)

# Order rows by cluster size, then by min_dist, and renumber from 0.
sort_columns = ["min_cluster_size", "umap_min_dist"]
results_df = results_df.sort_values(by=sort_columns)
results_df = results_df.reset_index(drop=True)

# Bare expression so the notebook renders the table.
results_df

In [None]:
# ------------------------------------------------------------
# Plot Results as Heatmap
# ------------------------------------------------------------
# Reshape to a grid: rows are umap_min_dist, columns are min_cluster_size,
# and each cell holds the number of topics found for that pair.
pivot_table = results_df.pivot(
    index="umap_min_dist",
    columns="min_cluster_size",
    values="n_topics",
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Number of Topics Found for Each Hyperparameter Combination")
ax.set_xlabel("min_cluster_size")
ax.set_ylabel("min_dist")
fig.tight_layout()
plt.show()