In [1]:
from llm_cgr import load_json, save_json
from datetime import datetime

In [2]:
# base path that every data file path in this notebook is built from
# NOTE(review): this name shadows the built-in `dir()` for the rest of the notebook
dir = "../data/stackexchange"

## **1.** Query embeddings for the n-grams

In [3]:
# read the n-gram data; its keys are the n-grams themselves
ngrams_data = load_json(file_path=f"{dir}/ngrams_2025-07-04.json")
ngrams = [*ngrams_data]

In [4]:
# get embeddings for the n-grams

from sentence_transformers import SentenceTransformer
import numpy as np

# load the sentence-embedding model by name
model = SentenceTransformer("all-MiniLM-L6-v2")

# encode every n-gram; the result is requested as a NumPy array and a
# progress bar is shown while encoding
embeddings = model.encode(
    sentences=ngrams,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1114 [00:00<?, ?it/s]

In [5]:
# scale every row to unit L2 norm, so cosine similarity reduces to a dot product

embeddings = np.divide(
    embeddings,
    np.linalg.norm(embeddings, axis=1, keepdims=True),
)

In [6]:
# pair each n-gram with its embedding, converted from an array to a plain list

ngram_embeddings = {
    ngram: vector.tolist() for ngram, vector in zip(ngrams, embeddings)
}

In [None]:
# save the n-gram embeddings data (saved locally, but too large to upload to GitHub)
# NOTE(review): the file name carries today's date, whereas a later cell loads
# `ngram_embeddings_2025-07-04.json`; a run on any other day writes a file that
# the later cell does not read

save_json(
    data=ngram_embeddings,
    file_path=f"{dir}/ngram_embeddings_{datetime.now().date()}.json",
)

## **2.** Define a method to cluster the ngrams

* The method takes a list of n-grams, prints the resulting clusters, and returns them ranked by popularity.
* This supports an iterative process for deciding which n-grams to include.

In [8]:
# load the n-grams, with their titles and embeddings
# (`ngram_embeddings` is rebound here to the data read back from disk)
ngram_titles = load_json(
    file_path=f"{dir}/ngrams_2025-07-04.json",
)
ngram_embeddings = load_json(
    file_path=f"{dir}/ngram_embeddings_2025-07-04.json",
)

In [27]:
# method to take list of n-grams and cluster them based on their embeddings

from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict


def cluster_ngrams(
    ngrams: list[str],
    distance_threshold: float = 0.5,  # e.g., merge if cosine distance < 0.5 (i.e. similarity > 0.5)
) -> dict[int, dict]:
    """
    Cluster n-grams by their embeddings and rank the clusters by popularity.

    Embeddings are looked up in the notebook-level `ngram_embeddings` mapping and
    titles in the notebook-level `ngram_titles` mapping. Clusters are formed with
    average-linkage agglomerative clustering on cosine distance. The number of
    clusters and every ranked cluster are printed as a side effect.

    Args:
        ngrams: The n-grams to cluster; each must be a key of `ngram_embeddings`
            and of `ngram_titles`.
        distance_threshold: Linkage distance threshold handed to
            `AgglomerativeClustering`; clusters at or above it are not merged.

    Returns:
        A dict keyed by 1-based popularity rank. Each value is a dict with:
        "class" (always None, a placeholder to be filled in later),
        "terms" (the sorted n-grams of the cluster) and
        "questions" (the sorted union of the `ngram_titles` entries of those n-grams).
        An empty `ngrams` list gives an empty dict.

    Raises:
        KeyError: If an n-gram is missing from `ngram_embeddings` or `ngram_titles`.
    """
    embeddings = [ngram_embeddings[ng] for ng in ngrams]

    # compute clusters
    if len(ngrams) < 2:
        # AgglomerativeClustering rejects fewer than two samples, so an empty
        # input has no clusters and a single n-gram forms its own cluster
        labels = [0] * len(ngrams)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            metric="cosine",
            linkage="average",
            distance_threshold=distance_threshold,
        )
        labels = clustering.fit_predict(embeddings)
    print(f"Number of clusters formed: {len(set(labels))}\n")

    # map the cluster labels to the n-grams
    cluster_to_terms = defaultdict(list)
    for term, lbl in zip(ngrams, labels):
        cluster_to_terms[lbl].append(term)

    # compute cluster "popularity" = number of unique titles across its terms
    cluster_to_questions = {}
    for lbl, terms in cluster_to_terms.items():
        _title_ids = set()
        for term in terms:
            _title_ids.update(ngram_titles[term])
        cluster_to_questions[lbl] = _title_ids

    # sort clusters by popularity descending
    sorted_clusters = sorted(
        cluster_to_terms.keys(),
        key=lambda x: len(cluster_to_questions[x]),
        reverse=True,
    )

    # display results and build the ranked output
    clusters = {}
    for rank, lbl in enumerate(sorted_clusters, start=1):
        print(
            f"{rank}. Cluster {lbl} ({len(cluster_to_questions[lbl])} titles, {len(cluster_to_terms[lbl])} n-grams):"
        )
        print("  ", cluster_to_terms[lbl])
        clusters[rank] = {
            "class": None,  # placeholder for class
            "terms": sorted(cluster_to_terms[lbl]),
            "questions": sorted(cluster_to_questions[lbl]),
        }

    return clusters

## **3.** Determine which ngrams to use for clustering

In [17]:
# keep just the n-grams (the keys of the titles mapping) as a list
ngrams = list(ngram_titles)

In [18]:
# manually defined lists of terms to filter out or allow

# technical nouns that affect the clustering too much
# NOTE(review): `is_descriptive` compares each entry against single token texts,
# so multi-word entries (e.g. "visual basic") presumably never match as a whole — confirm
# fmt: off
DENY_LIST = [
    # generic terms
    "library", "framework", "libraries", "code", "image",
    "images", "file", "files", "online", "audio",
    "graphics", "video", "interactive", "markup", "tool",
    "software", "app", "application", "reference", "data",
    "compatible", "compatibility", "studio", "visual",
    # programming languages
    "python", "c++", "c", "java", "c#",
    "javascript", "golang", "visual basic", "pascal", "fortran",
    "ada", "sql", "perl", "r", "php",
    "scratch", "matlab", "rust", "assembly", "cobol",
    "classic visual basic", "prolog", "kotlin", "ruby", "swift",
    "sas", "lisp", "dart", "haskell", "lua",
    "scala", "vbscript", "julia", "objective-c", "foxpro",
    "gams", "typescript", "pl/sql", "abap", "v",
    "solidity", "d", "bash", "powershell", "ml",
    "elixir", "awk", "x++", "labview", "erlang",
    "js", "c/c++",  # language related terms
    # file types
    "pdf", "html", "css", "markdown", "json",
    # technologies and platforms
    "android", ".net", "windows", "database", "key",
    "flask", "spring", "laravel", "django", "react",
    "angular", "vue", "node.js",
]
# fmt: on

# technical adjectives that are missed by the word-type tagging
# (a token whose text is in this list counts as descriptive in `is_descriptive`,
# whatever part-of-speech tag it was given)
ALLOW_LIST = [
    "open",
    "source",
    "lightweight",
    "userfriendly",
    "friendly",
    "new",
    "alternative",
    "efficient",
    "modern",
]

In [19]:
# download the model with `python -m spacy download en_core_web_sm` and then load it

import spacy

# the "parser" and "ner" pipeline components are disabled on load;
# `is_descriptive` only reads each token's text and part-of-speech tag
nlp = spacy.load(
    name="en_core_web_sm",
    disable=["parser", "ner"],
)

In [28]:
def is_descriptive(ngram: str) -> bool:
    """
    Decide whether an n-gram should be kept for clustering.

    The n-gram is tokenised and tagged with the notebook-level `nlp` pipeline.
    It is rejected if any token's text is in `DENY_LIST` or any token is tagged
    as a proper noun. Otherwise it is accepted only if at least one token is
    tagged as an adjective or adverb, or has its text in `ALLOW_LIST`.

    Args:
        ngram: The n-gram text to check.

    Returns:
        True if the n-gram passes both checks, False otherwise.
    """
    tokens = list(nlp(text=ngram))

    # a single denied term or proper noun rules out the whole n-gram
    if any(tok.text in DENY_LIST or tok.pos_ == "PROPN" for tok in tokens):
        return False

    # otherwise at least one descriptive token is required
    return any(
        tok.pos_ in ("ADJ", "ADV") or tok.text in ALLOW_LIST for tok in tokens
    )


# keep only the descriptive n-grams, then cluster them
_ngrams = list(filter(is_descriptive, ngrams))

cluster_data = cluster_ngrams(ngrams=_ngrams, distance_threshold=0.5)

Number of clusters formed: 1418

1. Cluster 1345 (174 titles, 6 n-grams):
   ['big source', 'comments source', 'source', 'source help', 'source paid', 'source works']
2. Cluster 167 (149 titles, 21 n-grams):
   ['best open', 'deep learning open', 'development open', 'fast open', 'free open', 'good open', 'learning open', 'like open', 'looking good open', 'looking open', 'need good open', 'need open', 'need suggestion open', 'open', 'open ended quot', 'open read', 'recommended open', 'running open', 'seeking open', 'suggestion open', 'use open']
3. Cluster 12 (145 titles, 68 n-grams):
   ['based open source', 'best open source', 'closed source', 'closed source project', 'commercial open source', 'conform open source', 'development open source', 'easy learn freeware', 'fast open source', 'flexible open source', 'free open source', 'free softwares', 'free softwares make', 'good open source', 'independent open source', 'learning open source', 'light open source', 'like open source', 'list 

In [30]:
# save the clustered n-grams data, keeping only clusters with more than 20 questions
# NOTE(review): the file name carries today's date, whereas a later cell loads
# `clusters_2025-07-06.json` — confirm which file is the intended input there

save_json(
    data={k: v for k, v in cluster_data.items() if len(v["questions"]) > 20},
    file_path=f"{dir}/clusters_{datetime.now().date()}.json",
)

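## **4.** Classify the clusters

The saved clusters are classified outside this notebook by filling in the `class` field of each cluster in the JSON file; clusters whose `class` is left as `null` are ignored below.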



In [None]:
# load the classified clusters data
# (`cluster_data` is rebound here to the contents of a fixed, dated file;
# presumably that file has had its "class" fields filled in — TODO confirm)

cluster_data = load_json(
    file_path=f"{dir}/clusters_2025-07-06.json",
)

In [None]:
# combine counts from clusters with the same class

# maps each class to the set of questions from every cluster given that class
class_questions = defaultdict(set)

# iterate over the values only: binding `_` at notebook top level would shadow
# IPython's last-output variable
for _data in cluster_data.values():
    # clusters without an assigned class are skipped
    if _data["class"] is not None:
        class_questions[_data["class"]].update(_data["questions"])

In [36]:
# print every class with its question count, largest class first

_sorted_classes = sorted(
    class_questions,
    key=lambda name: -len(class_questions[name]),
)

print("Classifications for prompts:")
for _class in _sorted_classes:
    print(f"{_class} ({len(class_questions[_class])} questions)")

Classifications for prompts:
open source (184 questions)
alternative (149 questions)
best (93 questions)
simple (77 questions)
easy (56 questions)
fast (56 questions)
lightweight (30 questions)
modern (22 questions)
