# Topic Modelling Notebook

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from bertopic import BERTopic
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sklearn.metrics import davies_bouldin_score


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
import pandas as pd

# Path to the .npz archive with the precomputed embeddings; fill this in before running.
path = r""

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only point this at an archive from a trusted source.
data = np.load(path, allow_pickle=True)

# Quick sanity check: the shape of every array stored in the archive.
print({name: data[name].shape for name in data.files})

# One row per document: emb_0, emb_1, ... columns followed by the text and its label.
embeddings = (
    pd.DataFrame(data["embeddings"])
    .add_prefix("emb_")
    .assign(text=data["texts"], label=data["labels"])
)

print(embeddings.shape)
print(embeddings.head())


{'embeddings': (11200, 384), 'texts': (11200,), 'labels': (11200,)}
(11200, 386)
      emb_0     emb_1     emb_2     emb_3     emb_4     emb_5     emb_6  \
0 -0.065954  0.129986 -0.027290  0.022978 -0.060881  0.020859 -0.097683   
1 -0.002379  0.061337  0.029013  0.004404 -0.013388 -0.011926 -0.017382   
2 -0.020936  0.029716  0.003486  0.019713 -0.003231  0.021080 -0.052675   
3 -0.100052  0.001816 -0.010475 -0.046205  0.034331 -0.054387  0.107253   
4 -0.011559 -0.028765  0.057984  0.075947  0.047227 -0.019874  0.011059   

      emb_7     emb_8     emb_9  ...   emb_376   emb_377   emb_378   emb_379  \
0  0.023718 -0.103246 -0.029989  ... -0.049650 -0.002975  0.014075 -0.105049   
1  0.044410 -0.059244 -0.088396  ... -0.055617  0.069392  0.007593  0.106782   
2  0.079946 -0.039729  0.032833  ... -0.047487  0.079180 -0.075348  0.085130   
3  0.026428  0.038723 -0.113006  ... -0.057479  0.006368 -0.001778  0.022304   
4  0.044840 -0.066399 -0.011631  ... -0.051420 -0.055166  0.038959  

In [17]:
# Pull the three arrays out of the loaded archive (embeddings: shape (11200, 384)).
texts, labels, emb_matrix = (data[key] for key in ("texts", "labels", "embeddings"))

# One row per document; each embedding vector is stored as a plain Python list
# in a single "embeddings" column.
train_df = pd.DataFrame(
    {
        "text": texts,
        "label": labels,
        "embeddings": emb_matrix.tolist(),
    }
)

train_df

Unnamed: 0,text,label,embeddings
0,i was wondering if anyone out there could enli...,rec.autos,"[-0.06595448404550552, 0.1299862265586853, -0...."
1,a fair number of brave souls who upgraded thei...,comp.sys.mac.hardware,"[-0.0023794814478605986, 0.06133740022778511, ..."
2,well folks my mac plus finally gave up the gho...,comp.sys.mac.hardware,"[-0.020935537293553352, 0.029716189950704575, ..."
3,robert jc kyanko wrote do you have weiteks add...,comp.graphics,"[-0.10005185008049011, 0.0018155976431444287, ..."
4,from article by tom a baker my understanding i...,sci.space,"[-0.011558858677744865, -0.0287648793309927, 0..."
...,...,...,...
11195,dn from david nye dn a neurology dn consultati...,sci.med,"[-0.08230961114168167, -0.004875315818935633, ..."
11196,i have a very old mac 512k and a mac plus both...,comp.sys.mac.hardware,"[0.03424368426203728, -0.012400401756167412, 0..."
11197,i just installed a dx266 cpu in a clone mother...,comp.sys.ibm.pc.hardware,"[-0.02444905787706375, -0.01412092987447977, 0..."
11198,in article edward bolson writes wouldnt this r...,comp.graphics,"[0.04212794080376625, -0.06897043436765671, -0..."


# Topic Modelling

## Init topic model

In [9]:
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
SEED = 42
from bertopic import BERTopic

# Custom words to drop from the topic representation (matched case-insensitively).
CUSTOM_STOPWORDS = []


def custom_preprocessor(text):
    """Lowercase ``text`` and blank out every word listed in CUSTOM_STOPWORDS.

    Stopwords are matched as whole words only, so removing "art" does not
    mangle "article". Matching is case-insensitive because both the text and
    the stopword are lowercased. CUSTOM_STOPWORDS is read on every call, so
    later edits to the list take effect without redefining this function.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lowercased text with each stopword occurrence replaced by one space.
    """
    text = text.lower()
    for w in CUSTOM_STOPWORDS:
        word = w.lower()
        if not word:
            # An empty entry would otherwise match between every character.
            continue
        # Lookarounds instead of \b so stopwords that start or end with a
        # non-word character (e.g. "c++") are still matched as whole tokens.
        pattern = rf"(?<!\w){re.escape(word)}(?!\w)"
        text = re.sub(pattern, " ", text)
    return text

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=SEED         # use the shared constant so the seed printed below is the one actually used
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)


# Bag-of-words model used to build the per-topic term representation.
vectorizer_model = CountVectorizer(
    stop_words="english",     # remove stopwords for topic representation
    preprocessor=custom_preprocessor,
    ngram_range=(1, 1),       # unigrams only; use (1, 2) to add bigrams
    min_df=1,                 # keep every term (raise to drop very rare terms)
    max_df=1.0                # no upper document-frequency cutoff
)

topic_model = BERTopic(
    embedding_model=None,             # embeddings are precomputed and passed to fit_transform
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    low_memory=True,                  # NOTE(review): presumably ignored when a custom umap_model is supplied -- confirm
    calculate_probabilities=True,
    verbose=True,
    nr_topics=None,                   # keep all topics (no reduction)
)

print("BERTopic initialized with fixed random seed:", SEED)

BERTopic initialized with fixed random seed: 42


In [10]:
# Documents and their precomputed embeddings come from the same frame, so rows stay aligned.
docs = embeddings["text"].tolist()

# Pick the embedding columns by name instead of by `shape[1] - 2` arithmetic, and keep
# the matrix under its own name: overwriting the `embeddings` DataFrame with an ndarray
# made this cell fail on a second run (`embeddings["text"]` no longer existed).
embedding_cols = [col for col in embeddings.columns if str(col).startswith("emb_")]
doc_embeddings = embeddings[embedding_cols].to_numpy()

topics, probs = topic_model.fit_transform(docs, doc_embeddings)
print("Unique topics (excl. -1 outliers):", sorted(t for t in set(topics) if t != -1))


2025-10-20 14:30:29,856 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-20 14:31:00,074 - BERTopic - Dimensionality - Completed ✓
2025-10-20 14:31:00,076 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-20 14:31:07,792 - BERTopic - Cluster - Completed ✓
2025-10-20 14:31:07,792 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-20 14:31:09,542 - BERTopic - Representation - Completed ✓


Unique topics (excl. -1 outliers): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144]


In [None]:
# Topic id assigned to each document (-1 marks outliers).
train_df["Topic"] = topics

# Highest topic probability per document. The original per-row `p is not None` check
# could never trigger (a None `probs` fails on iteration), so guard `probs` itself and
# take the row-wise maximum in one vectorised call. A 1-D `probs` is stored as-is.
train_df["Topic_Probability"] = (
    None if probs is None
    else np.max(probs, axis=1) if np.ndim(probs) == 2
    else probs
)
train_df

In [11]:
# Per-topic summary table from the fitted model; preview the first five rows.
topic_info = topic_model.get_topic_info()
topic_info.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4230,-1_writes_article_like_just,"[writes, article, like, just, use, dont, db, s...",[archivename spacegroups lastmodified date 930...
1,0,1123,0_team_game_season_games,"[team, game, season, games, hockey, play, play...",[the flyers closed out the season last night w...
2,1,450,1_key_encryption_clipper_chip,"[key, encryption, clipper, chip, privacy, keys...",[in article clipper chip announcement writes h...
3,2,316,2_israel_israeli_jews_arab,"[israel, israeli, jews, arab, arabs, lebanese,...",[in article tim clock writes in article brad h...
4,3,285,3_gun_guns_firearms_militia,"[gun, guns, firearms, militia, crime, firearm,...",[jim de arras wrote as a favorite sure as leth...


In [12]:
# Bar chart of the top terms per topic.
# NOTE(review): the last recorded run failed with "Mime type rendering requires
# nbformat>=4.2.0 but it is not installed" -- install/upgrade nbformat in the kernel
# environment before re-running this cell.
topic_model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Topic overview plot from the fitted model.
# NOTE(review): presumably needs the same notebook rendering dependency (nbformat)
# as the bar chart cell -- confirm it renders in this environment.
topic_model.visualize_topics()

## Use Gemini to label topics 

In [None]:
import os
from dotenv import load_dotenv
import sys
from pathlib import Path

# Load .env from one directory up
load_dotenv(dotenv_path="../.env")

# Read the key; os.getenv returns None when the variable is not set.
api_key = os.getenv("GEMINI_API_KEY")

# Report only whether the key was found. Slicing `api_key[:6]` raised a TypeError when
# the key was missing (before "MISSING" could be printed), and echoing part of a secret
# into saved cell output leaks it to anyone who sees the notebook.
print("GEMINI_API_KEY:", "loaded" if api_key else "MISSING")


In [None]:
# Ask the labelling helper for a name per topic, then attach the result to the
# topic summary table (left join keeps every topic even if it received no label).
# NOTE(review): `label_topics_from_df` is not defined or imported anywhere in the visible
# notebook -- on a fresh kernel this cell raises NameError. Define or import it above.
# A later cell reads the columns "Topic", "tuned_topic_name" and "short_explanation"
# from `labels_df`, so the helper is assumed to return those -- confirm.
labels_df = label_topics_from_df(topic_info, api_key=api_key) 
final_df = topic_info.merge(labels_df, on="Topic", how="left")
display(final_df.head())

In [None]:
# Join the tuned topic name and explanation back onto every document via its topic id.
label_cols = ["tuned_topic_name", "short_explanation"]
train_df = (
    train_df
    # Drop label columns left over from a previous run of this cell; without this,
    # re-running the merge produces duplicated *_x / *_y columns.
    .drop(columns=label_cols, errors="ignore")
    .merge(labels_df[["Topic", *label_cols]], on="Topic", how="left")
)
train_df

# Testing on the test set

In [None]:
# TODO: load the topic model and the test embeddings

