In [5]:
!pip install --quiet pandas openpyxl bertopic[visualization] sentence-transformers umap-learn hdbscan spacy scikit-learn
!python -m spacy download en_core_web_sm

[0mCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import pandas as pd

# Read the most recent cleaned keyword export into a DataFrame.
df = pd.read_excel("/content/cleaned_keywords_for_bertopic_v2.xlsx")

# Replace missing values with empty strings, restricted to object-dtype (text)
# columns so that the remaining columns keep their dtype.
text_columns = df.select_dtypes(include='object').columns
df[text_columns] = df[text_columns].fillna('')


In [7]:
import spacy

# Load the small English spaCy pipeline used by `preprocess`.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Lower-case, lemmatize and filter one document for topic modelling.

    Args:
        text: Raw document text. Missing values (None / NaN / pd.NA) are
            treated as an empty document, and any other non-string scalar is
            converted with ``str`` instead of raising ``AttributeError``.

    Returns:
        str: The lemmas of all tokens that are purely alphabetic and not
        flagged as stop words by spaCy, joined by single spaces.
    """
    if not isinstance(text, str):
        # Spreadsheet cells are not guaranteed to be strings (empty or numeric
        # cells), and `.lower()` only exists on str.
        text = "" if pd.isna(text) else str(text)
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(tokens)

# Build the modelling corpus: pass every entry of the normalized combined-text
# column through `preprocess` and keep the result in its own column.
df["preprocessed_text"] = df["combined_text_normalized"].map(preprocess)

In [8]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN

# Fixed seed so that UMAP's stochastic reduction — and therefore the clusters
# and topics derived from it — is reproducible across Restart & Run All.
RANDOM_SEED = 42

# Embedding model: sentence-transformers checkpoint used to embed each document.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Vectorizer: unigrams + bigrams with English stop words removed.
# NOTE(review): BERTopic fits this vectorizer on one concatenated document per
# topic, so min_df=5 presumably means "term occurs in at least 5 topics" rather
# than in 5 source documents — confirm this is the intended threshold.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=5)

# UMAP: reduce the embeddings to 10 dimensions using cosine distance.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,  # without a seed every run yields different topics
)

# HDBSCAN: clusters of at least 10 documents, euclidean distance, "eom"
# cluster selection; prediction_data=True keeps the data needed for later
# predictions/probabilities.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    prediction_data=True,
    cluster_selection_method='eom'
)


# BERTopic setup: wire the custom components together and keep 10 words per topic.
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    top_n_words=10,
    calculate_probabilities=True,
    verbose=True
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Fit the topic model on the preprocessed documents. `fit_transform` returns the
# topic assignments and, because the model was built with
# calculate_probabilities=True, the accompanying probabilities.
docs = df["preprocessed_text"].tolist()
topics, probs = topic_model.fit_transform(docs)
# Store each document's assigned topic id next to its source row.
df["Topic"] = topics


2025-04-26 07:36:45,852 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2025-04-26 07:38:07,733 - BERTopic - Embedding - Completed ✓
2025-04-26 07:38:07,734 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-26 07:38:20,706 - BERTopic - Dimensionality - Completed ✓
2025-04-26 07:38:20,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-26 07:38:20,767 - BERTopic - Cluster - Completed ✓
2025-04-26 07:38:20,794 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-26 07:38:21,048 - BERTopic - Representation - Completed ✓


In [10]:
# Export the per-topic summary table produced by the fitted model.
topic_model.get_topic_info().to_csv(path_or_buf="topic_summary_final_v2.csv", index=False)

# Export the full document table, including the "Topic" assignment column.
df.to_excel(excel_writer="bertopic_clustered_output_final_v2.xlsx", index=False)


In [11]:
# Display BERTopic's built-in topic overview figure for the fitted model
# (last expression in the cell, so the returned figure is rendered as output).
topic_model.visualize_topics()


In [12]:
# Shorter alias for the citation-count column. `rename` ignores the mapping when
# the source column is absent, so re-running this cell is harmless; the result
# is re-assigned instead of mutating the frame in place.
df = df.rename(columns={"Times Cited, All Databases": "TC"})

# Number of most-cited documents to keep for every topic.
TOP_N_PER_TOPIC = 20

# Top documents by citation count per topic.
# Sorting by (Topic ascending, TC descending) and taking the head of every group
# yields the same rows, in the same order, as the former
# `groupby("Topic").apply(lambda g: g.nlargest(20, "TC"))`, but it does not rely
# on `apply` operating on the grouping column — pandas >= 2.2 warns about that
# and later versions exclude the grouping column, which would drop "Topic"
# from the exported sheet.
top20_df = (
    df.sort_values(["Topic", "TC"], ascending=[True, False])
    .groupby("Topic", sort=False)
    .head(TOP_N_PER_TOPIC)
    .reset_index(drop=True)
)
top20_df.to_excel("top20_per_topic_final.xlsx", index=False)





In [15]:
# Display the bar-chart figure of the fitted model, limited to 7 topics via
# top_n_topics (last expression in the cell, so the figure is rendered as output).
topic_model.visualize_barchart(top_n_topics=7)


In [16]:
# Display the heatmap figure of the fitted model, limited to 7 topics via
# top_n_topics (last expression in the cell, so the figure is rendered as output).
topic_model.visualize_heatmap(top_n_topics=7)

In [None]:
# `topics_over_time` is not created by any cell visible in this notebook, so on
# a fresh kernel this cell failed with NameError. Build it here when it is
# missing; an already-defined `topics_over_time` is used unchanged.
# NOTE(review): the timestamp column name is an assumption based on the
# Web of Science-style headers of this export — confirm it against the sheet.
TIMESTAMP_COLUMN = "Publication Year"

try:
    topics_over_time
except NameError:
    if TIMESTAMP_COLUMN not in df.columns:
        raise KeyError(
            f"Cannot compute topics over time: column {TIMESTAMP_COLUMN!r} not found in df; "
            "set TIMESTAMP_COLUMN to the column holding each document's date or year."
        ) from None
    # Must use the same `docs` the model was fitted on, with one timestamp per document.
    topics_over_time = topic_model.topics_over_time(docs, df[TIMESTAMP_COLUMN].tolist())

topic_model.visualize_topics_over_time(topics_over_time)
