# Install dependencies

In [1]:
!pip install datasets transformers sentence-transformers bertopic umap-learn hdbscan

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


# Load data

In [2]:
# Fetch the arXiv NLP dataset and keep its training split.
from datasets import load_dataset

data = load_dataset("maartengr/arxiv_nlp")
data_train = data["train"]

# Pull out the two text columns used in the rest of the notebook.
abstracts, titles = data_train["Abstracts"], data_train["Titles"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/617 [00:00<?, ?B/s]

data.csv:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Display the training split (its repr lists the features and the row count).
data_train

Dataset({
    features: ['Titles', 'Abstracts', 'Years', 'Categories'],
    num_rows: 44949
})

In [4]:
# Show the first abstract, followed by the total number of abstracts.
print(abstracts[0], len(abstracts), sep="\n")

  In this paper Arabic was investigated from the speech recognition problem
point of view. We propose a novel approach to build an Arabic Automated Speech
Recognition System (ASR). This system is based on the open source CMU Sphinx-4,
from the Carnegie Mellon University. CMU Sphinx is a large-vocabulary;
speaker-independent, continuous speech recognition system based on discrete
Hidden Markov Models (HMMs). We build a model using utilities from the
OpenSource CMU Sphinx. We will demonstrate the possible adaptability of this
system to Arabic voice recognition.

44949


# Create Embeddings

In [5]:

from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model, then encode every abstract with it,
# showing a progress bar while the batches are processed.
embedding_model = SentenceTransformer("thenlper/gte-small")
embeddings = embedding_model.encode(
    list(abstracts),
    show_progress_bar=True,
)

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [6]:
# Sanity check: print the shape of the embeddings array.
print(embeddings.shape)

(44949, 384)


# Reduce dimensions using UMAP

In [7]:
from umap import UMAP

# Configure the reducer (5 output components, cosine metric, fixed seed),
# then fit it on the embeddings and keep the reduced representation.
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
reduced_embedding = umap_model.fit_transform(embeddings)

  warn(


# Create Clusters

In [8]:
from hdbscan import HDBSCAN

# Configure the clusterer, then fit it on the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
)
hdbscan_model.fit(reduced_embedding)

clusters = hdbscan_model.labels_
# Number of distinct labels produced by the clusterer.
# NOTE(review): this presumably also counts the -1 outlier label - confirm.
print(len(set(clusters)))



155


# Topic Generation

In [9]:
# Topic models
from bertopic import BERTopic

# Assemble the topic model from the embedding, UMAP and HDBSCAN components
# defined in the earlier cells.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit on the abstracts, supplying the embeddings computed earlier.
topic_model.fit(abstracts, embeddings)

2025-08-03 03:27:17,590 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-03 03:28:11,281 - BERTopic - Dimensionality - Completed ✓
2025-08-03 03:28:11,283 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-03 03:28:13,512 - BERTopic - Cluster - Completed ✓
2025-08-03 03:28:13,525 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-03 03:28:18,583 - BERTopic - Representation - Completed ✓


In [10]:
# Overview table of the fitted topics (one row per topic).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14830,-1_the_of_and_to,"[the, of, and, to, in, we, for, language, that...",[ Word Representations form the core componen...
1,0,2203,0_question_qa_answer_questions,"[question, qa, answer, questions, answering, a...",[ With the development of deep learning techn...
2,1,1972,1_speech_asr_recognition_end,"[speech, asr, recognition, end, acoustic, spea...",[ End-to-end Speech Translation (ST) models h...
3,2,859,2_hate_offensive_speech_detection,"[hate, offensive, speech, detection, toxic, so...",[ With growing role of social media in shapin...
4,3,848,3_summarization_summaries_summary_abstractive,"[summarization, summaries, summary, abstractiv...",[ Pre-trained neural abstractive summarizatio...
...,...,...,...,...,...
150,149,53,149_counseling_mental_therapy_health,"[counseling, mental, therapy, health, psychoth...",[ Mental health care poses an increasingly se...
151,150,53,150_mixed_code_sentiment_mixing,"[mixed, code, sentiment, mixing, english, anal...",[ The usage of more than one language in the ...
152,151,53,151_prompt_prompts_optimization_prompting,"[prompt, prompts, optimization, prompting, llm...",[ Prompt optimization aims to find the best p...
153,152,50,152_long_context_window_length,"[long, context, window, length, llms, contexts...",[ Extending the context window of large langu...


In [11]:
# Render the topic visualization produced by the fitted model.
topic_model.visualize_topics()

In [12]:
# Render the bar-chart visualization produced by the fitted model.
topic_model.visualize_barchart()

In [13]:
from copy import deepcopy
# Take a deep copy of the current topic representations so they can be
# compared with the model's representations after later update_topics calls.
original_topics = deepcopy(topic_model.topic_representations_)

In [14]:
import pandas as pd

def topic_difference(model, original_topics, nr_topics=5, nr_words=5):
  """Show the difference in top words between two topic representations.

  Args:
    model: Topic model exposing ``get_topic(topic)``; the returned
      representation is a sequence of entries whose first element is the
      word (e.g. ``(word, score)`` tuples).
    original_topics: Mapping from topic id to the earlier representation,
      in the same entry format.
    nr_topics: Number of topics to compare (topics ``0 .. nr_topics - 1``).
    nr_words: Number of leading words shown per topic.

  Returns:
    A DataFrame with the columns ``Topic``, ``Original`` and ``Updated``,
    one row per topic; the words of each representation are joined
    with ``" | "``.
  """
  def top_words(representation):
    # Only the first element of every entry (the word) is displayed.
    # An empty representation yields an empty string instead of failing.
    return " | ".join(entry[0] for entry in representation[:nr_words])

  # Collect all rows first and build the frame once, rather than growing
  # the DataFrame one row at a time.
  rows = [
    (topic, top_words(original_topics[topic]), top_words(model.get_topic(topic)))
    for topic in range(nr_topics)
  ]
  return pd.DataFrame(rows, columns=["Topic", "Original", "Updated"])

# Change topic representation

In [15]:
from bertopic.representation import KeyBERTInspired

# Recompute the topic representations with the KeyBERT-inspired model.
representation_model = KeyBERTInspired()
topic_model.update_topics(
    abstracts,
    representation_model=representation_model,
)

# Compare the updated top words with the saved original representations.
topic_difference(topic_model, original_topics)

Unnamed: 0,Topic,Original,Updated
0,0,question | qa | answer | questions | answering,questions | answering | comprehension | questi...
1,1,speech | asr | recognition | end | acoustic,translation | speech | transcription | phoneti...
2,2,hate | offensive | speech | detection | toxic,hate | hateful | language | classifiers | twitter
3,3,summarization | summaries | summary | abstract...,summarization | summarizers | summaries | summ...
4,4,gender | bias | biases | debiasing | fairness,gender | gendered | bias | biases | biased


# Use LLMs to generate topics

In [16]:
from transformers import pipeline
from bertopic.representation import TextGeneration

# Prompt template; the [DOCUMENTS] and [KEYWORDS] placeholders are part of
# the text handed to the TextGeneration representation model.
prompt = """
I have a topic that contains following documents
[DOCUMENTS]
The topic is described by following keywords: '[KEYWORDS]'
Based on the documents and keywords, what is the topic about?
"""

# Text-to-text generation pipeline used to produce the topic labels.
llm = pipeline("text2text-generation", model="google/flan-t5-small")

representation_model = TextGeneration(
    llm,
    prompt=prompt,
    doc_length=50,
    tokenizer="whitespace",
)
topic_model.update_topics(
    abstracts,
    representation_model=representation_model,
)

# Compare the generated labels with the saved original representations.
topic_difference(topic_model, original_topics)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  6%|▋         | 10/155 [00:02<00:24,  5.88it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 155/155 [00:54<00:00,  2.83it/s]


Unnamed: 0,Topic,Original,Updated
0,0,question | qa | answer | questions | answering,Question answering | | | |
1,1,speech | asr | recognition | end | acoustic,Speech-to-speech comparison metric | | | |
2,2,hate | offensive | speech | detection | toxic,hate speech | | | |
3,3,summarization | summaries | summary | abstract...,Document summarization | | | |
4,4,gender | bias | biases | debiasing | fairness,Gender bias in artificial intelligence and nat...
