# BERTopic + Llama for Topic Modeling on Crunchbase company descriptions
A straightforward implementation of the topic modeling code in the repo.
## Install dependencies

In [None]:
!pip install --no-cache-dir scikit-learn==1.5.0
!pip install --no-cache-dir cudf-cu12==24.6.0 dask-cudf-cu12==24.6.0 --extra-index-url=https://pypi.nvidia.com
!pip install --no-cache-dir cuml-cu12==24.6.0 --extra-index-url=https://pypi.nvidia.com
!pip install --no-cache-dir cugraph-cu12==24.6.0 --extra-index-url=https://pypi.nvidia.com
!pip install --no-cache-dir cupy-cuda12x==13.1.0 -f https://pip.cupy.dev/aarch64

!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
!pip install bertopic

!git clone https://github.com/TutteInstitute/datamapplot.git
!pip install datamapplot/.

!wget https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf

In [None]:
from llama_cpp import Llama

# Load the 4-bit quantized GGUF checkpoint through llama.cpp.
# n_gpu_layers=-1 asks llama-cpp-python to offload every layer to the GPU;
# n_ctx sets the context window in tokens.
# NOTE(review): `stop` is handed to the constructor here. llama-cpp-python
# documents `stop` as an argument of the generation call, so confirm that it
# actually takes effect when given at construction time.
llm = Llama(
    model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,
    stop=["Q:", "\n"],
)

In [None]:
from bertopic.representation import KeyBERTInspired, LlamaCPP

# Prompt template for the LLM-based topic representation. [DOCUMENTS] and
# [KEYWORDS] are placeholder tags (presumably substituted by BERTopic with
# each topic's representative documents and keywords -- confirm in its docs).
prompt = """ Q:
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you generate one to three classes of typical events in the given setting?
A:
"""

# Two named representations per topic: a KeyBERT-inspired keyword view and
# an LLM-generated description driven by the prompt above.
representation_model = dict(
    KeyBERT=KeyBERTInspired(),
    LLM=LlamaCPP(llm, prompt=prompt),
)

## Read data

In [None]:
# Project root (the path suggests a mounted Google Drive -- adjust when
# running elsewhere).
base_dir = "/content/drive/MyDrive/Potsdam/ClassMining"

# Sub-directory suffixes, each with its leading slash, meant to be
# concatenated directly onto `base_dir`.
data, models = "/data", "/models"

In [None]:
import pandas as pd

# Load only the `description` column of the organizations CSV as a list.
organizations_filepath = f"{base_dir}{data}/organization_descriptions.csv"
descriptions = pd.read_csv(organizations_filepath, usecols=['description'])['description'].to_list()

print(f"Descriptions before cleaning: {len(descriptions)}")

# Drop null entries and empty strings, and coerce every remaining element to
# `str`, in a single pass over the raw descriptions.
documents = [
    str(text)
    for text in descriptions
    if text != '' and pd.notna(text)
]

# Report the size of the cleaned list (`documents`), not of the raw input,
# so the before/after counts are actually comparable.
print(f"Descriptions after cleaning: {len(documents)}")

## Embeddings

In [None]:
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
from sentence_transformers import SentenceTransformer

# Encode all documents once up front; the resulting embeddings are passed to
# BERTopic explicitly when the topic model is fitted.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(
    documents,
    show_progress_bar=True,
)

## Model

In [None]:
# Dimensionality reduction: project the embeddings down to 5 components
# using cosine distance, with a fixed seed.
umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    metric="cosine",
    random_state=42,
    verbose=True,
)

# Clustering: only groups of at least 400 documents become a cluster;
# prediction data is kept on the fitted model.
hdbscan_model = HDBSCAN(
    min_cluster_size=400,
    min_samples=30,
    cluster_selection_method='eom',
    prediction_data=True,
    verbose=True,
)

In [None]:
from bertopic import BERTopic

# Assemble the pipeline from the sub-models configured in the cells above,
# keeping the ten top words per topic.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True,
)

# Fit on the cleaned documents, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)

In [None]:
# Show the topic overview of the fitted model; as the last expression in the
# cell, the result is rendered as the cell's output.
topic_model.get_topic_info()