# How to do cluster analysis with Bertopic

This notebook contains code to do a clustering analysis using Bertopic. 

In [22]:
# Remember to load kernel first 
# Ideally we could like to load a requirements text file with package versions but for this demo we will just install the latest versions
!pip install bertopic sentence-transformers pandas plotly numpy tqdm umap-learn hdbscan nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.9 MB/s[0m  [33m0:00:00[0mm eta [36m0:00:01[0m
[?25hDownloading click-8.3.1-py3-none-any.whl (108 kB)
Installing collected packages: click, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nltk][32m1/2[0m [nltk]
[1A[2KSuccessfully installed click-8.3.1 nltk-3.9.2


In [None]:
# First, we initialize packages that we will use. 

from bertopic import BERTopic
from bertopic.representation import TextGeneration, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
import pandas as pd
import plotly.express as px
import numpy as np
import re
import os
from pathlib import Path
import random
from tqdm import tqdm
from umap import UMAP
import hdbscan

# Directory where this script lives
script_dir = Path.cwd()
print(script_dir)
# Project root = parent of script dir (adjust as needed)
project_path = script_dir.parent
print(project_path)
# We also set a random state in order to make reproducible results
RANDOM_STATE = 1111
np.random.seed(RANDOM_STATE)


# embedding_model_name = "allenai-specter" <- english and specific for science, for multilingual use "paraphrase-multilingual-mpnet-base-v2"
embedding_model_name = "allenai-specter"
sentence_model = SentenceTransformer(embedding_model_name)

/Users/linnmattisson/Library/CloudStorage/OneDrive-SharedLibraries-ADC/Internal Projects - Bibliometrics/Bibliometric demo/scripts
/Users/linnmattisson/Library/CloudStorage/OneDrive-SharedLibraries-ADC/Internal Projects - Bibliometrics/Bibliometric demo


### Step 1: Load data
As a first step, we want to load our data, and possibly we also want to do some cleaning of the data. Bertopic uses large language models like BERT to produce the embeddings, and therefore we do not want to remove stopwords or similar here. However, sometimes we might want to remove HTML or other noisy stuff. In this example, we only load in the data and make a list of the documents. 

In [6]:
## Load the data
file_path = os.path.join(project_path, 'data','scopus_clean.csv')
print("Reading from:", file_path)
df = pd.read_csv(file_path)
df.columns = df.columns.str.replace('.', '_')


Reading from: /Users/linnmattisson/Library/CloudStorage/OneDrive-SharedLibraries-ADC/Internal Projects - Bibliometrics/Bibliometric demo/data/scopus_clean.csv


In [7]:
df.columns
df.head(1)
print(df.columns.tolist())

['Authors', 'Author_full_names', 'Author_s__ID', 'Title', 'Year', 'Source_title', 'Volume', 'Issue', 'Art__No_', 'Page_start', 'Page_end', 'Page_count', 'Cited_by', 'DOI', 'Link', 'Affiliations', 'Authors_with_affiliations', 'Abstract', 'Author_Keywords', 'Index_Keywords', 'Correspondence_Address', 'Editors', 'Publisher', 'ISSN', 'ISBN', 'CODEN', 'PubMed_ID', 'Language_of_Original_Document', 'Abbreviated_Source_Title', 'Document_Type', 'Publication_Stage', 'Open_Access', 'Source', 'EID', 'source_file', 'source_file_noext', 'application']


In [57]:
class ScopusCleaner:
    def __init__(
        self,
        df: pd.DataFrame,
        sep_token: str = "[SEP]",
        include_keywords_in_text: bool = True,
    ):
        """
        Parameters
        ----------
        df : pd.DataFrame
            Scopus export as a DataFrame. Must contain at least
            'title', 'abstract', 'subject_area' and optionally 'keywords'.
        sep_token : str
            Token used to separate title and abstract (e.g. tokenizer.sep_token).
        include_keywords_in_text : bool
            If True, append keywords to the text representation.
        """
        self.df = df.copy()
        self.sep_token = sep_token
        self.include_keywords_in_text = include_keywords_in_text

    def add_subject_area_to_article_df(self) -> pd.DataFrame:
        """
        Example placeholder: ensure there is a 'subject_area' column.
        Adapt this to however your subject areas are encoded.
        """
        if "subject_area" not in self.df.columns:
            # create an empty subject_area column if missing
            self.df["subject_area"] = None
        # do any cleaning / parsing here
        return self.df

    def get_title_abs_text(self) -> list[str]:
        """
        Return concatenated title + abstract (+ optional keywords) as a list of strings.
        """
        # Safely get columns (fall back to empty string if missing)
        title = self.df.get("title", pd.Series([""] * len(self.df))).fillna("")
        indexwords = self.df.get("Index_Keywords", pd.Series([""] * len(self.df))).fillna("")
        text = title + indexwords  



        return text.tolist()

In [58]:
sc = ScopusCleaner(
    df, 
    sep_token = sentence_model.tokenizer.sep_token,
    include_keywords_in_text=True,
)
df = sc.add_subject_area_to_article_df()

docs = sc.get_title_abs_text()

In [59]:
type(docs)
len(docs)

for i, d in enumerate(docs[:5]):
    print(f"--- doc {i} ---")
    print(d)
    print()



--- doc 0 ---


--- doc 1 ---
mitochondrial protein; P62 protein, human; parkin; polyubiquitin; RNA binding protein; sequestosome 1; signal transducing adaptor protein; ubiquitin protein ligase; autophagy; genetics; HeLa cell line; human; mitochondrion; mitophagy; ubiquitination; Adaptor Proteins, Signal Transducing; Autophagy; HeLa Cells; Humans; Mitochondria; Mitochondrial Proteins; Mitophagy; Polyubiquitin; RNA-Binding Proteins; Sequestosome-1 Protein; Ubiquitin-Protein Ligases; Ubiquitination

--- doc 2 ---
cathepsin S; CD40 ligand; gamma interferon; inflammasome; interleukin 12; interleukin 13; interleukin 18; interleukin 2; interleukin 4; thymic stromal lymphopoietin; cytokine; IL1B protein, human; inflammasome; interleukin 18; interleukin 18 protein, human; interleukin 1beta; antigen; blood; cancer; cell component; gene expression; immunity; memory; protein; tumor; antigen binding; antigen presenting cell; Article; autofluorescence; blood sampling; cancer staging; cancer surgery

### Step 2: Create embeddings

Secondly, we want to create embeddings from the text in the documents. Here, we use a pre-trained language model called `paraphrase-multilingual-MiniLM-L12-v2` which has a nice trade-off between size and speed. 

Note that the second cell that encodes the documents into embeddings takes some time to run. Therefore, we save the final embeddings in a numpy array, and later we can also load them in quickly without having to encode them again. 

In [60]:
# OBS: Only run this cell once - it takes some time to run...
# For the DEMO data, it takes about 10-15 minutes
embeds = sentence_model.encode(docs, show_progress_bar=True)
print(embeds.shape)

# File path embeddings saves it in the project outputs folder:
data_path = os.path.join(project_path, "data")
file_path_embeddings = os.path.join(data_path, "embeddings")

# Create folders if they don't exist
os.makedirs(file_path_embeddings, exist_ok=True)

np.save(
    os.path.join(file_path_embeddings, f"{embedding_model_name}-incl-keywords.npy"),
    embeds,
)

Batches: 100%|██████████| 474/474 [10:15<00:00,  1.30s/it]


(15139, 768)


In [61]:
# If you are starting from here, else skip this cell
data_path = os.path.join(project_path, "data")
file_path_embeddings = os.path.join(data_path, "embeddings", "allenai-specter-incl-keywords.npy")
embeds = np.load(file_path_embeddings)

### Step 3: Run and tune topic models

Now is the time to run and tune the parameters of the clustering model. Bertopic uses UMAP to reduce dimensionality of the embeddings, and then HDBSCAN to cluster the documents. 

As a default, Bertopic uses a variation of TF-IDF (c-TF-IDF) to interpret the clusters, but here we add our own CountVectorizer to interpret the clusters. It uses also bi-grams and a list of Danish stop-words to create names for the clusters. 

The preferred way to tune the parameters is to start with the default values as written here. Then, you can visualize and change one parameters at a time if you feel like it. Note that there is several models in play, and they can be very sensitive, so begin with small changes. Usually, the models are also varies a lot between random states, so you might also want to try out a couple of different random states to validate your results. 

You can read about the different parameters here: 
 - Bertopic: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html 
 - UMAP: https://umap-learn.readthedocs.io/en/latest/parameters.html 
 - HDBSCAN: https://hdbscan.readthedocs.io/en/latest/parameter_selection.html 

In [62]:
# Here the CountVectorizer is initialized. This is used for naming the clusters afterwards.
# You probably don't need to tune any parameters here. 
# with open('data/danish_stopwords.txt', 'r') as f:
#     danish_stop_words = f.readlines()
# danish_stop_words = [d.replace('\n', '') for d in danish_stop_words] + ['projekt','projektet','udvikle','demonstrere']

from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

# Base English stopwords
stopwords = stopwords.words("english")
print(stopwords[:20])     
print(stopwords[-10:])  
stopwords = stopwords + ['study', 'results', 'conclusion', 'method', 'methods', 'approach', 'paper', 'based', 'using', 'use', 'error', 'errata', 'erratum', 'journal', 'journal priority']
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 2))

 # *** Change parameters here ***
best_hp_dict = {
     "METRIC": "euclidean", # <- distance metric. We use the same in both UMAP and HDBSCAN

     # UMAP parameters
     "N_COMPONENTS": 2, # <- how many dimensions should we reduce to - for easy visualisation, we might want to choose 2
     #"N_NEIGHBORS": 10, 
     # "MIN_DIST": 0.0,

     # HDBSCAN parameters
     "HDBSCAN_MIN_CLUSTER_SIZE": 40, # <- the smallest size grouping that we want to consider a cluster
     "MIN_SAMPLES": 17, # <- how conservative do we want the clustering to be
     "CLUSTER_SELECTION_EPSILON": 0.2, # <- tradeoff between DBSCAN and HDBSCAN, higher values favors different size clusters

     # Bertopic parameters (irrelevant, since we add our own CountVectorizer)
     # "TOP_N_WORDS": xx,
     # "MIN_TOPIC_SIZE": xx,
 }





['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been']
['y', 'you', "you'd", "you'll", 'your', "you're", 'yours', 'yourself', 'yourselves', "you've"]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linnmattisson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
# Here, we define the models with the parameters from best_hp_dict
# In the demo project this takes about 15 seconds

umap_model = UMAP(
    # n_components=best_hp_dict['N_COMPONENTS'], 
    n_components=2, 
    # metric=best_hp_dict['METRIC'],
    # n_neighbors=best_hp_dict['N_NEIGHBORS'],
    # min_dist=best_hp_dict['MIN_DIST'],
    random_state=RANDOM_STATE,
    low_memory=False
)

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=best_hp_dict["HDBSCAN_MIN_CLUSTER_SIZE"],
    min_samples=best_hp_dict["MIN_SAMPLES"],
    cluster_selection_epsilon=best_hp_dict["CLUSTER_SELECTION_EPSILON"],
    # metric=best_hp_dict['METRIC'],
    cluster_selection_method = 'eom',
    gen_min_span_tree=True,
    prediction_data=True
)



representation_model = MaximalMarginalRelevance(diversity=0.3)

topic_model = BERTopic(
    language="multilingual",
    vectorizer_model=vectorizer_model,
    # top_n_words=best_hp_dict["TOP_N_WORDS"],
    # min_topic_size=best_hp_dict["MIN_TOPIC_SIZE"],
    umap_model=umap_model,
    hdbscan_model=hdbscan_model, 
    representation_model=representation_model,
)

topics, probs = topic_model.fit_transform(docs, embeds) 

In [64]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28,-1____,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , ]"
1,0,14371,0_cell_protein_cancer_human,"[cell, protein, cancer, human, gene, tumor, ag...",[cell marker; cetuximab; cisplatin; dasatinib;...
2,1,514,1_priority priority_priority_priority article_...,"[priority priority, priority, priority article...","[erratum; error; priority journal, erratum; er..."
3,2,226,2_papillomavirus_human papillomavirus_papillom...,"[papillomavirus, human papillomavirus, papillo...",[cancer registry; cancer screening; cancer tis...


In [65]:
print(df.columns.tolist())

['Authors', 'Author_full_names', 'Author_s__ID', 'Title', 'Year', 'Source_title', 'Volume', 'Issue', 'Art__No_', 'Page_start', 'Page_end', 'Page_count', 'Cited_by', 'DOI', 'Link', 'Affiliations', 'Authors_with_affiliations', 'Abstract', 'Author_Keywords', 'Index_Keywords', 'Correspondence_Address', 'Editors', 'Publisher', 'ISSN', 'ISBN', 'CODEN', 'PubMed_ID', 'Language_of_Original_Document', 'Abbreviated_Source_Title', 'Document_Type', 'Publication_Stage', 'Open_Access', 'Source', 'EID', 'source_file', 'source_file_noext', 'application', 'subject_area', 'topics', 'probs', 'topics_name']


In [66]:
# Reduce outliers
#new_topics = topic_model.reduce_outliers(docs, topics)
#new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions")
#new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")

topics = new_topics

The difficult question is then: When we are confident with the model (and the parameters)? 
Our experience is that it requires a process of going back and forth between visualization of the UMAP projects colored with the clusters and re-consiring the parameters to make a clustering that seems to make sense. 

To guide the interpretation of the quality of the clusters, Bertopic have a number of nice and quick visualizations that is shown below. 

In the step 4, we make our own visualization that can also be used for further analysis - e.g. by comparing the linguistic clusters/embeddings with other background data. 

### Step 4: Visualization

When we are satisfied with our topic model, we want to visualize and analyze the clustering. Here, we first information about which document belong to which topic to the original dataframe. Then, we can plot or analyze how the clustering relates to other information in the data. 

In [67]:
# Make dict to map topic number to labels made by our CountVectorizer, Can change the word represnetation
label_dict = {}
for label in topic_model.generate_topic_labels():
    k,v = label.split('_', 1)
    label_dict[k] = v
    
# Add topics, probabilities, and topic names to original dataframe
df['topics'] = topics
df['probs'] = probs
df['topics_name'] = df['topics'].apply(lambda x: label_dict[str(x)])
df['topics_name'] = df['topics_name'].apply(lambda x: x.replace('_',' - '))
df['topics_name'] = df.apply(lambda row: 'Ikke Kategoriseret' if row['topics'] == -1 else row['topics_name'], axis=1)

print(df[['Title','topics','topics_name']].head(10))

# Note that cluster -1 i.e. 0 is the outlier cluster

# Print column naems as a list: 
print(df.columns.tolist())

                                               Title  topics  \
0  Jaw exercise therapy for the treatment of tris...       1   
1  K63-linked ubiquitylation induces global seque...       0   
2  Cytokines regulate the antigen-presenting char...       0   
3  The ETS transcription factor ETV5 is a target ...       0   
4  Identification of pre-diagnostic metabolic pat...       0   
5  Activation of ras signalling is associated wit...       0   
6  Quantifying DNA damage induced by ionizing rad...       0   
7  Mid-Infrared Imaging Is Able to Characterize a...       0   
8  18th Annual Meeting of the International Socie...       0   
9  Promotion of cell membrane fusion by cell-cell...       0   

                                       topics_name  
0  priority priority - priority - priority article  
1                          cell - protein - cancer  
2                          cell - protein - cancer  
3                          cell - protein - cancer  
4                          cel

In [68]:



def line_breaker(s: str) -> str:
    chunk_size = len(s)//15
    s_chunks = [ s[i:i+chunk_size] for i in range(0, len(s), chunk_size) ]
    return '<br>'.join(s_chunks)

# IF we chose n_components = 2 in the UMAP model, we can easily add them to the original df.
# If we chose more than 2 components, we run another UMAP reduce the embeddings to 2d for visualization. 
umap_model = topic_model.umap_model
temp = pd.DataFrame(umap_model.embedding_, columns=["x", "y"])
df_out = pd.concat([df,temp], axis=1)

print(df_out['topics_name'].nunique())
# Optional: We might want to save the final dataframe - so we don't need to run all the code if we want to do further analysis
# File path is output path:
file_path = os.path.join(project_path, "output")

df_out.to_csv(f"{file_path}/data_with_topics.csv", index=False)


4


In [None]:
#pd.options.display.max_columns = 99
#file_path = r'C:\Users\RickardSjöbergADC\Desktop\Project\Mistra_du1\GIT\mistra-du1\Klusting\All\data'
#df_out = pd.read_csv(f"{file_path}/data_with_topics.csv")
#df_out.shape

In [69]:
# File path plots saves it in the project outputs folder:
file_path_plots = os.path.join(project_path, "output")


In [71]:
print(df.head(10))

                                             Authors Author_full_names  \
0  Karlsson O., Karlsson T., Pauli N., Andréll P....               NaN   
1  Richard T.J.C., Herzog L.K., Vornberger J., Ra...               NaN   
2  Rao A., Strauss O., Kokkinou E., Bruchard M., ...               NaN   
3  Mus L.M., Lambertz I., Claeys S., Kumps C., Va...               NaN   
4  Jonsson P., Antti H., Späth F., Melin B., Björ...               NaN   
5  Rossitti H.M., Dutta R.K., Larsson C., Ghayee ...               NaN   
6  Singh V., Johansson P., Torchinsky D., Lin Y.-...               NaN   
7  Kontsek E., Pesti A., Björnstedt M., Üveges T....               NaN   
8                                  Moscoso-Castro M.               NaN   
9  Yoshihara A., Watanabe S., Goel I., Ishihara K...               NaN   

                                        Author_s__ID  \
0  55315176400;55607497200;55069673400;6507335277...   
1  56544149600;57193544131;57220869572;5722130075...   
2  57194379167;56

In [72]:
# ==== PREP CELL ====

import os
import webbrowser
from pathlib import Path

import plotly.express as px
import pandas as pd

# Assumptions:
# - One or both of these are already defined earlier in the notebook:
#     df_out        # pandas DataFrame
#     topic_model   # e.g. a BERTopic model
# - file_path_plots is a string or Path pointing to an existing / desired folder.
#   Example:
#   file_path_plots = "./plots"

# Ensure output directory exists
file_path_plots = Path(file_path_plots)  # make sure it's a Path object
file_path_plots.mkdir(parents=True, exist_ok=True)


def get_cluster_series(df_out=None, topic_model=None):
    """
    Return a pandas Series with cluster/topic labels for each row.
    Priority:
      1) df_out['topics_name']
      2) df_out['topic']
      3) topic_model document info (column 'Topic' or 'topic')
    """
    if df_out is not None:
        if "topics_name" in df_out.columns:
            return df_out["topics_name"]
        elif "topic" in df_out.columns:
            return df_out["topic"]

    if topic_model is not None:
        # Try to pull doc-level info from topic_model (e.g. BERTopic)
        doc_info = topic_model.get_document_info()
        if "topics_name" in doc_info.columns:
            return doc_info["topics_name"]
        elif "Topic" in doc_info.columns:
            return doc_info["Topic"]
        elif "topic" in doc_info.columns:
            return doc_info["topic"]

    raise ValueError(
        "Could not find a cluster/topic column. "
        "Expected 'topics_name' or 'topic' in df_out or topic_model.get_document_info()."
    )


def get_coordinates(df_out=None):
    """
    Return (x, y) Series for a map-like plot if df_out has them.
    Looks for columns 'x' and 'y'.
    """
    if df_out is not None and {"x", "y"}.issubset(df_out.columns):
        return df_out["x"], df_out["y"]
    return None, None


def save_and_open_html(fig, filename, folder=file_path_plots):
    """
    Save a Plotly figure as HTML in `folder` with `filename` and open it in a browser.
    """
    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)
    filepath = folder / filename
    fig.write_html(str(filepath), include_plotlyjs="cdn", full_html=True)
    webbrowser.open(f"file://{filepath.resolve()}")
    print(f"Saved and opened: {filepath}")

In [73]:
# ==== BAR CHART: NUMBER OF ROWS PER CLUSTER ====

# Get clusters from df_out or topic_model
cluster_series = get_cluster_series(
    df_out=df_out if "df_out" in globals() else None,
    topic_model=topic_model if "topic_model" in globals() else None,
)

# Build a DataFrame for counting
df_counts = (
    cluster_series
    .value_counts(dropna=False)
    .rename_axis("cluster")
    .reset_index(name="count")
)

# Optional: convert NaN to a label for plotting
df_counts["cluster"] = df_counts["cluster"].fillna("None / -1 / NaN")

fig_bar = px.bar(
    df_counts,
    x="cluster",
    y="count",
    title="Number of rows per cluster",
    labels={"cluster": "Cluster", "count": "Count"},
)

fig_bar.update_layout(xaxis_type="category")

# Show in notebook
fig_bar.show()

# Save and open as HTML
save_and_open_html(fig_bar, filename="cluster_counts_bar.html", folder=file_path_plots)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Saved and opened: /Users/linnmattisson/Library/CloudStorage/OneDrive-SharedLibraries-ADC/Internal Projects - Bibliometrics/Bibliometric demo/output/cluster_counts_bar.html


In [74]:
# ==== MAP-LIKE 2D CLUSTER VISUALISATION ====

# Get clusters (same helper as above)
cluster_series = get_cluster_series(
    df_out=df_out if "df_out" in globals() else None,
    topic_model=topic_model if "topic_model" in globals() else None,
)

# Try to get x,y from df_out
x, y = get_coordinates(df_out=df_out if "df_out" in globals() else None)

if x is not None and y is not None:
    # Case 1: We have 2D coordinates in df_out
    df_map = pd.DataFrame(
        {
            "x": x.values,
            "y": y.values,
            "cluster": cluster_series.values,
        }
    )

    fig_map = px.scatter(
        df_map,
        x="x",
        y="y",
        color="cluster",
        title="Map-like visualisation of clusters",
        labels={"x": "X", "y": "Y", "cluster": "Cluster"},
        opacity=0.8,
    )

    fig_map.update_traces(marker=dict(size=6))

    fig_map.show()
    save_and_open_html(fig_map, filename="cluster_map_scatter.html", folder=file_path_plots)

elif "topic_model" in globals() and topic_model is not None:
    # Case 2: No x,y in df_out; fall back to topic_model's own visualisation (e.g. BERTopic)
    # This assumes a BERTopic-like interface with visualize_documents(..) returning a Plotly figure.
    try:
        # You may want to pass specific documents/embeddings here depending on how your model is set up.
        fig_map = topic_model.visualize_documents()
        fig_map.show()
        save_and_open_html(fig_map, filename="cluster_map_topic_model.html", folder=file_path_plots)
    except Exception as e:
        raise RuntimeError(
            "Could not create map-like visualisation from topic_model. "
            "Make sure your model supports `visualize_documents()` or provide x,y in df_out."
        ) from e
else:
    raise RuntimeError(
        "No 2D coordinates found in df_out (columns 'x' and 'y' missing), "
        "and topic_model is not available or not usable for visualisation."
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Saved and opened: /Users/linnmattisson/Library/CloudStorage/OneDrive-SharedLibraries-ADC/Internal Projects - Bibliometrics/Bibliometric demo/output/cluster_map_scatter.html
