# BERTopic - Pneumonia Other Types 

In [1]:
from umap import UMAP
import pandas as pd 
from hdbscan import HDBSCAN
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import torch
import os
import numpy as np 
import ast

# Parse the stringified values of the 'embedding' column into float32 NumPy arrays
def convert_to_array(embedding_str):
    """Parse a stringified embedding into a float32 NumPy array.

    Parameters
    ----------
    embedding_str : str
        Python-literal text of a numeric sequence, e.g. ``"[0.1, 0.2]"``.
        Non-string values (such as a float NaN from a missing CSV cell)
        are treated as invalid.

    Returns
    -------
    numpy.ndarray or None
        The parsed ``float32`` array, or ``None`` when the value cannot be
        parsed, is not numeric, is a bare scalar, or is empty. Returning
        ``None`` lets the caller drop the row with ``dropna``.
    """
    try:
        parsed = np.array(ast.literal_eval(embedding_str), dtype=np.float32)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: malformed literal or non-numeric strings.
        # TypeError: the literal parsed to something NumPy cannot cast to
        # float (for example a dict), which previously aborted the whole apply.
        return None

    # A scalar or an empty sequence is not a usable embedding vector and
    # would break the later row-wise stacking of embeddings.
    if parsed.ndim == 0 or parsed.size == 0:
        return None
    return parsed

# Read the DataFrame
pneumonia_type_df = pd.read_csv('../NER_embeddings/pneumonia_type/radgraph_with_embeddings.csv')

# Filter the dataset to keep only the cases labelled as 'other' pneumonia
other_df = pneumonia_type_df[pneumonia_type_df['pneumonia_type'] == 'other'].copy()

# Drop rows where 'radgraph_text' is NaN or empty
other_df = other_df[other_df['radgraph_text'].notna()]
other_df = other_df[other_df['radgraph_text'].str.strip() != '']

# Parse the stringified embeddings with convert_to_array (defined earlier in
# this cell); rows that cannot be parsed come back as None.
other_df['embedding'] = other_df['embedding'].apply(convert_to_array)

# Drop rows where embeddings couldn't be converted
other_df = other_df.dropna(subset=['embedding'])

# Fail early with a readable message instead of an opaque np.vstack error
# when the filters above leave nothing to model.
if other_df.empty:
    raise ValueError("No 'other' pneumonia rows with usable text and embeddings were found.")

# Extract documents and embeddings (one row of `embeddings` per document)
docs = other_df['radgraph_text'].astype(str).tolist()
embeddings = np.vstack(other_df['embedding'].values)
# Ensure embeddings shape is correct
print(f"Embeddings shape: {embeddings.shape}")  # Should be (num_docs, embedding_dim)

# Ensure shape consistency
assert len(docs) == embeddings.shape[0], "Mismatch between docs and embeddings!"
print(f"Filtered dataset size: {len(docs)}")


# Load the previously saved embedding model and its tokenizer from disk.
# NOTE(review): the directory is named 'bacterial' while this notebook models
# the 'other' subset -- confirm this is the intended checkpoint.
model_path = "../../models/RADGRAPH_embeddings/bacterial"
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedding_model = AutoModel.from_pretrained(model_path)
# NOTE(review): BERTopic's documented Hugging Face integration wraps the model
# in transformers.pipeline("feature-extraction", ...). Confirm that a bare
# AutoModel passed as `embedding_model` is really used by BERTopic rather than
# being replaced by its default embedder. `tokenizer` is not used again in
# this cell.

# Single seed shared by every stochastic component so reruns give the same topics
RANDOM_STATE = 42

# UMAP model for dimensionality reduction.
# Without a fixed random_state UMAP is stochastic, so clusters (and therefore
# topics) change between runs; seeding trades some parallelism for
# reproducibility.
umap_model = UMAP(n_neighbors=30,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=RANDOM_STATE)

# HDBSCAN model for clustering the UMAP-reduced embeddings
hdbscan_model = HDBSCAN(min_samples=20,
                        gen_min_span_tree=True,
                        prediction_data=True,
                        min_cluster_size=5,
                        metric='euclidean',
                        cluster_selection_method='leaf')

# CountVectorizer used to build the topic-term matrix (default tokenizer;
# no custom tokenizer is passed)
vectorizer_model = CountVectorizer(strip_accents='unicode',
                                   stop_words='english',
                                   ngram_range=(1, 3),
                                   max_df=0.6)  # remove general terms

# Custom c-TF-IDF model
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=False,        # keep the standard c-TF-IDF weighting (BM25-style saturation is disabled)
    reduce_frequent_words=True,  # dampen the impact of overly frequent words
)

# Chained representation: first extract the most relevant terms with KeyBERT,
# then re-rank them for diversity with MMR
representation_model = [KeyBERTInspired(top_n_words=30, random_state=RANDOM_STATE),
                        MaximalMarginalRelevance(diversity=0.7)]

# Assemble BERTopic from the sub-models configured earlier in this cell.
topic_model = BERTopic(
    # --- pipeline components ---
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    # --- output settings ---
    top_n_words=10,
    calculate_probabilities=True,  # return topic probabilities for every document
    verbose=True,
)

# Fit on the documents using the precomputed embeddings.
# `topics` holds one topic id per document; `probabilities` holds the
# document-topic probabilities requested via calculate_probabilities.
topics, probabilities = topic_model.fit_transform(documents=docs, embeddings=embeddings)

# Output locations: everything is written below one model root directory
output_root = '../../models/BERTOPIC_Other_Pneumonia'
clusters_prefix = f'{output_root}/clusters/'
representation_dir = f'{output_root}/representations/specter_representation_keybert_mmr'

# Create the output directories if they do not exist yet
for directory in (output_root, f'{output_root}/representations/', clusters_prefix):
    os.makedirs(directory, exist_ok=True)

# Persist the fitted BERTopic model as a directory, including its c-TF-IDF matrix
topic_model.save(representation_dir, serialization="pytorch", save_ctfidf=True)

# Cluster assignments from the fitted HDBSCAN model:
#   labels_        -> cluster id per document
#   probabilities_ -> strength of each document's membership in its assigned cluster
hdbscan_labels = topic_model.hdbscan_model.labels_
hdbscan_probabilities = topic_model.hdbscan_model.probabilities_

# Save HDBSCAN cluster labels and membership strengths
torch.save(hdbscan_labels, f'{clusters_prefix}hdbscan_labels_keybert_mmr.pt')
torch.save(hdbscan_probabilities, f'{clusters_prefix}hdbscan_probabilities_keybert_mmr.pt')

## Extra files stored manually inside the saved model directory
# Document-topic probabilities returned by fit_transform
torch.save(probabilities, f'{representation_dir}/specter_probabilities_keybert_mmr.pt')

# Topic overview table, stored as a pickled object plus CSV and Excel exports
topic_info = topic_model.get_topic_info()
torch.save(topic_info, f'{representation_dir}/topic_info_keybert_mmr.pt')
topic_info.to_csv(f'{representation_dir}/topic_info.csv')
topic_info.to_excel(f'{representation_dir}/topic_info.xlsx')

# Representation models and the count vectorizer, each saved to its own .pt file
torch.save(representation_model, f'{representation_dir}/representation_model.pt')
torch.save(vectorizer_model, f'{representation_dir}/vectorizer_model.pt')

Embeddings shape: (1654, 768)
Filtered dataset size: 1654


2025-02-12 14:52:21,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-02-12 14:52:26,876 - BERTopic - Dimensionality - Completed ✓
2025-02-12 14:52:26,876 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-12 14:52:26,951 - BERTopic - Cluster - Completed ✓
2025-02-12 14:52:26,954 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-12 14:52:40,422 - BERTopic - Representation - Completed ✓


In [7]:
import torch
from bertopic import BERTopic
from transformers import AutoTokenizer, AutoModel

# Embedding model (and tokenizer) saved on disk; the model is handed to
# BERTopic.load below
model_path = "../../models/RADGRAPH_embeddings/bacterial"
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedding_model = AutoModel.from_pretrained(model_path)

# Restore the fitted BERTopic model from its saved directory
saved_topic_model_dir = '../../models/BERTOPIC_Other_Pneumonia/representations/specter_representation_keybert_mmr'
topic_model = BERTopic.load(
    saved_topic_model_dir,
    embedding_model=embedding_model,
)

# Topic overview table of the restored model
topics = topic_model.get_topic_info()

In [8]:
# Display BERTopic's interactive topic overview for the loaded model
# (the returned figure is rendered as the cell output)
topic_model.visualize_topics() 

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [9]:
# Display per-topic bar charts of the top terms, for up to 100 topics
topic_model.visualize_barchart(top_n_topics=100)