This file is reserved for running BERTopic on bacterial pneumonia reports to identify subgroups associated with severity and clinical outcomes.

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Tokens that come from the structured report format itself rather than
# from clinical content; they are removed before topic words are counted.
structured_stopwords = [
    'is',
    'an',
    'anatomy',
    'observation',
    'modifies',
    'located',
    'at',
]

# Sentence encoder used to embed the documents (PubMedBERT-based checkpoint).
medical_embedder = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

# Bag-of-words model for the topic representations: uni- to tri-grams,
# capped at the 500 most frequent features, minus the structural stopwords.
clinical_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=500,
    stop_words=structured_stopwords,
)

# Baseline BERTopic configuration with default UMAP/HDBSCAN settings.
# NOTE: `topic_model` is assigned again by the later cell that builds the
# fully customised pipeline, so this instance is replaced once that cell runs.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    nr_topics='auto',
    min_topic_size=5,
    vectorizer_model=clinical_vectorizer,
    embedding_model=medical_embedder,
)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import torch
import os
# Source table of reports. `reports_df` is not created in this cell and must
# already exist in the notebook session (defined by another cell).
database = reports_df  # alias of the same DataFrame; not used further in this cell

# One document per row, in row order. The order must stay aligned with the
# precomputed `embeddings` array that is passed to fit_transform below.
docs = reports_df['full_text'].tolist()

# UMAP model for dimensionality reduction.
# random_state pins UMAP's stochastic optimisation so that re-running the
# notebook yields the same reduced space, and therefore the same clusters and
# topics as the artefacts saved to disk (same seed as KeyBERTInspired below).
# Trade-off: a fixed seed makes UMAP run without its parallel optimisations.
umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# HDBSCAN model for clustering the reduced embeddings.
# - cluster_selection_method='leaf' picks the leaves of the cluster tree,
#   which favours many small, fine-grained clusters.
# - prediction_data=True keeps the extra data HDBSCAN needs to compute
#   membership vectors / predictions for documents.
hdbscan_model = HDBSCAN(
    min_samples=20,
    gen_min_span_tree=True,
    prediction_data=True,
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='leaf',
)

# CountVectorizer for the topic representations. No custom tokenizer is
# passed: it uses sklearn's default tokenisation with accent stripping,
# English stopwords and uni- to tri-grams.
vectorizer_model = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.6,  # drop very general terms (document frequency above 0.6)
)

# Step 1: Initialize custom c-TF-IDF model
ctfidf_model = ClassTfidfTransformer(
    # BM25 weighting adds a saturation effect (extra occurrences of a term
    # contribute less and less to its weight); it is switched off here.
    bm25_weighting=False,
    # Reduce the impact of overly frequent words.
    reduce_frequent_words=True,
)

# Chained representation: first extract the most relevant terms with KeyBERT,
# then re-rank them with MMR to prioritise diversity.
representation_model = [
    KeyBERTInspired(top_n_words=30, random_state=42),
    MaximalMarginalRelevance(diversity=0.7),
]

# Assemble the BERTopic pipeline from the components above.
# `embedding_model` is not created in this cell and must already exist in the
# notebook session (defined by another cell).
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
    representation_model=representation_model,
    top_n_words=30,
    calculate_probabilities=True,  # Ensure that BERTopic calculates probabilities
)

# Fit on the documents, reusing the precomputed `embeddings` instead of
# re-encoding the texts. `probabilities` is the document-topic matrix
# (available because the model was built with calculate_probabilities=True).
topics, probabilities = topic_model.fit_transform(docs, embeddings=embeddings)

# Output locations, named once so every artefact below is written to a
# consistent place.
output_root = '../../models/NEWBERTopic'
representations_dir = output_root + '/representations/'
clusters_dir = output_root + '/clusters/'
model_dir = representations_dir + 'specter_representation_keybert_mmr'

# Create the output directories if they do not exist yet.
for directory in (output_root, representations_dir, clusters_dir):
    os.makedirs(directory, exist_ok=True)

# Persist the fitted BERTopic model as a directory, including the c-TF-IDF
# matrix of the topic terms.
topic_model.save(model_dir, serialization="pytorch", save_ctfidf=True)

# Cluster assignment of every document and the strength with which HDBSCAN
# assigns it to that cluster, read from the fitted clustering model.
fitted_hdbscan = topic_model.hdbscan_model
hdbscan_labels = fitted_hdbscan.labels_
hdbscan_probabilities = fitted_hdbscan.probabilities_

# NOTE: the objects saved with torch.save below are pickled by torch.save;
# presumably none of them are tensors, in which case recent PyTorch versions
# need torch.load(..., weights_only=False) to read them back - confirm.
torch.save(hdbscan_labels, clusters_dir + 'hdbscan_labels_keybert_mmr.pt')
torch.save(hdbscan_probabilities, clusters_dir + 'hdbscan_probabilities_keybert_mmr.pt')

## Extra files stored manually inside the saved model directory
# Document-topic probabilities: association of each document with the topic(s).
torch.save(probabilities, model_dir + '/specter_probabilities_keybert_mmr.pt')

# Topic overview table, stored in three formats (.pt, .csv, .xlsx).
topic_info = topic_model.get_topic_info()
torch.save(topic_info, model_dir + '/topic_info_keybert_mmr.pt')
topic_info.to_csv(model_dir + '/topic_info.csv')
topic_info.to_excel(model_dir + '/topic_info.xlsx')

# Representation chain and count vectorizer used for this run.
torch.save(representation_model, model_dir + '/representation_model.pt')
torch.save(vectorizer_model, model_dir + '/vectorizer_model.pt')