### Importing Libraries and Data

In [None]:
!pip install numpy
!pip install scikit-learn
!pip install gensim
!pip install pandas
!pip install openai
!pip install seaborn
!pip install nltk
!pip install bertopic
!pip install sentence-transformers
!pip install umap-learn
!pip install futures
!pip install tqdm
!pip install pickle5
!pip install hdbscan
!pip install datashader bokeh holoviews scikit-image colorcet

import sklearn.cluster as skc
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy import stats
from scipy.cluster.hierarchy import dendrogram, leaves_list

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patheffects as pe
from matplotlib.gridspec import GridSpec

import numpy as np
import pandas as pd
import re
import seaborn as sns
import time
import gensim
from collections.abc import Mapping
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

import sklearn

import pickle as pickle

from transformers import BertModel, BertTokenizer
import torch
import dask.dataframe as dd
import umap as umap
import hdbscan as hdbscan
import umap.plot as upplot

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load PubMedBERT model
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
model = SentenceTransformer(model_name)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/All_topic_model/final_merged_file.csv')

  data = pd.read_csv('/content/drive/MyDrive/All_topic_model/final_merged_file.csv')


In [None]:
data.shape

(2292732, 49)

### data filtering


In [None]:
data.keys()

Index(['APPLICATION_ID', 'ABSTRACT_TEXT', 'ACTIVITY', 'ADMINISTERING_IC',
       'APPLICATION_TYPE', 'ARRA_FUNDED', 'AWARD_NOTICE_DATE', 'BUDGET_START',
       'BUDGET_END', 'CFDA_CODE', 'CORE_PROJECT_NUM', 'ED_INST_TYPE',
       'FOA_NUMBER', 'FULL_PROJECT_NUM', 'SUBPROJECT_ID', 'FUNDING_ICs', 'FY',
       'IC_NAME', 'NIH_SPENDING_CATS', 'ORG_CITY', 'ORG_COUNTRY', 'ORG_DEPT',
       'ORG_DISTRICT', 'ORG_DUNS', 'ORG_FIPS', 'ORG_NAME', 'ORG_STATE',
       'ORG_ZIPCODE', 'PHR', 'PI_IDS', 'PI_NAMEs', 'PROGRAM_OFFICER_NAME',
       'PROJECT_START', 'PROJECT_END', 'PROJECT_TERMS', 'PROJECT_TITLE',
       'SERIAL_NUMBER', 'STUDY_SECTION', 'STUDY_SECTION_NAME', 'SUFFIX',
       'SUPPORT_YEAR', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT',
       'OPPORTUNITY NUMBER', 'FUNDING_MECHANISM', 'ORG_IPF_CODE',
       'DIRECT_COST_AMT', 'INDIRECT_COST_AMT', 'FUNDING_Ics'],
      dtype='object')

In [None]:
# Keep only the grants analysed in this notebook, using one combined row mask:
#   - an abstract is present
#   - administering IC is 'CA'
#   - activity code starts with 'R', excluding R25, R13 and R56
#   - fiscal year in [2000, 2021)
#   - TOTAL_COST greater than 1 (this comparison is False for missing values,
#     so it also removes rows without a cost)
activity_code = data['ACTIVITY']
fiscal_year = data['FY']

row_filter = (
    data['ABSTRACT_TEXT'].notna()
    & (data['ADMINISTERING_IC'] == 'CA')
    & activity_code.str.startswith('R', na=False)
    & ~activity_code.isin(['R25', 'R13', 'R56'])
    & (fiscal_year >= 2000)
    & (fiscal_year < 2021)
    & (data['TOTAL_COST'] > 1)
)

data = data[row_filter].reset_index(drop=True)
data.shape

(95655, 49)

In [None]:
total_cost_null = data['TOTAL_COST'].isnull().sum()
total_cost_null
total_cost_null_percent = (total_cost_null/len(data))*100
total_cost_null_percent

0.0

## creating corpus


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pickle

# Boilerplate words that appear in most grant abstracts and carry no topical signal.
mystop = ['aim', 'aims', 'description', 'provided', 'applicant', 'project', 'overall', 'summary', 'abstract', 'goal', "'s",
          'summary/abstract', 'proposals', 'subproject', 'objective', 'results', 'methods', 'conclusion', 'goal', 'study', 'by',
          'objective', 'public', 'health', 'relevance', 'common', '(', ')', ':', '.', ',', 'those', 'often', 'same', 'via',
          'taken', 'although', 'versus', 'following', 'instead', 'another', 'also', 'non', 'may', 'thus', 'could', 'therefore',
          'without', 'even', 'much', 'among', 'co', 'yet', 'still', 'ex', 'toward', 'hence', 'around', 'beyond', 'almost',
          'outside', 'might', 'inside', 'etc', 'ever', 'whereby', 'intra', 'like', 'ii', 'iii', 'unreadable']

# `stop` stays a list because it is pickled below; membership tests use a set.
stop = stopwords.words('english') + list(string.punctuation) + mystop
stop_lookup = set(stop)

clean_text = []
tokenized_text = []
keep_row = []  # one boolean per row of `data`: True when the abstract is long enough

# Abstracts with fewer than this many tokens (before stop-word removal) are dropped.
len_threshold = 50

for ab in data['ABSTRACT_TEXT']:
    # NOTE(review): these patterns match the literal character sequences "\xa0" / "\xad"
    # (a backslash followed by text), not the NBSP / soft-hyphen characters themselves.
    # Presumably the CSV contains escaped text; confirm before changing, since the saved
    # embeddings were computed from the corpus produced by exactly this cleaning.
    ab = ab.replace('\\xa0', ' ').replace('\\xad', '')
    tokens = word_tokenize(ab.lower())
    long_enough = len(tokens) >= len_threshold
    keep_row.append(long_enough)
    if long_enough:
        tokens = [tok for tok in tokens if tok not in stop_lookup]
        clean_text.append(' '.join(tokens))
        tokenized_text.append(tokens)

# Drop the short abstracts in one step using the positional mask. Filtering by text equality
# inside the loop rescans the frame for every short abstract and compares against the
# already-modified string, which can leave `data` longer than the token lists.
# The index is reset so that row position i of `data` is document i of the corpus.
data = data.loc[keep_row].reset_index(drop=True)
assert len(data) == len(clean_text) == len(tokenized_text)

data['Tokenized_Text'] = tokenized_text
data['Clean_Text'] = clean_text

# Save using pickle
with open('data.pickle', 'wb') as f:
    pickle.dump(data['Clean_Text'], f)

with open('stop.pickle', 'wb') as g:
    pickle.dump(stop, g)

In [None]:
#load from pickle
import pickle

with open('data.pickle', 'rb') as f:
    loaded_data = pickle.load(f)

with open('stop.pickle', 'rb') as g:
    loaded_stop = pickle.load(g)

In [None]:
corpus = loaded_data.tolist()

In [None]:
# # run once, save embeddings in a file

# from sentence_transformers import SentenceTransformer
# from sklearn.feature_extraction.text import CountVectorizer
# import umap.umap_ as umap
# from tqdm import tqdm
# import numpy as np

# docs = data['ABSTRACT_TEXT'].tolist()

# # Use a batch size to encode documents in batches
# sentence_model = SentenceTransformer("neuml/pubmedbert-base-embeddings")

# # Define a function to encode documents in batches with progress bar
# def encode_batch(docs, batch_size=32):
#     embeddings = []
#     for i in tqdm(range(0, len(docs), batch_size), desc="Encoding Batches"):
#         batch = docs[i:i+batch_size]
#         embeddings.extend(sentence_model.encode(batch, show_progress_bar=False))
#     return np.array(embeddings)

# # Encode documents in batches
# embeddings = encode_batch(corpus)

# # Save embeddings to a file
# file_path = '/content/drive/MyDrive/All_topic_model/embeddings.npy'

# with open(file_path, 'wb') as f:
#     np.save(f, embeddings)

## load in the embedding files


In [None]:
# Load the document embeddings that were computed once and saved with np.save.
# A plain numeric array does not need pickle support, so it is left disabled:
# np.load with allow_pickle=True can execute arbitrary code from the file.
embeddings = np.load('/content/drive/MyDrive/All_topic_model/embeddings.npy', allow_pickle=False)

# Row i of `embeddings` must belong to document i of `corpus`; fail early if the
# cached file was produced from a differently filtered corpus.
assert embeddings.shape[0] == len(corpus), (
    f"embeddings has {embeddings.shape[0]} rows but corpus has {len(corpus)} documents"
)

In [None]:
embeddings.shape

(95606, 768)

## BERTopic with external embeddings and the cleaned corpus

In [None]:
len(corpus)

95606

In [None]:
embeddings.shape

(95606, 768)

In [None]:
!pip install --upgrade bertopic



### **BERTopic with HDBSCAN + KeyBERTInspired Representation**

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired


# Sentence encoder handed to BERTopic. Document embeddings themselves are supplied
# precomputed to fit_transform below.
embedding_model = SentenceTransformer("neuml/pubmedbert-base-embeddings")

# Dimensionality reduction: 25 components, Manhattan metric, fixed seed.
umap_model = UMAP(
    n_neighbors=30,
    n_components=25,
    min_dist=0.0,
    metric="manhattan",
    random_state=42,
    verbose=True,
)

# Density-based clustering on the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Bag-of-words model for topic terms: uni- to tri-grams, no document-frequency
# cut-off, vocabulary capped at 10,000 features.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=1,
    max_features=10000,
)

# Fine-tunes the topic keywords after clustering.
representation_model = KeyBERTInspired()

# Assemble the topic model; the topic count is reduced to the requested 101 after clustering.
topic_model_HDB_key = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=9,
    nr_topics=101,
    min_topic_size=50,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the cleaned corpus using the precomputed embeddings.
topics, probs = topic_model_HDB_key.fit_transform(corpus, embeddings=embeddings)

# Summary table: one row per topic.
topic_info_HDB_key = topic_model_HDB_key.get_topic_info()
print(topic_info_HDB_key)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2024-09-11 05:00:03,582 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(metric='manhattan', min_dist=0.0, n_components=25, n_jobs=1, n_neighbors=30, random_state=42, verbose=True)
Wed Sep 11 05:00:03 2024 Construct fuzzy simplicial set
Wed Sep 11 05:00:03 2024 Finding Nearest Neighbors
Wed Sep 11 05:00:03 2024 Building RP forest with 20 trees
Wed Sep 11 05:00:15 2024 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	Stopping threshold met -- exiting after 5 iterations
Wed Sep 11 05:01:17 2024 Finished Nearest Neighbor Search
Wed Sep 11 05:01:21 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Sep 11 05:04:20 2024 Finished embedding


2024-09-11 05:04:21,420 - BERTopic - Dimensionality - Completed ✓
2024-09-11 05:04:21,432 - BERTopic - Cluster - Start clustering the reduced embeddings
  pid = os.fork()
2024-09-11 05:19:39,696 - BERTopic - Cluster - Completed ✓
2024-09-11 05:19:39,698 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-11 05:22:33,678 - BERTopic - Representation - Completed ✓
2024-09-11 05:22:33,680 - BERTopic - Topic reduction - Reducing number of topics
2024-09-11 05:24:40,304 - BERTopic - Topic reduction - Reduced number of topics from 416 to 101


     Topic  Count                                               Name  \
0       -1  25453     -1_cancer_cancers_cancer cells_prostate cancer   
1        0   2948       0_tumor immunity_immunotherapy_antitumor_ctl   
2        1   2932  1_p53_p53 function_p53 mediated_p53 tumor supp...   
3        2   2639  2_risk breast cancer_breast cancer risk_breast...   
4        3   2475  3_prostate cancer progression_prostate cancer ...   
..     ...    ...                                                ...   
96      95     58       95_myb_hematopoiesis_hematopoietic_leukemias   
97      96     56  96_cancer chemoprevention_chemopreventive_chem...   
98      97     55           97_dna adducts_dna adduct_adducts_adduct   
99      98     52     98_toxicity_chemotherapeutic_prodrug_compounds   
100     99     50  99_tnf alpha_tumor necrosis factor_tnf_necrosi...   

                                        Representation  \
0    [cancer, cancers, cancer cells, prostate cance...   
1    [tumor immunit

In [None]:
# Reduce outliers
new_topics_HDB_key = topic_model_HDB_key.reduce_outliers(corpus, topics, probabilities=probs, strategy="probabilities")

In [None]:
# Recompute the topic representations for the outlier-reassigned labels.
# update_topics() replaces the model's vectorizer and representation model with
# whatever is passed here (defaults: a plain CountVectorizer and no representation
# model). The configured vectorizer (which already uses 1-3 grams), a
# KeyBERTInspired representation and top_n_words=9 are therefore passed explicitly
# so the updated topics are built the same way as the fitted ones.
topic_model_HDB_key.update_topics(
    corpus,
    topics=new_topics_HDB_key,
    top_n_words=9,
    vectorizer_model=vectorizer_model,
    representation_model=KeyBERTInspired(),
)
topic_model_HDB_key.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,8047,0_tumor_cells_immune_cell,"[tumor, cells, immune, cell, specific, anti, a...",[long-term develop vaccine strategies lead act...
1,1,3016,1_p53_myc_mdm2_cyclin,"[p53, myc, mdm2, cyclin, cell, function, prote...",[adapted investigator p53 protein plays critic...
2,2,4223,2_breast_breast cancer_cancer_risk,"[breast, breast cancer, cancer, risk, women, e...",[longterm work advance breast cancer preventio...
3,3,4075,3_prostate_prostate cancer_ar_androgen,"[prostate, prostate cancer, ar, androgen, canc...",[prostate cancer male malignancy takes lives 3...
4,4,4941,4_dna_repair_damage_dna damage,"[dna, repair, damage, dna damage, brca1, repli...",[dna mismatch repair mmr system essential main...
...,...,...,...,...,...
95,95,58,95_myb_myb myb_muvb_myb protein,"[myb, myb myb, muvb, myb protein, hematopoieti...",[provide v-myb oncogene causes acute monoblast...
96,96,56,96_oral_oral cancer_ginger_brb,"[oral, oral cancer, ginger, brb, oscc, chemopr...",[oral squamous cell carcinoma oscc cancer head...
97,97,55,97_adducts_dna adducts_dna_leg,"[adducts, dna adducts, dna, leg, adduct, carci...",[exposure wide variety carcinogens leads forma...
98,98,52,98_cpt 11_cpt_11_sn,"[cpt 11, cpt, 11, sn, sn 38, hice, 38, diarrhe...",[understand cpt-11 processing metabolism atomi...


In [None]:
# Customizing the topic names
def custom_topic_names(model, num_words=12):
    """Build a ``{topic_id: label}`` mapping from each topic's leading terms.

    Args:
        model: Object exposing ``get_topic_freq()`` (whose ``.Topic`` attribute
            iterates over topic ids) and ``get_topic(topic_id)`` (returning a
            sequence of entries whose first element is the term).
        num_words: Maximum number of terms joined into each label.

    Returns:
        dict mapping every topic id except ``-1`` to a space-separated label.
    """
    labelled_topics = (tid for tid in model.get_topic_freq().Topic if tid != -1)
    return {
        tid: " ".join(entry[0] for entry in model.get_topic(tid)[:num_words])
        for tid in labelled_topics
    }

# Apply custom labels with a limited number of words
topic_model_HDB_key.set_topic_labels(custom_topic_names(topic_model_HDB_key, num_words=12))

In [None]:
topic_model_HDB_key.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,0,8047,0_tumor_cells_immune_cell,tumor cells immune cell specific anti antigen ...,"[tumor, cells, immune, cell, specific, anti, a...",[long-term develop vaccine strategies lead act...
1,1,3016,1_p53_myc_mdm2_cyclin,p53 myc mdm2 cyclin cell function protein tumo...,"[p53, myc, mdm2, cyclin, cell, function, prote...",[adapted investigator p53 protein plays critic...
2,2,4223,2_breast_breast cancer_cancer_risk,breast breast cancer cancer risk women er estr...,"[breast, breast cancer, cancer, risk, women, e...",[longterm work advance breast cancer preventio...
3,3,4075,3_prostate_prostate cancer_ar_androgen,prostate prostate cancer ar androgen cancer pc...,"[prostate, prostate cancer, ar, androgen, canc...",[prostate cancer male malignancy takes lives 3...
4,4,4941,4_dna_repair_damage_dna damage,dna repair damage dna damage brca1 replication...,"[dna, repair, damage, dna damage, brca1, repli...",[dna mismatch repair mmr system essential main...
...,...,...,...,...,...,...
95,95,58,95_myb_myb myb_muvb_myb protein,myb myb myb muvb myb protein hematopoietic myb...,"[myb, myb myb, muvb, myb protein, hematopoieti...",[provide v-myb oncogene causes acute monoblast...
96,96,56,96_oral_oral cancer_ginger_brb,oral oral cancer ginger brb oscc chemopreventi...,"[oral, oral cancer, ginger, brb, oscc, chemopr...",[oral squamous cell carcinoma oscc cancer head...
97,97,55,97_adducts_dna adducts_dna_leg,adducts dna adducts dna leg adduct carcinogens...,"[adducts, dna adducts, dna, leg, adduct, carci...",[exposure wide variety carcinogens leads forma...
98,98,52,98_cpt 11_cpt_11_sn,cpt 11 cpt 11 sn sn 38 hice 38 diarrhea ce iri...,"[cpt 11, cpt, 11, sn, sn 38, hice, 38, diarrhe...",[understand cpt-11 processing metabolism atomi...


In [None]:
def custom_topic_names(model, num_words=12):
    """Build ``{topic_id: label}`` labels that avoid repeating terms across topics.

    Topics are visited in the order returned by ``model.get_topic_freq().Topic``.
    A term already used in an earlier topic's label is skipped, so the result
    depends on that order. Topic ``-1`` never receives a label.

    If every term of a topic was already used by earlier topics, the label falls
    back to the topic's own leading terms instead of becoming an empty string.

    Note: this definition reuses the name of the simpler ``custom_topic_names``
    defined earlier in the notebook and replaces it once this cell has run.

    Args:
        model: Object exposing ``get_topic_freq()`` and ``get_topic(topic_id)``;
            ``get_topic`` returns entries whose first element is the term.
        num_words: Maximum number of terms per label.

    Returns:
        dict mapping topic id to a space-separated label.
    """
    custom_labels = {}
    all_used_words = set()  # terms that already appear in some label

    for topic in model.get_topic_freq().Topic:
        if topic == -1:
            continue

        terms = [entry[0] for entry in model.get_topic(topic)]

        # Prefer terms no earlier label has used.
        label_terms = [term for term in terms if term not in all_used_words][:num_words]

        # Nothing new left for this topic: reuse its own top terms rather than
        # emitting an empty label.
        if not label_terms:
            label_terms = terms[:num_words]

        custom_labels[topic] = " ".join(label_terms)
        all_used_words.update(label_terms)

    return custom_labels

# Apply custom labels with minimized duplicates
topic_model_HDB_key.set_topic_labels(custom_topic_names(topic_model_HDB_key, num_words=12))
topic_model_HDB_key.get_topic_info()


Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,0,8047,0_tumor_cells_immune_cell,tumor cells immune cell specific anti antigen ...,"[tumor, cells, immune, cell, specific, anti, a...",[long-term develop vaccine strategies lead act...
1,1,3016,1_p53_myc_mdm2_cyclin,p53 myc mdm2 cyclin function protein role,"[p53, myc, mdm2, cyclin, cell, function, prote...",[adapted investigator p53 protein plays critic...
2,2,4223,2_breast_breast cancer_cancer_risk,breast breast cancer risk women er estrogen ca...,"[breast, breast cancer, cancer, risk, women, e...",[longterm work advance breast cancer preventio...
3,3,4075,3_prostate_prostate cancer_ar_androgen,prostate prostate cancer ar androgen pca men p...,"[prostate, prostate cancer, ar, androgen, canc...",[prostate cancer male malignancy takes lives 3...
4,4,4941,4_dna_repair_damage_dna damage,dna repair damage dna damage brca1 replication...,"[dna, repair, damage, dna damage, brca1, repli...",[dna mismatch repair mmr system essential main...
...,...,...,...,...,...,...
95,95,58,95_myb_myb myb_muvb_myb protein,myb myb myb muvb myb protein myb proteins myb ...,"[myb, myb myb, muvb, myb protein, hematopoieti...",[provide v-myb oncogene causes acute monoblast...
96,96,56,96_oral_oral cancer_ginger_brb,oral cancer ginger brb oscc lbr oral carcinoge...,"[oral, oral cancer, ginger, brb, oscc, chemopr...",[oral squamous cell carcinoma oscc cancer head...
97,97,55,97_adducts_dna adducts_dna_leg,adducts dna adducts leg adduct carcinogens ms ...,"[adducts, dna adducts, dna, leg, adduct, carci...",[exposure wide variety carcinogens leads forma...
98,98,52,98_cpt 11_cpt_11_sn,cpt 11 cpt 11 sn sn 38 hice 38 diarrhea ce iri...,"[cpt 11, cpt, 11, sn, sn 38, hice, 38, diarrhe...",[understand cpt-11 processing metabolism atomi...


In [None]:
topic_model_HDB_key.visualize_topics()


In [None]:
# Representative documents per topic, as a dict {topic_id: [document text, ...]}.
representative_docs = topic_model_HDB_key.get_representative_docs()

# The model returns document *texts*, so map each text back to its row position.
# Iterating the dict directly would only yield topic ids, and looking those up in
# `data` would return the costs of rows 0..N instead of the representative documents.
# When a text occurs more than once in the corpus, its first position is used.
corpus_position = {}
for position, text in enumerate(corpus):
    corpus_position.setdefault(text, position)

# Row i of `data` is assumed to be document i of `corpus` (the corpus was built from
# data['Clean_Text']); check at least that the lengths agree.
assert len(data) == len(corpus), "data and corpus are not aligned row-for-row"

# Positions of all representative documents, skipping the outlier topic (-1).
representative_doc_indices = [
    corpus_position[doc]
    for topic, docs in representative_docs.items()
    if topic != -1
    for doc in docs
    if doc in corpus_position
]

# Positional lookup (`iloc`), independent of whatever labels the index carries.
representative_costs = data['TOTAL_COST'].iloc[representative_doc_indices].values

# Print the TOTAL_COST values
print(representative_costs)


[408077. 632118.  96750. 502390. 383130. 614606. 143964. 129000. 545795.
 304150. 399570.  65350. 228008.  72750. 118028. 328313. 297500. 296820.
 307123. 433558. 382000. 288240. 806869. 241740. 285132. 231815. 100000.
 295263. 337334. 318750. 540466. 237088. 400349. 232013. 330615. 311601.
 231941. 300579. 311850. 294275. 190094. 306754. 297968. 276833. 304509.
 249554. 239054. 263840. 297968. 288844. 286344. 183263. 299361. 368776.
 129860. 135900. 319779. 519408. 289733. 335483. 329228. 301583. 247161.
 237790. 304150. 567763. 321160. 242437. 641044. 316030. 261450. 130505.
 272149. 310075. 169320. 221680. 270000. 299589. 156581. 128570. 505087.
 252800. 105700. 184833. 273340. 129000. 276335. 408482. 152710. 320674.
 166210. 562949. 304425. 200475. 309825. 276401. 239370. 185910. 279965.
 171338.]


In [None]:
fig = topic_model_HDB_key.visualize_term_rank(log_scale=True)
fig.show()

### **BERTopic with KMeans + KeyBERTInspired Representation**

In [None]:
# Define KMeans model
kmeans_model = KMeans(n_clusters=101, random_state=42)

# Stand-alone KMeans labels on the raw embeddings. BERTopic does not use these:
# it refits `kmeans_model` on the UMAP-reduced embeddings during fit_transform.
cluster_labels = kmeans_model.fit_predict(embeddings)

representation_model = KeyBERTInspired()

# Initialize BERTopic with precomputed embeddings and KMeans.
# BERTopic takes its clustering algorithm through the `hdbscan_model` argument;
# without it the default HDBSCAN is used and `kmeans_model` has no effect.
# KMeans assigns every document to a cluster, so no outlier topic (-1) is expected.
# calculate_probabilities is switched off: the topic-probability matrix is derived
# from HDBSCAN, so with KMeans `probs_kmeans` is expected to be None.
topic_model_KMeans_key = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=kmeans_model,
    umap_model=umap_model,
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    top_n_words=9,
    verbose=True,
    nr_topics=101,
    min_topic_size=50,
    calculate_probabilities=False
)


# Fit the topic model with precomputed embeddings
topics_kmeans, probs_kmeans = topic_model_KMeans_key.fit_transform(corpus, embeddings=embeddings)

# Get topic information
topic_info_kmeans_key = topic_model_KMeans_key.get_topic_info()
print(topic_info_kmeans_key)

In [None]:
# Reassign outlier documents (topic -1) of the KMeans-based model.
# Use this model's own assignments (`topics_kmeans` / `probs_kmeans`), not the
# `topics` / `probs` produced by the HDBSCAN model earlier in the notebook.
if -1 in topics_kmeans:
    if probs_kmeans is not None:
        new_topics_kmeans_key = topic_model_KMeans_key.reduce_outliers(
            corpus, topics_kmeans, probabilities=probs_kmeans, strategy="probabilities"
        )
    else:
        # No probability matrix available: fall back to embedding similarity.
        new_topics_kmeans_key = topic_model_KMeans_key.reduce_outliers(
            corpus, topics_kmeans, strategy="embeddings", embeddings=embeddings
        )
else:
    # Nothing to reassign when the clusterer produced no outlier topic.
    new_topics_kmeans_key = list(topics_kmeans)

In [None]:
# Recompute the topic representations for the updated labels.
# update_topics() replaces the model's vectorizer and representation model with
# whatever is passed here (defaults: a plain CountVectorizer and no representation
# model), so the configured vectorizer (1-3 grams), a KeyBERTInspired
# representation and top_n_words=9 are passed explicitly.
topic_model_KMeans_key.update_topics(
    corpus,
    topics=new_topics_kmeans_key,
    top_n_words=9,
    vectorizer_model=vectorizer_model,
    representation_model=KeyBERTInspired(),
)
topic_model_KMeans_key.get_topic_info()

In [None]:
topic_model_KMeans_key.set_topic_labels(custom_topic_names(topic_model_KMeans_key, num_words=12))
topic_model_KMeans_key.get_topic_info()

### **BERTopic with HDBSCAN + GPT Representation**

In [None]:
# Rough size check of the corpus before using the OpenAI API.
# NOTE: this counts whitespace-separated words, which only approximates the number
# of model tokens (no model tokenizer is used here).
# Limits quoted for gpt-3.5-turbo:
# context window: 16,385 tokens
# max output tokens: 4096 tokens
# NOTE(review): the representation model configured in this notebook is
# "gpt-4o-mini", not gpt-3.5-turbo -- confirm which limits actually apply.
total_tokens = sum(len(doc.split()) for doc in corpus)
print(f"Total number of tokens: {total_tokens}")

In [None]:
import getpass
import os

import openai
from bertopic.representation import OpenAI
import multiprocessing as mp

# SECURITY: never store an API key in the notebook. The key is read from the
# OPENAI_API_KEY environment variable, or prompted for interactively when unset.
# The key that was previously hardcoded in this cell has been exposed and must be
# revoked in the OpenAI dashboard.
MY_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Fine-tune topic representations with GPT
mp.set_start_method('spawn', force=True)

client = openai.OpenAI(api_key=MY_API_KEY)
# delay_in_seconds spaces out successive API requests.
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True, delay_in_seconds=10)

# Initialize BERTopic with precomputed embeddings
topic_model_HDB_GPT = BERTopic(
    embedding_model = embedding_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    top_n_words=9,  # Number of top words kept per topic
    verbose=True,
    nr_topics=101, # Target number of topics
    min_topic_size = 50,
    calculate_probabilities = True
    )

# Fit the topic model with precomputed embeddings.
# NOTE: this overwrites the `topics` / `probs` returned by the earlier HDBSCAN +
# KeyBERTInspired model.
topics, probs = topic_model_HDB_GPT.fit_transform(corpus, embeddings = embeddings)

# Get topic information
topic_info_HDB_GPT = topic_model_HDB_GPT.get_topic_info()
print(topic_info_HDB_GPT)


In [None]:
topic_model_HDB_GPT.get_document_info(corpus)

In [None]:
topic_model_HDB_GPT.visualize_topics()

In [None]:
# BERTopic ships no `evaluation` module and no `evaluate_coherence` method, so the
# coherence is computed with gensim's CoherenceModel instead.
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel


def compute_topic_coherence(topic_model, docs, measure="c_v", top_n=10, processes=1):
    """Compute a gensim coherence score for a fitted BERTopic model.

    Topic keywords are taken from the model's c-TF-IDF matrix rather than from
    ``get_topic``, because with an LLM representation ``get_topic`` returns a
    generated label instead of keywords. Only single-word terms are used so that
    they match the unigram tokenisation of the reference texts.

    Args:
        topic_model: Fitted BERTopic model (uses ``vectorizer_model``,
            ``c_tf_idf_`` and ``topics_``).
        docs: The documents the model was fitted on.
        measure: gensim coherence measure, e.g. "c_v" or "c_npmi".
        top_n: Maximum number of keywords per topic.
        processes: Worker processes for gensim; 1 avoids multiprocessing
            issues inside notebooks.

    Returns:
        float coherence score.

    Raises:
        ValueError: if the c-TF-IDF rows cannot be matched to topic ids, or no
            topic has at least two usable keywords.
    """
    vectorizer = topic_model.vectorizer_model

    # Tokenise documents the same way the vectorizer forms its unigrams.
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    stop_words = vectorizer.get_stop_words() or frozenset()
    texts = [
        [tok for tok in tokenize(preprocess(doc)) if tok not in stop_words]
        for doc in docs
    ]
    dictionary = Dictionary(texts)

    vocabulary = vectorizer.get_feature_names_out()
    topic_ids = sorted(set(topic_model.topics_))
    # Assumes row i of c_tf_idf_ belongs to the i-th smallest topic id
    # (the outlier topic -1 first, when present); verify the shape at least.
    if topic_model.c_tf_idf_.shape[0] != len(topic_ids):
        raise ValueError("c_tf_idf_ rows do not match the model's topic ids")

    topic_words = []
    for row, topic_id in enumerate(topic_ids):
        if topic_id == -1:
            continue  # the outlier topic is not a real topic
        weights = topic_model.c_tf_idf_[row].toarray().ravel()
        words = []
        for idx in np.argsort(weights)[::-1]:
            if weights[idx] <= 0 or len(words) == top_n:
                break
            term = vocabulary[idx]
            if " " not in term and term in dictionary.token2id:
                words.append(term)
        if len(words) >= 2:
            topic_words.append(words)

    if not topic_words:
        raise ValueError("no topic has at least two usable keywords")

    # gensim evaluates the same number of words for every topic.
    shared_top_n = min(len(words) for words in topic_words)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=texts,
        dictionary=dictionary,
        coherence=measure,
        topn=shared_top_n,
        processes=processes,
    )
    return coherence_model.get_coherence()


# Calculate topic coherence
coherence_score = compute_topic_coherence(topic_model_HDB_GPT, corpus, measure="c_v")
print("Coherence Score (c_v) for BERTopic with HDBSCAN and GPT Representation:", coherence_score)

### **BERTopic with KMeans + GPT Representation**

In [None]:
kmeans_model = KMeans(n_clusters=101, random_state=42)

# Stand-alone KMeans labels on the raw embeddings. BERTopic does not use these:
# it refits `kmeans_model` on the UMAP-reduced embeddings during fit_transform.
cluster_labels = kmeans_model.fit_predict(embeddings)

# GPT-based topic labelling; `client` is the OpenAI client created in the
# HDBSCAN + GPT cell above.
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True, delay_in_seconds = 10)

# BERTopic has no `kmeans_model` argument (passing one raises TypeError). Any
# clustering algorithm, including KMeans, is supplied through `hdbscan_model`.
# calculate_probabilities is switched off: the topic-probability matrix is derived
# from HDBSCAN, so with KMeans `probs` is expected to be None.
topic_model_KMEANS_GPT = BERTopic(
    embedding_model = embedding_model,
    hdbscan_model = kmeans_model,
    umap_model=umap_model,
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    top_n_words=9,  # Number of top words kept per topic
    verbose=True,
    nr_topics=101, # Target number of topics
    min_topic_size = 50,
    calculate_probabilities = False
    )

# NOTE: this overwrites the `topics` / `probs` of the previously fitted model.
topics, probs = topic_model_KMEANS_GPT.fit_transform(corpus, embeddings = embeddings)

# Get topic information
topic_info_KMEANS_GPT = topic_model_KMEANS_GPT.get_topic_info()

In [None]:
topic_model_KMEANS_GPT.get_document_info(corpus)

In [None]:
topic_model_KMEANS_GPT.visualize_topics()

In [None]:
# BERTopic ships no `evaluation` module and no `evaluate_coherence` method, so the
# coherence is computed with gensim's CoherenceModel instead.
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel


def compute_topic_coherence(topic_model, docs, measure="c_v", top_n=10, processes=1):
    """Compute a gensim coherence score for a fitted BERTopic model.

    Topic keywords are taken from the model's c-TF-IDF matrix rather than from
    ``get_topic``, because with an LLM representation ``get_topic`` returns a
    generated label instead of keywords. Only single-word terms are used so that
    they match the unigram tokenisation of the reference texts.

    Args:
        topic_model: Fitted BERTopic model (uses ``vectorizer_model``,
            ``c_tf_idf_`` and ``topics_``).
        docs: The documents the model was fitted on.
        measure: gensim coherence measure, e.g. "c_v" or "c_npmi".
        top_n: Maximum number of keywords per topic.
        processes: Worker processes for gensim; 1 avoids multiprocessing
            issues inside notebooks.

    Returns:
        float coherence score.

    Raises:
        ValueError: if the c-TF-IDF rows cannot be matched to topic ids, or no
            topic has at least two usable keywords.
    """
    vectorizer = topic_model.vectorizer_model

    # Tokenise documents the same way the vectorizer forms its unigrams.
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    stop_words = vectorizer.get_stop_words() or frozenset()
    texts = [
        [tok for tok in tokenize(preprocess(doc)) if tok not in stop_words]
        for doc in docs
    ]
    dictionary = Dictionary(texts)

    vocabulary = vectorizer.get_feature_names_out()
    topic_ids = sorted(set(topic_model.topics_))
    # Assumes row i of c_tf_idf_ belongs to the i-th smallest topic id
    # (the outlier topic -1 first, when present); verify the shape at least.
    if topic_model.c_tf_idf_.shape[0] != len(topic_ids):
        raise ValueError("c_tf_idf_ rows do not match the model's topic ids")

    topic_words = []
    for row, topic_id in enumerate(topic_ids):
        if topic_id == -1:
            continue  # the outlier topic is not a real topic
        weights = topic_model.c_tf_idf_[row].toarray().ravel()
        words = []
        for idx in np.argsort(weights)[::-1]:
            if weights[idx] <= 0 or len(words) == top_n:
                break
            term = vocabulary[idx]
            if " " not in term and term in dictionary.token2id:
                words.append(term)
        if len(words) >= 2:
            topic_words.append(words)

    if not topic_words:
        raise ValueError("no topic has at least two usable keywords")

    # gensim evaluates the same number of words for every topic.
    shared_top_n = min(len(words) for words in topic_words)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=texts,
        dictionary=dictionary,
        coherence=measure,
        topn=shared_top_n,
        processes=processes,
    )
    return coherence_model.get_coherence()


# Calculate topic coherence
coherence_score = compute_topic_coherence(topic_model_KMEANS_GPT, corpus, measure="c_v")
print("Coherence Score (c_v) for BERTopic with KMeans and GPT Representation:", coherence_score)