In [3]:
import pandas as pd
import numpy as np
import os

## Set Path

In [4]:
# Directory prefixes used by the rest of the notebook. Every non-empty prefix
# ends with "/" because later cells build file names by string concatenation.
# The LF_* entries are still empty placeholders.
LF_result_path = ""
LF_export_path = ""

# The three HY_* folders share one Google Drive project folder, so the common
# root is written once and each sub-folder is appended to it. If the Drive
# folder moves, only this line needs to change.
HY_project_root = "/Users/huiziyu/Library/CloudStorage/GoogleDrive-huiziy@g.ucla.edu/My Drive/Project - LLM in Biomedical & Health/"
HY_result_path = HY_project_root + "new/results/"
HY_export_path = HY_project_root + "new/results/"
HY_import_path = HY_project_root + "new_data/processed/"

In [5]:
# Select the HY_* directories as the active locations; these three names are
# the ones every later cell reads.
result_path, data_export_path, data_import_path = (
    HY_result_path,
    HY_export_path,
    HY_import_path,
)

In [6]:
# Load the input table; the file name is appended directly to the import prefix.
input_csv = data_import_path + "output_data_en.csv"
df = pd.read_csv(input_csv)

In [7]:
# Replace every missing value with an empty string, so that missing text
# fields become "" instead of NaN.
df = df.fillna("")

In [8]:
# Keep only the title, DOI and abstract columns, renumbering rows from 0.
text_columns = ['display_name', 'doi', 'abstract_text']
abstract_df = df[text_columns].reset_index(drop=True)

In [9]:
# Choose the text that represents each paper for topic modelling. The focus is
# on sentence-level, context-aware information rather than keywords, so the
# text is passed through unmodified.
#
# Previously the "title. abstract" variant was computed and then immediately
# overwritten by the title-only variant. The choice is now an explicit switch;
# the default (False) keeps the title-only behaviour.
use_abstract = False
if use_abstract:
    # Title and abstract joined as two sentences.
    abstract_df["Title_abstract"] = abstract_df["display_name"] + ". " + abstract_df["abstract_text"]
else:
    # Title only.
    abstract_df["Title_abstract"] = abstract_df["display_name"]

### Topic Modeling

In [None]:
!pip install sentence_transformers
!pip install umap
!pip install sklearn
!pip instlal hdbscan

In [11]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# One plain-string document per row, in row order.
docs = list(map(str, abstract_df["Title_abstract"]))

## Add additional stopwords based on the query

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Additional stop words based on the search query.
query_terms = {'large language model', 'GPT', 'ChatGPT', 'Llama', 'Google Palm', 'Anthropic Claude', 'health', 'medical'}

# CountVectorizer lower-cases the text by default and filters stop words one
# token at a time, so mixed-case entries ('GPT') and multi-word phrases
# ('large language model') would never match any token. Normalise every query
# term to lower-case single tokens so they are actually removed.
# NOTE(review): splitting phrases also removes the individual words
# ('large', 'language', 'model', 'google', 'palm', ...) — confirm this is wanted.
additional_stop_words = {token for term in query_terms for token in term.lower().split()}

# Combine default English stop words with the query-specific ones
custom_stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

In [None]:
# [TODO 1] consider removing the query keywords
# [TODO 2] remove non-English documents
# [TODO 3] cluster number: more experiments [20, 30, 40, 50, 100, 150, 200]
# 200 is too many
n_clusters = 200
random_seed = 42  # fixed so that repeated runs produce the same topics

# scikit-learn validates `stop_words` as 'english', a list or None, so the
# stop-word collection is converted to a list (sorted for a stable order).
vectorizer_model = CountVectorizer(stop_words=sorted(custom_stop_words))

# Same UMAP settings BERTopic applies by default, plus a seed: without it the
# reduced space (and therefore the clustering) differs from run to run.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  low_memory=False,
                  random_state=random_seed)

# HDBSCAN alternative, currently disabled in favour of k-means:
# hdbscan_model = HDBSCAN(min_cluster_size=10,
#                         min_samples=5,
#                         metric='euclidean',
#                         cluster_selection_method='eom',
#                         prediction_data=True) # random_state=42
# k-means is passed through BERTopic's `hdbscan_model` slot and yields a fixed
# number of clusters.
cluster_model = KMeans(n_clusters=n_clusters, random_state=random_seed)
topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model, #hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       top_n_words=10) # nr_topics=100,
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted topic model under the results directory.
model_file = result_path + "/LLM_title_200"
topic_model.save(model_file)

In [None]:
# Attach the topic id assigned to each document (same row order as `docs`).
abstract_df = abstract_df.assign(topics=topics)

In [None]:
# Display the frame to check the "topics" column added in the previous cell.
abstract_df

In [None]:
# Display the topic summary returned by the fitted model (the same table is
# exported to CSV in the next cell).
topic_model.get_topic_info()

In [15]:
# Export the topic summary table as CSV, without the row index.
topic_info = topic_model.get_topic_info()
topic_info_csv = data_export_path + "/LLM_en_openalex_title_topics_metadata_200.csv"
topic_info.to_csv(topic_info_csv, index=False)

In [16]:
# Topic positions -> suggestions for further clustering.
# The figure is written to a standalone HTML file instead of being shown inline.
idm_html = result_path + "/LLM_en_openalex_title_viz_IDM_200.html"
fig_IDM = topic_model.visualize_topics()
fig_IDM.write_html(idm_html)

In [17]:
# Topic hierarchy -> suggestions for further clustering; saved as HTML.
hierarchy_html = result_path + "/LLM_en_openalex_title_viz_hierarchy_200.html"
fig_hierarchy = topic_model.visualize_hierarchy()
fig_hierarchy.write_html(hierarchy_html)

In [18]:
# Per-topic word-score bar charts, limited to 32 topics; saved as HTML.
word_scores_html = result_path + "/LLM_en_openalex_title_viz_topic_word_scores_200.html"
fig_topic_word_scores = topic_model.visualize_barchart(top_n_topics=32)
fig_topic_word_scores.write_html(word_scores_html)

In [19]:
# Topic similarity heatmap. Not useful in our case; possibly worth including
# to show that the topics are not correlated. Saved as HTML.
heatmap_html = result_path + "/LLM_en_openalex_title_viz_heatmap_200.html"
fig_heatmap = topic_model.visualize_heatmap()
fig_heatmap.write_html(heatmap_html)

In [20]:
# Visualize documents in 2-D and keep the coordinates for Tableau.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Reduce the embeddings to two dimensions once. The seed makes the exported
# x/y positions reproducible; without it they change on every run.
reduced_embeddings = UMAP(n_neighbors=10,
                          n_components=2,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=42).fit_transform(embeddings)

# Previously the figure was built from the full embeddings and then discarded:
# it was not the last expression of the cell and was never saved. It is now
# built from the precomputed projection, so the plot matches the exported
# coordinates, and written to HTML like the other figures.
fig_documents = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig_documents.write_html(result_path+"/LLM_en_openalex_title_viz_documents_200.html")

# Save the 2-D positions (for Tableau)
abstract_df["x"] = reduced_embeddings[:, 0]
abstract_df["y"] = reduced_embeddings[:, 1]


In [21]:
import pickle

# Store the full (un-reduced) sentence embeddings as a pickle file.
embeddings_file = result_path + '/LLM_en_openalex_title_embeddings_200.pickle'
with open(embeddings_file, 'wb') as handle:
    pickle.dump(embeddings, handle)

In [22]:
# Export the per-document table (text, topic id, x/y position) without the index.
# NOTE(review): the file name mentions "years", but no year column is selected
# when abstract_df is created — confirm whether one should be included.
positions_csv = data_export_path + "/LLM_en_openalex_title_w_topics_years_positions_200.csv"
abstract_df.to_csv(positions_csv, index=False)