# Chapter 12

# 12.8.2. Clustering biomedical research papers by topic

In [1]:
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Example biomedical texts (research papers or clinical notes)
documents = [
    "The patient was diagnosed with lung cancer and underwent chemotherapy.",
    "This study investigates the genetic mutations in breast cancer patients.",
    "The drug metformin is commonly used for treating diabetes.",
    "Hypertension is a risk factor for heart disease and stroke.",
    "The gene BRCA1 is linked to breast cancer risk."
]

# Tunable settings, named in one place instead of being buried in the calls below.
NUM_TOPICS = 3   # number of latent topics LDA is asked to find
NUM_PASSES = 15  # full passes over the corpus during training
TOP_WORDS = 5    # words shown per topic when printing
SEED = 42        # LDA starts from a random initialisation; fixing the seed makes
                 # the discovered topics reproducible across kernel restarts

# Preprocessing: lowercase each document, tokenize it, keep only alphanumeric
# tokens (drops punctuation) and remove English stopwords.
# NOTE(review): stopwords.words() and word_tokenize() rely on NLTK data packages
# (the 'stopwords' corpus and the 'punkt' / 'punkt_tab' tokenizer data, depending
# on the NLTK version) - run nltk.download(...) once if they are missing.
stop_words = set(stopwords.words('english'))
texts = [
    [word for word in word_tokenize(doc.lower()) if word.isalnum() and word not in stop_words]
    for doc in documents
]

# Map each unique token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs - the input format LDA expects.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit the LDA topic model with a fixed random_state so reruns give the same topics.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=SEED,
)

# Print the discovered topics; gensim topic ids are 0-based, so show them 1-based.
topics = lda_model.print_topics(num_words=TOP_WORDS)
for i, topic in topics:
    print(f"Topic {i+1}: {topic}")


Topic 1: 0.037*"cancer" + 0.037*"breast" + 0.037*"used" + 0.037*"commonly" + 0.037*"brca1"
Topic 2: 0.087*"risk" + 0.049*"heart" + 0.049*"factor" + 0.049*"drug" + 0.049*"hypertension"
Topic 3: 0.107*"cancer" + 0.061*"breast" + 0.060*"investigates" + 0.060*"mutations" + 0.060*"study"


In [3]:
import pyLDAvis
import pyLDAvis.gensim_models

# Visualize the topics.
# sort_topics=False keeps pyLDAvis's topic numbering in the model's own order, so
# "Topic N" in the chart is the same topic as "Topic N" in the printed topic list
# (by default pyLDAvis re-orders topics by prevalence and renumbers them).
lda_visualization = pyLDAvis.gensim_models.prepare(
    lda_model, corpus, dictionary, sort_topics=False
)
pyLDAvis.display(lda_visualization)
