In [None]:
# Topic modeling with Latent Dirichlet Allocation (LDA) in Python requires the gensim library.
# Install it once before running this notebook: pip install gensim

In [2]:
import gensim
from gensim import corpora
from pprint import pprint

In [3]:
# Toy corpus: five short, one-sentence documents used as input for topic modeling
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing is an important aspect of AI.",
    "Deep learning has shown remarkable results in various applications.",
    "Topic modeling helps in discovering hidden themes in a collection of documents.",
    "Python is a popular programming language for data science and machine learning."
]

In [4]:
# Turn every raw sentence into a list of word tokens with gensim's
# simple_preprocess (default settings). Stop words are NOT removed here,
# which is why words such as "in", "of" and "is" show up in the topics.
tokenized_documents = list(map(gensim.utils.simple_preprocess, documents))

In [5]:
# Build the token <-> integer-id vocabulary from the tokenized documents,
# then encode each document in bag-of-words form via that vocabulary.
dictionary = corpora.Dictionary(tokenized_documents)
corpus = list(map(dictionary.doc2bow, tokenized_documents))

In [6]:
# Train the LDA topic model on the bag-of-words corpus
lda_model = gensim.models.LdaModel(
    # Training data and the vocabulary used to map ids back to words
    corpus=corpus,
    id2word=dictionary,
    # Hyperparameters: adjust the number of topics as needed;
    # `passes` is the number of passes over the corpus
    num_topics=2,
    passes=10,
    # Fixed seed for the model's random state
    random_state=42,
)

In [7]:
# Display each topic as (topic_id, "weight*word + ...") in a readable layout
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.070*"in" + 0.070*"of" + 0.041*"topic" + 0.041*"discovering" + '
  '0.041*"documents" + 0.041*"helps" + 0.041*"themes" + 0.041*"hidden" + '
  '0.041*"modeling" + 0.041*"collection"'),
 (1,
  '0.053*"learning" + 0.053*"is" + 0.053*"language" + 0.032*"machine" + '
  '0.032*"for" + 0.032*"science" + 0.032*"data" + 0.032*"python" + 0.032*"and" '
  '+ 0.032*"programming"')]


In [8]:
# For every document, report the topic with the highest probability
for doc_number, bow in enumerate(corpus, start=1):
    # Indexing the model with a bag-of-words yields (topic_id, probability) pairs
    topic_distribution = lda_model[bow]
    dominant_topic, _probability = max(topic_distribution, key=lambda pair: pair[1])
    print(f"Document {doc_number}: Dominant Topic - {dominant_topic}")

Document 1: Dominant Topic - 0
Document 2: Dominant Topic - 1
Document 3: Dominant Topic - 1
Document 4: Dominant Topic - 0
Document 5: Dominant Topic - 1
