# LDA DEMO

In [11]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load the small English spaCy pipeline once; `preprocess` reuses it for
# tokenization, lemmas and the stop-word / alphabetic token flags.
nlp = spacy.load("en_core_web_sm")


# Sample documents for demonstration

In [12]:
# Toy corpus: four sentences about NLP / topic modeling plus two unrelated
# ones (a remark about a game and a tongue twister).
documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Latent Dirichlet Allocation is a generative probabilistic model.",
    "Topic modeling is used to identify topics present in a corpus of text.",
    "Gensim is a popular Python library for topic modeling and document similarity.",
    "Genshin is a good game.",
    "Peter piper picked a peck of pickled peppers.",
]

# Preprocess the documents

In [13]:

def preprocess(text):
    """Convert a raw document string into a list of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, and tokens that are not purely alphabetic, are
    dropped; the lemma of every remaining token is kept in document order.

    Args:
        text: The raw document string.

    Returns:
        A list with the lemma of each retained token.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Run every raw document through the preprocessing step.
processed_documents = list(map(preprocess, documents))

# Build the token <-> id dictionary from the processed documents, then encode
# each document in bag-of-words form; `corpus` is what the LDA model is trained on.
dictionary = corpora.Dictionary(processed_documents)
corpus = list(map(dictionary.doc2bow, processed_documents))

# Build the LDA model.
# - num_topics is the parameter the exercise below asks you to vary.
# - passes is the number of training passes over the corpus.
# - random_state pins the model's random number generator so that
#   Restart Kernel -> Run All reproduces the same topics; without it every
#   run can produce a different topic assignment.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print topics and their keywords

In [14]:
# Fetch the per-topic keyword summaries from the model, then pretty-print them.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.078*"pick" + 0.078*"peck" + 0.078*"Peter" + 0.078*"pickle" + '
  '0.078*"pepper" + 0.078*"piper" + 0.020*"topic" + 0.020*"modeling" + '
  '0.020*"Genshin" + 0.020*"good"'),
 (1,
  '0.087*"topic" + 0.049*"model" + 0.049*"generative" + 0.049*"probabilistic" '
  '+ 0.049*"Latent" + 0.049*"Allocation" + 0.049*"Dirichlet" + '
  '0.049*"identify" + 0.049*"text" + 0.049*"present"'),
 (2,
  '0.053*"modeling" + 0.053*"similarity" + 0.053*"library" + 0.053*"Python" + '
  '0.053*"popular" + 0.053*"document" + 0.053*"Gensim" + 0.053*"processing" + '
  '0.053*"subfield" + 0.053*"intelligence"')]


# Assign topics to documents

In [15]:
# Print the topic distribution the model assigns to each document
# (documents are numbered from 1).
for doc_number, bow in enumerate(corpus, start=1):
    topic_mixture = lda_model.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mixture}")

Document 1 - Topic: [(0, 0.04811856), (1, 0.04792251), (2, 0.903959)]
Document 2 - Topic: [(0, 0.048154198), (1, 0.90387106), (2, 0.047974713)]
Document 3 - Topic: [(0, 0.042063866), (1, 0.9140752), (2, 0.043860964)]
Document 4 - Topic: [(0, 0.037418883), (1, 0.039438017), (2, 0.92314315)]
Document 5 - Topic: [(0, 0.08430949), (1, 0.8317136), (2, 0.083976954)]
Document 6 - Topic: [(0, 0.9043399), (1, 0.047821626), (2, 0.047838427)]


#                   

# Mini Exercise hehe

Instructions:

Use the provided Python code to perform topic modeling on a set of sample documents.
Modify the sample documents or add your own to see how the results change.
Experiment with the number of topics (parameter: num_topics) in the LDA model. Observe how different numbers of topics impact the results.

Write a short insight on what you observed when you changed, increased, or decreased some parameters. (A short essay is enough.)

num_topics = 3: ![image.png](attachment:image.png) ![image-6.png](attachment:image-6.png)

num_topics = 4: ![image-2.png](attachment:image-2.png) ![image-5.png](attachment:image-5.png)

num_topics = 5: ![image-3.png](attachment:image-3.png) ![image-4.png](attachment:image-4.png)

Insight:

Experimenting with the number of topics in Latent Dirichlet Allocation (LDA) models reveals a delicate balance: increasing topics adds granularity but risks overfitting, while decreasing them offers generality but may oversimplify.