# 7.1 Latent Semantic Analysis (LSA)

**Install the required Python packages and libraries.**

In [None]:
!pip install scikit-learn
!pip install gensim

7.1.3 Implementing LSA in Python

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Toy corpus: four short sentences that share a small vocabulary
corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The cat chased the dog.",
    "The dog chased the cat."
]

# Build the TF-IDF document-term matrix for the corpus
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# LSA step: truncated SVD of the TF-IDF matrix, keeping two components
lsa = TruncatedSVD(n_components=2, random_state=42)
X_reduced = lsa.fit_transform(X)

# For every component, list the five terms with the largest weights
terms = vectorizer.get_feature_names_out()
for topic_idx, component in enumerate(lsa.components_):
    top_terms = sorted(
        zip(terms, component),
        key=lambda pair: pair[1],
        reverse=True,
    )[:5]
    print(f"Topic {topic_idx}:")
    for term, weight in top_terms:
        print(f" - {term}: {weight:.4f}")

# 7.2 Latent Dirichlet Allocation (LDA)

7.2.3 Implementing LDA in Python

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from pprint import pprint

# Sample text corpus
corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The cat chased the dog.",
    "The dog chased the cat."
]

# Tokenize the text and remove stop words.
# simple_preprocess lowercases and drops punctuation, so "cat" and "cat." map to the
# same token (a plain str.split() keeps them as two separate vocabulary entries);
# STOPWORDS then filters out function words such as "the" and "on".
texts = [
    [token for token in simple_preprocess(document) if token not in STOPWORDS]
    for document in corpus
]

# Create a dictionary mapping each unique token to an integer id
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document to its bag-of-words representation
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model (random_state is fixed so the topics are reproducible)
lda_model = LdaModel(corpus=corpus_bow, id2word=dictionary, num_topics=2, random_state=42, passes=10)

# Print the topics
print("Topics:")
pprint(lda_model.print_topics(num_words=5))

# Assign topics to a new document.
# The new document must be preprocessed exactly like the training texts,
# otherwise its tokens will not match the dictionary entries.
new_doc = "The cat chased the dog."
new_doc_tokens = [token for token in simple_preprocess(new_doc) if token not in STOPWORDS]
new_doc_bow = dictionary.doc2bow(new_doc_tokens)
print("\nTopic Distribution for the new document:")
pprint(lda_model.get_document_topics(new_doc_bow))

7.2.4 Interpreting LDA Results

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the LDA topics with the c_v coherence measure.
# Uses lda_model, texts and dictionary produced by the LDA training cell.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

# 7.3 Hierarchical Dirichlet Process (HDP)

7.3.3 Implementing HDP in Python

In [None]:
import gensim
from gensim import corpora
from gensim.models import HdpModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from pprint import pprint

# Sample text corpus
corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The cat chased the dog.",
    "The dog chased the cat."
]

# Tokenize the text and remove stop words.
# simple_preprocess lowercases and drops punctuation, so "cat" and "cat." map to the
# same token (a plain str.split() keeps them as two separate vocabulary entries);
# STOPWORDS then filters out function words such as "the" and "on".
texts = [
    [token for token in simple_preprocess(document) if token not in STOPWORDS]
    for document in corpus
]

# Create a dictionary mapping each unique token to an integer id
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document to its bag-of-words representation
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Train the HDP model.
# random_state is fixed so the inferred topics are reproducible across runs,
# matching the seed used for the LSA and LDA examples.
hdp_model = HdpModel(corpus=corpus_bow, id2word=dictionary, random_state=42)

# Print the topics
print("Topics:")
pprint(hdp_model.print_topics(num_topics=2, num_words=5))

# Assign topics to a new document.
# The new document must be preprocessed exactly like the training texts,
# otherwise its tokens will not match the dictionary entries.
new_doc = "The cat chased the dog."
new_doc_tokens = [token for token in simple_preprocess(new_doc) if token not in STOPWORDS]
new_doc_bow = dictionary.doc2bow(new_doc_tokens)
print("\nTopic Distribution for the new document:")
pprint(hdp_model[new_doc_bow])

7.3.4 Interpreting HDP Results

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the HDP topics with the c_v coherence measure.
# Uses hdp_model, texts and dictionary produced by the HDP training cell.
coherence_model_hdp = CoherenceModel(
    model=hdp_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_hdp = coherence_model_hdp.get_coherence()
print(f"Coherence Score: {coherence_hdp}")