<a href="https://colab.research.google.com/github/heena2248/Text_Mining/blob/main/textmining6_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Topic modeling discovers hidden semantic patterns in a text corpus and uses them to identify the topics the corpus contains. The techniques demonstrated are:
 - LSA
 - LDA
 - NMF
 - PCA
 - ICA


### LSA and LDA

In [None]:
# Five short documents: three about water (1, 3, 5) and two about sleep (2, 4).
# Long literals are wrapped with implicit string concatenation for readability.
doc_1 = (
    "A whopping 96.5 percent of water on Earth is in our oceans, "
    "covering 71 percent of the surface of our planet. "
    "And at any given time, about 0.001 percent is floating above us in the atmosphere. "
    "If all of that water fell as rain at once, "
    "the whole planet would get about 1 inch of rain."
)

doc_2 = (
    "One-third of your life is spent sleeping. "
    "Sleeping 7-9 hours each night should help your body heal itself, "
    "activate the immune system, and give your heart a break. "
    "Beyond that--sleep experts are still trying to learn more "
    "about what happens once we fall asleep."
)

doc_3 = (
    "A newborn baby is 78 percent water. "
    "Adults are 55-60 percent water. "
    "Water is involved in just about everything our body does."
)

doc_4 = (
    "While still in high school, a student went 264.4 hours without sleep, "
    "for which he won first place in the 10th Annual Great San Diego Science Fair in 1964."
)

doc_5 = "We experience water in all three states: solid ice, liquid water, and gas water vapor."

# Collect the documents, in order, into the corpus used by every model below
corpus = [doc_1, doc_2, doc_3, doc_4, doc_5]

In [None]:
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Shared text-normalisation resources used by clean():
# English stop words, ASCII punctuation characters, and a WordNet lemmatizer.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalise one document into a space-separated string of lemmas.

    The document is lower-cased and split on whitespace. Each token is
    dropped if it is a stop word, stripped of punctuation characters, and
    dropped again if the stripped form is empty or a stop word. Surviving
    tokens are lemmatized with WordNet.

    The second stop-word check matters: a token such as "once," or "does."
    is not in the stop list while the punctuation is still attached, so
    filtering only before stripping lets those stop words leak into the
    corpus.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    kept = []
    for raw in doc.lower().split():
        # Catches stop words that contain punctuation themselves (e.g. "don't")
        if raw in stop:
            continue
        word = "".join(ch for ch in raw if ch not in exclude)
        # Catches stop words that only match once punctuation is gone ("once," -> "once")
        if not word or word in stop:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)

# One list of cleaned tokens per document
clean_corpus = [clean(doc).split() for doc in corpus]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
from gensim import corpora

# Build the token -> integer id vocabulary from the cleaned corpus
dictionary = corpora.Dictionary(clean_corpus)

# Encode every document as a bag-of-words list of (token_id, count) pairs
doc_term_matrix = list(map(dictionary.doc2bow, clean_corpus))

In [None]:
from gensim.models import LsiModel

# Fit a 3-topic LSA (latent semantic analysis) model on the bag-of-words corpus
lsa = LsiModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=3)

# Show the three strongest terms of each topic
lsa_topics = lsa.print_topics(num_topics=3, num_words=3)
print(lsa_topics)

[(0, '0.555*"water" + 0.489*"percent" + 0.239*"rain"'), (1, '-0.361*"sleeping" + -0.215*"still" + -0.215*"hour"'), (2, '-0.562*"water" + 0.231*"rain" + 0.231*"planet"')]


In [None]:
# Project each document onto the LSA topics.
# Note: LSA weights are signed projections, not probabilities, so a large
# negative value indicates a strong association just like a large positive one.
doc_topic_distributions = [lsa[doc] for doc in doc_term_matrix]

# Display the topic weights for each document
for i, topic_dist in enumerate(doc_topic_distributions):
    print(f"\nDocument {i}:")
    for topic, contribution in topic_dist:
        print(f"  Topic {topic}: {contribution:.4f}")

# Find the dominant topic for each document
for i, topic_dist in enumerate(doc_topic_distributions):
    if not topic_dist:
        # max() would raise on an empty list (e.g. a document with no known tokens)
        print(f"Document {i} has no topic weights")
        continue
    # Rank by magnitude: a signed max would pick a weak positive weight
    # over a much stronger negative one.
    dominant_topic = max(topic_dist, key=lambda x: abs(x[1]))
    print(f"Document {i} is most related to Topic {dominant_topic[0]} with a contribution of {dominant_topic[1]:.4f}")


Document 0:
  Topic 0: 5.9267
  Topic 1: 0.3096
  Topic 2: 2.3749

Document 1:
  Topic 0: 0.1583
  Topic 1: -5.3127
  Topic 2: 0.2414

Document 2:
  Topic 0: 3.2349
  Topic 1: -0.2740
  Topic 2: -2.6690

Document 3:
  Topic 0: 0.0104
  Topic 1: -1.0183
  Topic 2: 0.3135

Document 4:
  Topic 0: 1.9863
  Topic 1: -0.0489
  Topic 2: -2.7604
Document 0 is most related to Topic 0 with a contribution of 5.9267
Document 1 is most related to Topic 2 with a contribution of 0.2414
Document 2 is most related to Topic 0 with a contribution of 3.2349
Document 3 is most related to Topic 2 with a contribution of 0.3135
Document 4 is most related to Topic 0 with a contribution of 1.9863


In [None]:
from gensim.models import LdaModel

# LDA model
# LDA training is stochastic; fixing random_state makes the topics
# reproducible across Restart & Run All (42 matches the NMF/ICA cells).
lda = LdaModel(doc_term_matrix, num_topics=3, id2word=dictionary, random_state=42)

# Results: the three most probable terms of each topic
print(lda.print_topics(num_topics=3, num_words=3))



[(0, '0.030*"sleeping" + 0.025*"hour" + 0.022*"still"'), (1, '0.038*"percent" + 0.032*"rain" + 0.030*"planet"'), (2, '0.087*"water" + 0.046*"percent" + 0.019*"body"')]


In [None]:
# Get the topic distribution for each document.
# minimum_probability=0.0 keeps every topic in the result; with the default
# threshold, low-probability topics are dropped and the printed distribution
# would be incomplete.
doc_topic_distributions = [
    lda.get_document_topics(doc, minimum_probability=0.0) for doc in doc_term_matrix
]

# Display the topic distribution for each document
for i, topic_dist in enumerate(doc_topic_distributions):
    print(f"\nDocument {i}:")
    for topic, contribution in topic_dist:
        print(f"  Topic {topic}: {contribution:.4f}")

# Find the dominant topic for each document
for i, topic_dist in enumerate(doc_topic_distributions):
    if not topic_dist:
        # Defensive: max() raises on an empty sequence
        print(f"Document {i} has no topic assignments")
        continue
    dominant_topic = max(topic_dist, key=lambda x: x[1])  # highest probability
    print(f"Document {i} is most related to Topic {dominant_topic[0]} with a contribution of {dominant_topic[1]:.4f}")


Document 0:
  Topic 0: 0.0112
  Topic 1: 0.9756
  Topic 2: 0.0132

Document 1:
  Topic 0: 0.9750
  Topic 1: 0.0125
  Topic 2: 0.0125

Document 2:
  Topic 0: 0.0225
  Topic 1: 0.0230
  Topic 2: 0.9545

Document 3:
  Topic 0: 0.9629
  Topic 1: 0.0170
  Topic 2: 0.0201

Document 4:
  Topic 0: 0.0280
  Topic 1: 0.0283
  Topic 2: 0.9436
Document 0 is most related to Topic 1 with a contribution of 0.9756
Document 1 is most related to Topic 0 with a contribution of 0.9750
Document 2 is most related to Topic 2 with a contribution of 0.9545
Document 3 is most related to Topic 0 with a contribution of 0.9629
Document 4 is most related to Topic 2 with a contribution of 0.9436


### NMF

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
# Step 1: Vectorize the raw corpus.
# The vectorizer tokenizes the text itself and drops English stop words,
# so it is fitted on `corpus` directly.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf = tfidf_vectorizer.fit_transform(raw_documents=corpus)

In [None]:
# Step 2: Factorize the TF-IDF matrix with NMF
num_topics = 2  # number of topics to extract

# fit() returns the estimator itself, so construction and fitting can be chained
nmf_model = NMF(n_components=num_topics, random_state=42).fit(tfidf)

# Step 3: Display the topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every component of a fitted model.

    Args:
        model: Fitted decomposition exposing ``components_``, one row of
            term weights per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = weights.argsort()[::-1][:no_top_words]
        print(f"Topic {topic_number}:")
        print(" ".join(feature_names[idx] for idx in ranked))

# Print the five strongest terms of each NMF topic
no_top_words = 5
display_topics(
    model=nmf_model,
    feature_names=tfidf_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)

Topic 1:
water percent rain planet experience
Topic 2:
sleeping sleep hours 10th went


### PCA and ICA

In [None]:
from sklearn.decomposition import PCA, FastICA

In [None]:
# Step 2: Apply PCA
# The sparse TF-IDF matrix from the NMF section is densified before fitting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(tfidf.toarray())

# The vocabulary does not change between components, so look it up once
# instead of calling get_feature_names_out() on every loop iteration.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display PCA components
print("PCA Components:")
for i, component in enumerate(pca.components_):
    print(f"Component {i + 1}:")
    # argsort is ascending: take the last five indices and reverse them
    # to list the largest positive loadings first.
    top_indices = component.argsort()[-5:][::-1]  # Top 5 words
    top_words = [feature_names[index] for index in top_indices]
    print("Top words:", top_words)

PCA Components:
Component 1:
Top words: ['water', 'percent', 'planet', 'rain', 'ice']
Component 2:
Top words: ['sleeping', 'help', 'spent', 'night', 'immune']


In [None]:
# Step 3: Apply ICA
# random_state pins the FastICA initialisation so the run is reproducible.
ica = FastICA(n_components=2, random_state=42)
ica_result = ica.fit_transform(tfidf.toarray())

# The vocabulary does not change between components, so look it up once
# instead of calling get_feature_names_out() on every loop iteration.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display ICA components
print("\nICA Components:")
for i, component in enumerate(ica.components_):
    print(f"Component {i + 1}:")
    # argsort is ascending: take the last five indices and reverse them
    # to list the largest positive weights first.
    top_indices = component.argsort()[-5:][::-1]  # Top 5 words
    top_words = [feature_names[index] for index in top_indices]
    print("Top words:", top_words)


ICA Components:
Component 1:
Top words: ['water', 'percent', 'planet', 'rain', 'ice']
Component 2:
Top words: ['sleeping', 'help', 'learn', 'life', 'immune']


### POS Tagging

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Download the necessary NLTK resources.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the legacy pickled 'punkt' / 'averaged_perceptron_tagger', so both
# generations are fetched to keep the cell runnable on either.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sample sentence
sentence = "The quick lazy fox jumps over the lazy dog."

# Tokenize the sentence into words
words = word_tokenize(sentence)

# Perform POS tagging: a list of (token, Penn Treebank tag) pairs
pos_tags = pos_tag(words)

# Display the POS tags
print(pos_tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('The', 'DT'), ('quick', 'JJ'), ('lazy', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]
