<a href="https://colab.research.google.com/github/koushik-ace/NLP/blob/main/Assignment_6_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LDA with sample data


In [None]:
import pandas as pd

# Read the sample spreadsheet into a DataFrame and preview its first rows.
df = pd.read_excel(io="/content/LDA-Data.xlsx")
df.head(n=5)


Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took 5 wicket in a match
3,Congress form state government


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch (quietly) the NLTK resources used below: the stopword list, WordNet and
# its 'omw-1.4' data for the lemmatizer, and 'punkt_tab' for word_tokenize.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Module-level helpers read by preprocess_text(): the English stopwords as a
# set, and a single shared lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw document into a space-separated string of cleaned tokens.

    Non-string input yields ''. Otherwise the text is lower-cased, every
    character outside a-z is replaced by a space, and the result is split with
    nltk.word_tokenize. Tokens that are stopwords or a single character long
    are discarded; the rest are lemmatized and joined with single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lower-casing first means the a-z character class also keeps former capitals.
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())

    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Swap missing headlines for empty strings before cleaning.
df['News'] = df['News'].fillna(value='')

# Store the cleaned version of every headline in a new column.
df['Processed_News'] = df['News'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows.
print("Original News and Processed News:", df[['News', 'Processed_News']].head(), sep="\n")

Original News and Processed News:
                             News                  Processed_News
0   Virat scored century in match      virat scored century match
1            BJP won in elections                    bjp election
2  Bumra took 5 wicket in a match         bumra took wicket match
3  Congress form state government  congress form state government


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned headlines and count each term per document.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['Processed_News'])
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the counts with one labelled column per vocabulary term, for display only.
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=feature_names)

print("Vocabulary (Feature Names):", feature_names, sep="\n")
print("\nShape of BoW matrix:", bow_matrix.shape)
print("\nFirst 5 rows of the BoW matrix:", bow_df.head(), sep="\n")

Vocabulary (Feature Names):
['bjp' 'bumra' 'century' 'congress' 'election' 'form' 'government' 'match'
 'scored' 'state' 'took' 'virat' 'wicket']

Shape of BoW matrix: (4, 13)

First 5 rows of the BoW matrix:
   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [None]:
# Print the leading rows of the bag-of-words count table once more.
print(bow_df.head(n=5))

   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract; change this to suit the corpus.
num_topics = 2

# Build the LDA model with a fixed seed, then fit it on the count matrix.
# fit_transform returns one row of topic weights per document.
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
)
lda_output = lda_model.fit_transform(bow_matrix)

# Display the topics and their top words
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# How many top-weighted terms to list for each topic.
no_top_words = 5

print("\nLDA Topics:")
display_topics(model=lda_model, feature_names=feature_names, no_top_words=no_top_words)

# Tag each headline with the index of its largest topic weight.
df['Dominant_Topic'] = lda_output.argmax(axis=1)

print("\nDataFrame with Dominant Topic:", df[['News', 'Dominant_Topic']].head(), sep="\n")


LDA Topics:
Topic 0:
form government congress state election
Topic 1:
match virat century scored took

DataFrame with Dominant Topic:
                             News  Dominant_Topic
0   Virat scored century in match               1
1            BJP won in elections               0
2  Bumra took 5 wicket in a match               1
3  Congress form state government               0


In [None]:
# Show the topics and per-headline assignments again.
# Relies on display_topics, lda_model, feature_names and no_top_words from the cell above.
print("LDA Topics:")
display_topics(model=lda_model, feature_names=feature_names, no_top_words=no_top_words)

print("\nDataFrame with News and their Dominant Topic:", df[['News', 'Dominant_Topic']].head(), sep="\n")

LDA Topics:
Topic 0:
form government congress state election
Topic 1:
match virat century scored took

DataFrame with News and their Dominant Topic:
                             News  Dominant_Topic
0   Virat scored century in match               1
1            BJP won in elections               0
2  Bumra took 5 wicket in a match               1
3  Congress form state government               0


# LDA with Kaggle data

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Quietly fetch the NLTK resources needed by the tokenizer, stopword filter and lemmatizer.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Shared by preprocess_text(): English stopwords as a set, and one lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one document and return its kept tokens joined by spaces.

    Returns '' for non-string input. Strings are lower-cased, characters
    outside a-z become spaces, and the text is split with nltk.word_tokenize.
    Stopwords and one-character tokens are dropped; the rest are lemmatized.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# LOAD: take the first 1000 rows of arxiv_data.csv, skipping lines that fail to parse.
df = pd.read_csv(
    "/content/arxiv_data.csv",
    engine='python',
    on_bad_lines='skip',
    nrows=1000,
)

# PREPROCESS: blank out missing titles/summaries, merge them into one field, then clean it.
for column in ('titles', 'summaries'):
    df[column] = df[column].fillna('')
df['text_content'] = df['titles'] + ' ' + df['summaries']
df['Processed_Text'] = df['text_content'].apply(preprocess_text)

# BOW: term counts per document, plus the vocabulary that labels the columns.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['Processed_Text'])
feature_names = vectorizer.get_feature_names_out()

# OUTPUT: fit a seeded LDA model and keep the per-document topic weights.
num_topics = 2
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
)
lda_output = lda_model.fit_transform(bow_matrix)

def display_topics(model, feature_names, no_top_words):
    """Print a 'Topic <i>:' line and then that topic's top terms, for every topic.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed gives column positions from heaviest to lightest.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# List the five heaviest terms of each LDA topic.
print("LDA Topics:")
display_topics(model=lda_model, feature_names=feature_names, no_top_words=5)

# Tag every document with the index of its largest topic weight, then print the pairing.
df['Dominant_Topic'] = lda_output.argmax(axis=1)
print("\nDocument and Topic:", df[['text_content', 'Dominant_Topic']], sep="\n")

LDA Topics:
Topic 0:
segmentation image network method model
Topic 1:
image segmentation domain method learning

Document and Topic:
                                          text_content  Dominant_Topic
0    Survey on Semantic Stereo Matching / Semantic ...               0
1    FUTURE-AI: Guiding Principles and Consensus Re...               1
2    Enforcing Mutual Consistency of Hard Regions f...               0
3    Parameter Decoupling Strategy for Semi-supervi...               0
4    Background-Foreground Segmentation for Interio...               0
..                                                 ...             ...
995  DeepIGeoS: A Deep Interactive Geodesic Framewo...               0
996  3D Densely Convolutional Networks for Volumetr...               0
997  UI-Net: Interactive Artificial Neural Networks...               0
998  One-Shot Learning for Semantic Segmentation Lo...               0
999  Exploring and Exploiting Diversity for Image S...               0

[1000 rows x 2 columns]

# NMF with Kaggle data

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Quietly fetch the NLTK resources needed by the tokenizer, stopword filter and lemmatizer.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Shared by preprocess_text(): English stopwords as a set, and one lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one document and return its kept tokens joined by spaces.

    Returns '' for non-string input. Strings are lower-cased, characters
    outside a-z become spaces, and the text is split with nltk.word_tokenize.
    Stopwords and one-character tokens are dropped; the rest are lemmatized.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# LOAD: take the first 1000 rows of arxiv_data.csv, skipping lines that fail to parse.
df = pd.read_csv(
    "/content/arxiv_data.csv",
    engine='python',
    on_bad_lines='skip',
    nrows=1000,
)

# PREPROCESS: blank out missing titles/summaries, merge them into one field, then clean it.
for column in ('titles', 'summaries'):
    df[column] = df[column].fillna('')
df['text_content'] = df['titles'] + ' ' + df['summaries']
df['Processed_Text'] = df['text_content'].apply(preprocess_text)

# BOW: term counts per document, plus the vocabulary that labels the columns.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['Processed_Text'])
feature_names = vectorizer.get_feature_names_out()

# OUTPUT: factorize the count matrix with a seeded NMF model; fit_transform
# returns one row of topic weights per document.
num_topics = 2
nmf_model = NMF(
    random_state=42,
    n_components=num_topics,
)
nmf_output = nmf_model.fit_transform(bow_matrix)

def display_topics(model, feature_names, no_top_words):
    """Print a 'Topic <i>:' line and then that topic's top terms, for every topic.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed gives column positions from heaviest to lightest.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# List the five heaviest terms of each NMF topic.
print("NMF Topics:")
display_topics(model=nmf_model, feature_names=feature_names, no_top_words=5)

# Tag every document with the index of its largest topic weight, then print the pairing.
df['Dominant_Topic'] = nmf_output.argmax(axis=1)
print("\nDocument and Topic:", df[['text_content', 'Dominant_Topic']], sep="\n")

NMF Topics:
Topic 0:
segmentation network learning model method
Topic 1:
image segmentation method based using

Document and Topic:
                                          text_content  Dominant_Topic
0    Survey on Semantic Stereo Matching / Semantic ...               0
1    FUTURE-AI: Guiding Principles and Consensus Re...               1
2    Enforcing Mutual Consistency of Hard Regions f...               0
3    Parameter Decoupling Strategy for Semi-supervi...               0
4    Background-Foreground Segmentation for Interio...               0
..                                                 ...             ...
995  DeepIGeoS: A Deep Interactive Geodesic Framewo...               0
996  3D Densely Convolutional Networks for Volumetr...               0
997  UI-Net: Interactive Artificial Neural Networks...               0
998  One-Shot Learning for Semantic Segmentation Lo...               1
999  Exploring and Exploiting Diversity for Image S...               0

[1000 rows x 2 columns]

# NMF with sample data


In [None]:
import pandas as pd

# Read the sample spreadsheet into a DataFrame and preview its first rows.
df = pd.read_excel(io="/content/LDA-Data.xlsx")
df.head(n=5)


Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took 5 wicket in a match
3,Congress form state government


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch (quietly) the NLTK resources used below: the stopword list, WordNet and
# its 'omw-1.4' data for the lemmatizer, and 'punkt_tab' for word_tokenize.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Module-level helpers read by preprocess_text(): the English stopwords as a
# set, and a single shared lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw document into a space-separated string of cleaned tokens.

    Non-string input yields ''. Otherwise the text is lower-cased, every
    character outside a-z is replaced by a space, and the result is split with
    nltk.word_tokenize. Tokens that are stopwords or a single character long
    are discarded; the rest are lemmatized and joined with single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lower-casing first means the a-z character class also keeps former capitals.
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())

    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Swap missing headlines for empty strings before cleaning.
df['News'] = df['News'].fillna(value='')

# Store the cleaned version of every headline in a new column.
df['Processed_News'] = df['News'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows.
print("Original News and Processed News:", df[['News', 'Processed_News']].head(), sep="\n")

Original News and Processed News:
                             News                  Processed_News
0   Virat scored century in match      virat scored century match
1            BJP won in elections                    bjp election
2  Bumra took 5 wicket in a match         bumra took wicket match
3  Congress form state government  congress form state government


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned headlines and count each term per document.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['Processed_News'])
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the counts with one labelled column per vocabulary term, for display only.
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=feature_names)

print("Vocabulary (Feature Names):", feature_names, sep="\n")
print("\nShape of BoW matrix:", bow_matrix.shape)
print("\nFirst 5 rows of the BoW matrix:", bow_df.head(), sep="\n")

Vocabulary (Feature Names):
['bjp' 'bumra' 'century' 'congress' 'election' 'form' 'government' 'match'
 'scored' 'state' 'took' 'virat' 'wicket']

Shape of BoW matrix: (4, 13)

First 5 rows of the BoW matrix:
   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [None]:
from sklearn.decomposition import NMF

# Number of topics to extract; change this to suit the corpus.
num_topics = 2

# Build a seeded NMF model with 'nndsvda' initialization and a 5e-3 tolerance,
# then factorize the count matrix. fit_transform returns one row of topic
# weights per document.
nmf_model = NMF(
    init='nndsvda',
    tol=5e-3,
    random_state=42,
    n_components=num_topics,
)
nmf_output = nmf_model.fit_transform(bow_matrix)

# Display the topics and their top words
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# How many top-weighted terms to list for each topic.
no_top_words = 5

print("\nNMF Topics:")
display_topics(model=nmf_model, feature_names=feature_names, no_top_words=no_top_words)

# Tag each headline with the index of its largest topic weight.
df['Dominant_Topic'] = nmf_output.argmax(axis=1)

print("\nDataFrame with Dominant Topic:", df[['News', 'Dominant_Topic']].head(), sep="\n")


NMF Topics:
Topic 0:
match scored virat century wicket
Topic 1:
state form congress government election

DataFrame with Dominant Topic:
                             News  Dominant_Topic
0   Virat scored century in match               0
1            BJP won in elections               1
2  Bumra took 5 wicket in a match               0
3  Congress form state government               1


# NMF with sample data (TF-IDF)


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Quietly fetch the NLTK resources needed by the tokenizer, stopword filter and lemmatizer.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Shared by preprocess_text(): English stopwords as a set, and one lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one document and return its kept tokens joined by spaces.

    Returns '' for non-string input. Strings are lower-cased, characters
    outside a-z become spaces, and the text is split with nltk.word_tokenize.
    Stopwords and one-character tokens are dropped; the rest are lemmatized.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# LOAD: the four sample headlines are defined inline rather than read from a file.
data = {
    'News': [
        'Virat scored century in match',
        'BJP won in elections',
        'Bumra took 5 wicket in a match',
        'Congress form state government',
    ]
}
df = pd.DataFrame(data)

# PREPROCESS: fill missing values with '' and store the cleaned text in a new column.
df['News'] = df['News'].fillna(value='')
df['Processed_News'] = df['News'].apply(preprocess_text)

# TFIDF: TF-IDF weighted document-term matrix and the vocabulary labelling its columns.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Processed_News'])
feature_names = vectorizer.get_feature_names_out()

# OUTPUT: factorize the TF-IDF matrix with a seeded NMF model; fit_transform
# returns one row of topic weights per document.
num_topics = 2
nmf_model = NMF(
    random_state=42,
    n_components=num_topics,
)
nmf_output = nmf_model.fit_transform(tfidf_matrix)

def display_topics(model, feature_names, no_top_words):
    """Print a 'Topic <i>:' line and then that topic's top terms, for every topic.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed gives column positions from heaviest to lightest.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# List the five heaviest terms of each NMF topic.
print("NMF Topics:")
display_topics(model=nmf_model, feature_names=feature_names, no_top_words=5)

# Tag every headline with the index of its largest topic weight, then print the pairing.
df['Dominant_Topic'] = nmf_output.argmax(axis=1)
print("\nDocument and Topic:", df[['News', 'Dominant_Topic']], sep="\n")

NMF Topics:
Topic 0:
match bumra wicket took virat
Topic 1:
election bjp form government state

Document and Topic:
                             News  Dominant_Topic
0   Virat scored century in match               0
1            BJP won in elections               1
2  Bumra took 5 wicket in a match               0
3  Congress form state government               1


# NMF with Kaggle data (TF-IDF)

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Quietly fetch the NLTK resources needed by the tokenizer, stopword filter and lemmatizer.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Shared by preprocess_text(): English stopwords as a set, and one lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one document and return its kept tokens joined by spaces.

    Returns '' for non-string input. Strings are lower-cased, characters
    outside a-z become spaces, and the text is split with nltk.word_tokenize.
    Stopwords and one-character tokens are dropped; the rest are lemmatized.
    """
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# LOAD: take the first 1000 rows of arxiv_data.csv, skipping lines that fail to parse.
df = pd.read_csv(
    "/content/arxiv_data.csv",
    engine='python',
    on_bad_lines='skip',
    nrows=1000,
)

# PREPROCESS: blank out missing titles/summaries, merge them into one field, then clean it.
for column in ('titles', 'summaries'):
    df[column] = df[column].fillna('')
df['text_content'] = df['titles'] + ' ' + df['summaries']
df['Processed_Text'] = df['text_content'].apply(preprocess_text)

# TFIDF: TF-IDF weighted document-term matrix and the vocabulary labelling its columns.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Processed_Text'])
feature_names = vectorizer.get_feature_names_out()

# OUTPUT: factorize the TF-IDF matrix with a seeded NMF model; fit_transform
# returns one row of topic weights per document.
num_topics = 2
nmf_model = NMF(
    random_state=42,
    n_components=num_topics,
)
nmf_output = nmf_model.fit_transform(tfidf_matrix)

def display_topics(model, feature_names, no_top_words):
    """Print a 'Topic <i>:' line and then that topic's top terms, for every topic.

    model: object exposing ``components_``, iterated as one weight array per topic.
    feature_names: indexable collection mapping a column position to its term.
    no_top_words: number of terms to list per topic, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed gives column positions from heaviest to lightest.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# List the five heaviest terms of each NMF topic.
print("NMF Topics:")
display_topics(model=nmf_model, feature_names=feature_names, no_top_words=5)

# Tag every document with the index of its largest topic weight, then print the pairing.
df['Dominant_Topic'] = nmf_output.argmax(axis=1)
print("\nDocument and Topic:", df[['text_content', 'Dominant_Topic']], sep="\n")

NMF Topics:
Topic 0:
segmentation image network method model
Topic 1:
domain supervised data learning annotation

Document and Topic:
                                          text_content  Dominant_Topic
0    Survey on Semantic Stereo Matching / Semantic ...               0
1    FUTURE-AI: Guiding Principles and Consensus Re...               0
2    Enforcing Mutual Consistency of Hard Regions f...               1
3    Parameter Decoupling Strategy for Semi-supervi...               1
4    Background-Foreground Segmentation for Interio...               0
..                                                 ...             ...
995  DeepIGeoS: A Deep Interactive Geodesic Framewo...               0
996  3D Densely Convolutional Networks for Volumetr...               0
997  UI-Net: Interactive Artificial Neural Networks...               0
998  One-Shot Learning for Semantic Segmentation Lo...               0
999  Exploring and Exploiting Diversity for Image S...               0

[1000 rows x 