<a href="https://colab.research.google.com/github/koushik-ace/NLP/blob/main/Assignment_6_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LDA with sample data


In [1]:
import pandas as pd

# Load the sample news headlines from the Excel workbook in the Colab session's /content directory
df = pd.read_excel("/content/LDA-Data.xlsx")
# Bare last expression: the notebook renders the first rows as the cell output
df.head()


Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took 5 wicket in a match
3,Congress form state government


In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by this notebook's text preprocessing, without
# progress output. 'punkt_tab' (rather than the older 'punkt') is the tokenizer
# resource requested here.
for _resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

# Module-level helpers that preprocess_text reads as globals
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps: lowercase, replace every character outside a-z with a space,
    tokenize with NLTK, drop stopwords and one-character tokens, lemmatize
    what remains, and join the result with single spaces.

    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects.

    Args:
        text: The raw text. Any non-``str`` value yields ``''``.

    Returns:
        The cleaned text as one string; ``''`` when no token survives.
    """
    # Anything that is not a string (e.g. NaN, None, numbers) maps to empty text
    if not isinstance(text, str):
        return ''

    # Lowercase first so the a-z character class also keeps former capitals
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())

    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        # The stopword test is applied to the raw token, before lemmatization
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(kept)

# Missing headlines become empty strings so every value in the column is a str
df['News'] = df['News'].fillna('')

# Clean every headline; the result is stored alongside the original text
df['Processed_News'] = df['News'].apply(preprocess_text)

# Side-by-side preview of raw vs. cleaned headlines
print("Original News and Processed News:")
print(df[['News', 'Processed_News']].head())

Original News and Processed News:
                             News                  Processed_News
0   Virat scored century in match      virat scored century match
1            BJP won in elections                    bjp election
2  Bumra took 5 wicket in a match         bumra took wicket match
3  Congress form state government  congress form state government


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned headlines and count each term per document
bow_matrix = vectorizer.fit_transform(df['Processed_News'])

# Vocabulary terms; used below as the column labels of the dense preview
feature_names = vectorizer.get_feature_names_out()

print("Vocabulary (Feature Names):")
print(feature_names)
print("\nShape of BoW matrix:", bow_matrix.shape)

# Dense copy labelled with the vocabulary, purely for readable display.
# Densifying everything is acceptable here because the sample corpus is tiny.
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)
print("\nFirst 5 rows of the BoW matrix:")
print(bow_df.head())

Vocabulary (Feature Names):
['bjp' 'bumra' 'century' 'congress' 'election' 'form' 'government' 'match'
 'scored' 'state' 'took' 'virat' 'wicket']

Shape of BoW matrix: (4, 13)

First 5 rows of the BoW matrix:
   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [4]:
# Re-print the document-term preview; relies on bow_df from the previous cell still being in the kernel
print(bow_df.head())

   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [5]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract (tunable)
num_topics = 2

# LDA model; random_state is fixed so repeated runs give the same result
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit on the bag-of-words counts. lda_output is indexed by document along
# axis 0; the argmax over axis 1 taken later picks each document's topic.
lda_output = lda_model.fit_transform(bow_matrix)

# Display the topics and their top words
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_``; each entry is
            treated as one topic's per-word weight vector.
        feature_names: Vocabulary, indexed by the positions found in each
            weight vector.
        no_top_words: How many top words to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Positions ordered from largest to smallest weight, cut to the top N
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Number of highest-weighted words to list per topic
no_top_words = 5
print("\nLDA Topics:")
display_topics(lda_model, feature_names, no_top_words)

# Label each headline with the topic holding the largest value in its row of lda_output
df['Dominant_Topic'] = lda_output.argmax(axis=1)

print("\nDataFrame with Dominant Topic:")
print(df[['News', 'Dominant_Topic']].head())


LDA Topics:
Topic 0:
form government congress state election
Topic 1:
match virat century scored took

DataFrame with Dominant Topic:
                             News  Dominant_Topic
0   Virat scored century in match               1
1            BJP won in elections               0
2  Bumra took 5 wicket in a match               1
3  Congress form state government               0


In [6]:
# Show the topics and their top words again in this cell.
# Depends on display_topics, lda_model, feature_names and no_top_words
# already being defined by the previous cell in the running kernel.
print("LDA Topics:")
display_topics(lda_model, feature_names, no_top_words)

print("\nDataFrame with News and their Dominant Topic:")
print(df[['News', 'Dominant_Topic']].head())

LDA Topics:
Topic 0:
form government congress state election
Topic 1:
match virat century scored took

DataFrame with News and their Dominant Topic:
                             News  Dominant_Topic
0   Virat scored century in match               1
1            BJP won in elections               0
2  Bumra took 5 wicket in a match               1
3  Congress form state government               0


# LDA with Kaggle data

In [7]:
import pandas as pd

# Load the arXiv CSV, tolerating malformed rows.
# Rebinds df: from here on it refers to the arXiv data, not the sample headlines.
df = pd.read_csv("/content/arxiv_data.csv",
                 on_bad_lines='skip',  # Skip rows the parser cannot handle instead of raising
                 engine='python')       # Use pandas' Python parsing engine

# Bare last expression: the notebook renders the first rows as the cell output
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [8]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Re-read the arXiv CSV with the same arguments as the previous cell;
# rows that fail to parse are skipped rather than raising
df = pd.read_csv("/content/arxiv_data.csv",
                 on_bad_lines='skip',
                 engine='python')

# Quick sanity check of what was loaded: row count, column names, first rows
print(f"Loaded {len(df)} rows")
print("Columns in dataset:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

# Fetch the NLTK resources used by the text preprocessing below, without progress output
for _resource in ('stopwords', 'wordnet', 'punkt_tab', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

# Module-level helpers that preprocess_text reads as globals
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps: lowercase, replace every character outside a-z with a space,
    tokenize with NLTK, drop stopwords and one-character tokens, lemmatize
    what remains, and join the result with single spaces.

    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects.

    Args:
        text: The raw text. Any non-``str`` value yields ``''``.

    Returns:
        The cleaned text as one string; ``''`` when no token survives.
    """
    # Guard clause: non-string cells (e.g. NaN, None, numbers) become empty text
    if not isinstance(text, str):
        return ''

    # Lowercase first so the a-z character class also keeps former capitals
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())

    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        # The stopword test is applied to the raw token, before lemmatization
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(kept)

# Replace missing titles/summaries with empty strings so the concatenation below never yields NaN
df['titles'] = df['titles'].fillna('')
df['summaries'] = df['summaries'].fillna('')

# One text field per paper: title and summary joined by a single space
df['text_content'] = df['titles'] + ' ' + df['summaries']

# Clean every document with preprocess_text (row-by-row Python call, hence the warning message)
print("\nPreprocessing text... This may take a while for large datasets.")
df['Processed_Text'] = df['text_content'].apply(preprocess_text)

# Side-by-side preview of raw vs. cleaned text
print("\n" + "="*70)
print("Original Text Content and Processed Text:")
print("="*70)
print(df[['text_content', 'Processed_Text']].head())

# Summary numbers: row count, documents left empty after cleaning,
# and mean length (in characters) of the cleaned text
print("\n" + "="*70)
print("Dataset Statistics:")
print("="*70)
print(f"Total rows: {len(df)}")
print(f"Rows with empty processed text: {(df['Processed_Text'] == '').sum()}")
print(f"Average processed text length: {df['Processed_Text'].str.len().mean():.2f} characters")

Loaded 51774 rows
Columns in dataset: ['titles', 'summaries', 'terms']

First few rows:
                                              titles  \
0  Survey on Semantic Stereo Matching / Semantic ...   
1  FUTURE-AI: Guiding Principles and Consensus Re...   
2  Enforcing Mutual Consistency of Hard Regions f...   
3  Parameter Decoupling Strategy for Semi-supervi...   
4  Background-Foreground Segmentation for Interio...   

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                         terms  
0           ['cs.CV', 'cs.LG']  
1  ['cs.CV', 'cs.AI', 'cs.LG']  
2           ['cs.CV', 'cs.AI']  
3                    ['cs.CV']  
4           ['cs.CV', 'cs.LG']  

Preprocessing text... This may take a w

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings (no vocabulary pruning)
vectorizer = CountVectorizer()

# Learn the vocabulary and build the sparse document-term count matrix
bow_matrix = vectorizer.fit_transform(df['Processed_Text'])

# Vocabulary terms; used below as the column labels of the preview frame
feature_names = vectorizer.get_feature_names_out()

print("="*70)
print("Bag of Words (BoW) Analysis")
print("="*70)

print("\nVocabulary Size:", len(feature_names))
print("\nFirst 20 words in vocabulary:")
print(feature_names[:20])

print("\n" + "="*70)
print("BoW Matrix Shape:", bow_matrix.shape)
print(f"  - Number of documents: {bow_matrix.shape[0]}")
print(f"  - Number of unique words: {bow_matrix.shape[1]}")
print("="*70)

print("\nConverting BoW matrix to DataFrame (this may take a moment for large datasets)...")

# Densify ONLY the rows that are displayed. Calling .toarray() on the whole
# matrix allocates documents x vocabulary cells (on the order of
# 51774 x 51361 for this dataset, i.e. tens of gigabytes), which can exhaust
# the kernel's memory just to print five rows. bow_df is therefore a 5-row
# preview, not the full matrix; keep using the sparse bow_matrix for any
# real computation.
bow_df = pd.DataFrame(bow_matrix[:5].toarray(), columns=feature_names)

print("\nFirst 5 rows of the BoW matrix (showing first 10 columns):")
print(bow_df.iloc[:5, :10])

print("\nFull BoW matrix for first 5 documents (all features):")
print(bow_df.head())

# Corpus-level statistics, computed on the sparse matrix without densifying it
print("\n" + "="*70)
print("BoW Statistics:")
print("="*70)
print(f"Total word occurrences: {bow_matrix.sum()}")
print(f"Average words per document: {bow_matrix.sum(axis=1).mean():.2f}")
# Sparsity = share of cells that are zero (nnz counts the stored non-zero entries)
print(f"Sparsity: {(1 - bow_matrix.nnz / (bow_matrix.shape[0] * bow_matrix.shape[1])) * 100:.2f}%")

Bag of Words (BoW) Analysis

Vocabulary Size: 51361

First 20 words in vocabulary:
['aa' 'aaa' 'aaae' 'aaai' 'aabb' 'aabo' 'aac' 'aaca' 'aachen' 'aacp'
 'aacvp' 'aad' 'aada' 'aadcnn' 'aade' 'aadi' 'aadit' 'aads' 'aadsah' 'aae']

BoW Matrix Shape: (51774, 51361)
  - Number of documents: 51774
  - Number of unique words: 51361

Converting BoW matrix to DataFrame (this may take a moment for large datasets)...

First 5 rows of the BoW matrix (showing first 10 columns):
   aa  aaa  aaae  aaai  aabb  aabo  aac  aaca  aachen  aacp
0   0    0     0     0     0     0    0     0       0     0
1   0    0     0     0     0     0    0     0       0     0
2   0    0     0     0     0     0    0     0       0     0
3   0    0     0     0     0     0    0     0       0     0
4   0    0     0     0     0     0    0     0       0     0

Full BoW matrix for first 5 documents (all features):
   aa  aaa  aaae  aaai  aabb  aabo  aac  aaca  aachen  aacp  ...  zxhresearch  \
0   0    0     0     0     0     0

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with a pruned vocabulary to keep the matrix manageable
vectorizer = CountVectorizer(
    max_features=1000,      # Keep only the 1000 most frequent words
    min_df=5,               # Ignore words appearing in fewer than 5 documents
    max_df=0.8              # Ignore words appearing in more than 80% of documents
)

# Rebuild the BoW matrix on 'Processed_Text'; this replaces the earlier full-vocabulary bow_matrix
bow_matrix = vectorizer.fit_transform(df['Processed_Text'])

# Vocabulary terms that survived pruning; used below as column labels
feature_names = vectorizer.get_feature_names_out()

print("Vocabulary (Feature Names):")
print(feature_names)
print("\nShape of BoW matrix:", bow_matrix.shape)

# Dense copy labelled with the vocabulary, for display.
# NOTE(review): this densifies every document although only head() is printed;
# if nothing later needs the full frame, slicing bow_matrix first would save memory - confirm.
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)
print("\nFirst 5 rows of the BoW matrix:")
print(bow_df.head())

Vocabulary (Feature Names):
['ability' 'able' 'according' 'account' 'accuracy' 'accurate' 'accurately'
 'achieve' 'achieved' 'achieves' 'achieving' 'across' 'action'
 'activation' 'active' 'activity' 'actor' 'adapt' 'adaptation' 'adaptive'
 'addition' 'additional' 'additionally' 'address' 'advance' 'advantage'
 'adversarial' 'agent' 'aggregation' 'agnostic' 'ai' 'aim' 'al'
 'algorithm' 'alignment' 'allow' 'allowing' 'allows' 'along' 'also'
 'alternative' 'although' 'among' 'amount' 'analysis' 'analyze'
 'annotated' 'annotation' 'anomaly' 'another' 'appearance' 'application'
 'applied' 'apply' 'applying' 'approach' 'approximate' 'approximation'
 'arbitrary' 'architecture' 'area' 'art' 'artificial' 'aspect'
 'associated' 'assumption' 'attack' 'attempt' 'attention' 'attribute'
 'augmentation' 'augmented' 'auto' 'autoencoder' 'automated' 'automatic'
 'automatically' 'autonomous' 'auxiliary' 'available' 'average' 'aware'
 'backbone' 'background' 'base' 'based' 'baseline' 'batch' 'bayesian'
