# Create Dummy Textual Dataset

In [1]:

# Example documents
documents = [
    "I love programming in Python",
    "Python and Java are popular programming languages",
    "I enjoy learning new programming languages",
    "Machine learning is fascinating",
    "Deep learning and neural networks are part of machine learning",
    "Natural language processing (NLP) is a branch of artificial intelligence",
    "NLP techniques include tokenization, stemming, and lemmatization",
    "Supervised learning algorithms include regression and classification",
    "Unsupervised learning includes clustering and association",
    "Reinforcement learning involves agents learning from their environment"
]
documents

['I love programming in Python',
 'Python and Java are popular programming languages',
 'I enjoy learning new programming languages',
 'Machine learning is fascinating',
 'Deep learning and neural networks are part of machine learning',
 'Natural language processing (NLP) is a branch of artificial intelligence',
 'NLP techniques include tokenization, stemming, and lemmatization',
 'Supervised learning algorithms include regression and classification',
 'Unsupervised learning includes clustering and association',
 'Reinforcement learning involves agents learning from their environment']

# Preprocessing (Cleaning Text)

In [2]:
# Preprocessing Steps
# Lowercasing: Convert all characters to lowercase.
# Removing Punctuation: Remove punctuation marks.
# Removing Stopwords: Remove common stopwords like "and", "the", etc.
# Tokenization: Split text into individual words.
# Stemming/Lemmatization: Reduce words to their root form (optional).

#===============================================================================
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    # Join tokens back to string
    return ' '.join(tokens)

# Preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]
preprocessed_documents

['love programming python',
 'python java popular programming languages',
 'enjoy learning new programming languages',
 'machine learning fascinating',
 'deep learning neural networks machine learning',
 'natural language processing nlp branch artificial intelligence',
 'nlp techniques include tokenization stemming lemmatization',
 'supervised learning algorithms include regression classification',
 'unsupervised learning includes clustering association',
 'reinforcement learning involves agents learning environment']

# Countvectorizer (Text to numeric)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Convert the documents to a term-document matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_documents)

In [4]:
# optional code 
# Convert the sparse matrix to a dense format
dense_matrix = X.todense()
dense_matrix

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
         1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 

# Apply LDE

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit the LDA model
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

0,1,2
,n_components,2
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


# Display topics

In [6]:
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-10 - 1:-1]])

Topic 0:
['learning', 'programming', 'languages', 'machine', 'python', 'neural', 'deep', 'networks', 'environment', 'involves']
Topic 1:
['nlp', 'include', 'processing', 'language', 'artificial', 'branch', 'intelligence', 'natural', 'lemmatization', 'tokenization']


# let's make it more representative

In [7]:
import pandas as pd

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[f"Topic {topic_idx+1}"] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

no_top_words = 10
topics = display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Convert the topics dictionary to a DataFrame for better visualization
topics_df = pd.DataFrame(topics)


topics_df

Unnamed: 0,Topic 1,Topic 2
0,learning,nlp
1,programming,include
2,languages,processing
3,machine,language
4,python,artificial
5,neural,branch
6,deep,intelligence
7,networks,natural
8,environment,lemmatization
9,involves,tokenization
