In [2]:
# Updated sample text data with "technology" and "sports" themes
documents = ["Smartphones are an essential piece of technology in today's digital age.", "Professional athletes train rigorously to excel in their respective sports.", "Electric vehicles represent a significant advancement in automotive technology.", "Watching sports events live offers an exhilarating experience for fans.", "Virtual reality technology immerses users in simulated environments for gaming and entertainment.", "Sports equipment manufacturers continually innovate to improve performance and safety.", "Artificial intelligence is being integrated into various aspects of modern technology.", "Participating in sports promotes physical fitness and overall wellbeing."]

Topic modelling using lda


In [3]:
from sklearn.feature_extraction.text import CountVectorizer #to convert text to numerical
from sklearn.decomposition import LatentDirichletAllocation 
#Our goal with this code is to classify the text in the documents into different topics. We can accomplish that using LatentDirichletAllocation.

In [4]:
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(documents)

In [68]:
# Apply LDA
from pyparsing import alphanums


lda = LatentDirichletAllocation(learning_decay=0.65, learning_offset= 10, n_components= 2) #Assuming there are 2 topics
lda.fit(X)

# Assign each document to the topic with the highest probability
topic_assignments = lda.transform(X).argmax(axis=1)

# Group documents by their assigned topics
topic_documents = {'Sports': [], 'Technology': []}
for i, topic_idx in enumerate(topic_assignments):
 topic = 'Technology' if topic_idx == 0 else 'Sports'
 topic_documents[topic].append(documents[i])

In [69]:
# Print documents grouped by topics
for topic, docs in topic_documents.items():
  print(f"{topic}:")
  for doc in docs:
    print("-", doc)
  print()

Sports:
- Professional athletes train rigorously to excel in their respective sports.
- Electric vehicles represent a significant advancement in automotive technology.
- Watching sports events live offers an exhilarating experience for fans.
- Virtual reality technology immerses users in simulated environments for gaming and entertainment.
- Artificial intelligence is being integrated into various aspects of modern technology.

Technology:
- Smartphones are an essential piece of technology in today's digital age.
- Sports equipment manufacturers continually innovate to improve performance and safety.
- Participating in sports promotes physical fitness and overall wellbeing.



In [48]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
search_params = {
    'n_components': [5, 10, 15],
    'learning_decay': [0.5, 0.7, 0.9],  # equivalent to beta
    'learning_offset': [10, 50, 100]    # equivalent to alpha
}
model = LatentDirichletAllocation(random_state=42)
gridsearch = GridSearchCV(model, param_grid=search_params, cv=3)

# Fit GridSearchCV
gridsearch.fit(X)

# Display the best parameters
print("Best Params:", gridsearch.best_params_)

# Train the best LDA model
best_lda = gridsearch.best_estimator_

# Display the topics from the best model
for i, topic in enumerate(best_lda.components_):
    print(f"Top 10 words for topic #{i}:")
    print([vectorizer.get_feature_names_out()[index] for index in topic.argsort()[-10:]])
    print("\n")

Best Params: {'learning_decay': 0.5, 'learning_offset': 10, 'n_components': 5}
Top 10 words for topic #0:
['intelligence', 'integrated', 'aspects', 'significant', 'represent', 'automotive', 'electric', 'vehicles', 'advancement', 'technology']


Top 10 words for topic #1:
['sports', 'virtual', 'users', 'gaming', 'simulated', 'entertainment', 'environments', 'immerses', 'reality', 'technology']


Top 10 words for topic #2:
['experience', 'exhilarating', 'performance', 'continually', 'equipment', 'manufacturers', 'innovate', 'safety', 'improve', 'sports']


Top 10 words for topic #3:
['essential', 'age', 'rigorously', 'professional', 'train', 'excel', 'respective', 'athletes', 'technology', 'sports']


Top 10 words for topic #4:
['essential', 'age', 'rigorously', 'professional', 'train', 'excel', 'respective', 'athletes', 'technology', 'sports']


