In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the complete 20 Newsgroups corpus (subset='all'), asking the loader to
# strip headers, footers and quoted replies from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# Turn the raw posts into a bag-of-words term-count matrix: English stop words
# are dropped and the vocabulary is capped at 1000 features.
# NOTE(review): the recorded topics include numeric / junk tokens
# (e.g. "g9v", "b8f", "00"); presumably the default token pattern lets them
# through -- consider a stricter token_pattern or min_df/max_df. TODO confirm.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
)
X = vectorizer.fit_transform(newsgroups.data)

In [4]:
# Fit a Latent Dirichlet Allocation topic model on the term-count matrix X.
num_topics = 10  # number of topics to extract; tune as needed
lda = LatentDirichletAllocation(
    random_state=42,  # fixed seed passed to the estimator
    n_components=num_topics,
)
lda.fit(X)

In [5]:
# Report the highest-weighted vocabulary terms of every fitted topic.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so reverse it and keep the first n_top_words
    # indices (the largest weights in this topic's row of components_).
    top_words_idx = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")

# Assign topics to documents: transform() yields one row of topic scores per
# row of X.
topic_assignments = lda.transform(X)

# Print the dominant topic of the first few example documents. The count is
# clamped to the number of rows so a corpus with fewer documents than
# n_example_docs does not raise IndexError.
n_example_docs = 5
n_shown = min(n_example_docs, topic_assignments.shape[0])
# One vectorised argmax over the selected rows instead of one call per row.
dominant_topics = topic_assignments[:n_shown].argmax(axis=1)
for i, top_topic in enumerate(dominant_topics):
    print(f"Document #{i + 1} is assigned to Topic #{top_topic + 1}")

Topic #1: edu, space, com, mail, available, information, ftp, use, data, list
Topic #2: israel, health, research, medical, israeli, 1993, study, national, years, april
Topic #3: god, people, jesus, does, believe, say, know, bible, christian, church
Topic #4: just, like, don, know, good, time, think, ve, got, really
Topic #5: armenian, people, armenians, turkish, jews, said, war, russian, history, turkey
Topic #6: ax, max, g9v, b8f, a86, pl, 145, 1d9, db, 34u
Topic #7: key, chip, encryption, clipper, keys, use, security, phone, government, privacy
Topic #8: people, don, think, right, just, like, know, make, government, mr
Topic #9: windows, use, dos, drive, file, does, card, thanks, software, pc
Topic #10: 00, 10, 25, 20, 15, 12, 11, 16, 17, 50
Document #1 is assigned to Topic #4
Document #2 is assigned to Topic #9
Document #3 is assigned to Topic #5
Document #4 is assigned to Topic #9
Document #5 is assigned to Topic #9
