In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.manifold import TSNE
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Fetch the 20 Newsgroups posts restricted to five categories.
# No `subset` or `remove` argument is passed, so the library defaults apply and
# the raw post text is used as-is.
categories = [
    "sci.space",
    "talk.politics.guns",
    "rec.motorcycles",
    "comp.graphics",
    "misc.forsale",
]
groups = fetch_20newsgroups(categories=categories)

# Integer class label per document, and the category name for each label.
labels, label_names = groups.target, groups.target_names



In [2]:
# Turn the raw posts into a TF-IDF document-term matrix. English stop words are
# removed and `max_features=500` limits the size of the vocabulary.
tfidf = TfidfVectorizer(stop_words="english", max_features=500)
data = tfidf.fit_transform(groups.data)

# Factorize the TF-IDF matrix into `t` topics with NMF (fixed seed so that
# re-running the cell gives the same topics).
t = 5
nmf = NMF(n_components=t, random_state=42)
nmf.fit(data)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement `get_feature_names_out` and fall back to the old
# method on versions that predate it. `list(...)` keeps `terms` a plain list
# in both cases.
if hasattr(tfidf, "get_feature_names_out"):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()

# Print the 10 highest-weighted terms of every topic. `argsort` sorts in
# ascending order, so each line runs from the weakest of the ten to the
# strongest (the top term is printed last).
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic {}:".format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0:
lines organization hp ca sun dod article writes bike com
Topic 1:
cs organization lines subject host nntp posting university sale edu
Topic 2:
henry shuttle jpl orbit access alaska moon gov nasa space
Topic 3:
like weapons firearms just think edu don guns people gun
Topic 4:
thanks version windows file image files ca ac graphics uk


In [3]:
# Turn the raw posts into a term-count document-term matrix. English stop words
# are removed and `max_features=500` limits the size of the vocabulary.
# NOTE: this rebinds `data` (and `terms` below), replacing whatever those names
# held before this cell ran.
count = CountVectorizer(stop_words="english", max_features=500)
data = count.fit_transform(groups.data)

# Fit an LDA model with `t` topics using the batch learning method (fixed seed
# so that re-running the cell gives the same topics).
t = 5
lda = LatentDirichletAllocation(n_components=t, learning_method='batch', random_state=42)
lda.fit(data)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement `get_feature_names_out` and fall back to the old
# method on versions that predate it. `list(...)` keeps `terms` a plain list
# in both cases.
if hasattr(count, "get_feature_names_out"):
    terms = list(count.get_feature_names_out())
else:
    terms = count.get_feature_names()

# Print the 10 highest-weighted terms of every topic. `argsort` sorts in
# ascending order, so each line runs from the weakest of the ten to the
# strongest (the top term is printed last).
for topic_idx, topic in enumerate(lda.components_):
    print("Topic {}:".format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0:
version use university software image organization graphics subject lines edu
Topic 1:
20 new 15 50 data file nasa 10 00 space
Topic 2:
firearms weapons law think don guns com edu people gun
Topic 3:
host nntp posting article writes organization lines subject com edu
Topic 4:
just article writes lines gov subject organization nasa space edu
