In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def display_topic(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``components_`` array whose rows are
        iterated as topics; each row must support ``argsort()`` (e.g. a
        NumPy array of per-feature weights).
    feature_names : sequence
        Maps a feature column index to its word; indexed with the positions
        returned by ``argsort()``.
    no_top_words : int
        Number of words to print per topic.

    Returns
    -------
    None
        Output is written to stdout: a ``Topic <idx>:`` header line followed
        by one line of space-separated words, heaviest first.
    """
    # scikit-learn stores learned attributes with a trailing underscore, so
    # the topic-word matrix is `components_`, not `components`.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort() is ascending; the reversed slice keeps the last
        # `no_top_words` indices, i.e. the largest weights in descending order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))

In [2]:
# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# stripped; shuffling uses a fixed seed so the document order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Raw text of each post; this is the corpus used by the cells below.
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [12]:
# Sanity check: number of documents in the loaded corpus.
print(len(documents))

11314


In [9]:
# Build a bag-of-words term-count matrix for LDA.
# max_df=0.95 drops terms present in more than 95% of documents, min_df=2 drops
# terms present in fewer than 2 documents, and max_features=1000 caps the
# vocabulary at the 1000 most frequent remaining terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; list()
# keeps the result a plain list, as the old method returned.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Fit a 20-topic LDA model with the online learning method, 5 passes over the
# data, and a fixed seed for reproducibility.
lda = LatentDirichletAllocation(n_components=20, max_iter=5, learning_method='online', learning_offset=50, random_state=0)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=20, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [11]:
# Show the sparse document-term count matrix; each printed entry is
# (document row, vocabulary column) followed by the term count.
print(tf)

  (0, 703)	1
  (0, 422)	1
  (0, 502)	1
  (0, 554)	1
  (0, 146)	1
  (0, 573)	1
  (0, 424)	1
  (0, 748)	1
  (0, 762)	1
  (0, 229)	1
  (0, 760)	1
  (0, 745)	1
  (0, 897)	1
  (0, 914)	1
  (0, 535)	1
  (0, 444)	1
  (0, 985)	1
  (0, 497)	2
  (0, 714)	1
  (0, 589)	4
  (0, 854)	1
  (0, 303)	1
  (0, 860)	1
  (0, 872)	1
  (1, 321)	1
  :	:
  (11313, 758)	1
  (11313, 747)	1
  (11313, 840)	1
  (11313, 637)	1
  (11313, 210)	1
  (11313, 560)	1
  (11313, 646)	2
  (11313, 529)	1
  (11313, 421)	3
  (11313, 550)	2
  (11313, 773)	1
  (11313, 995)	1
  (11313, 434)	1
  (11313, 931)	1
  (11313, 191)	1
  (11313, 207)	1
  (11313, 954)	1
  (11313, 516)	1
  (11313, 540)	1
  (11313, 321)	1
  (11313, 508)	2
  (11313, 88)	1
  (11313, 359)	1
  (11313, 422)	1
  (11313, 303)	1
