In [4]:
%matplotlib inline


# Topic extraction with Tensor LDA


This example is modified from scikit-learn's "Topic extraction with
Non-negative Matrix Factorization and Latent Dirichlet Allocation"
example.

This example applies `tensor_lda.tensor_lda.TensorLDA` to the
20 newsgroups dataset. The output is a list of topics, each
represented as a list of terms (weights are not shown).





In [5]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups

from tensor_lda.tensor_lda import TensorLDA

# Tunable parameters for this example.
n_samples = 10000  # upper bound on the number of posts kept (slice of dataset.data)
n_features = 1000  # vocabulary cap, passed to CountVectorizer(max_features=...)
n_components = 40  # passed to TensorLDA(n_components=...)
n_top_words = 10  # number of terms printed for each topic


def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic with its prior and highest-weighted terms.

    Parameters
    ----------
    model : object
        Must expose ``components_`` (iterated row by row; each row must
        support ``argsort()``) and ``alpha_`` (indexable by topic index).
    feature_names : sequence of str
        Term for each column index of ``model.components_``.
    n_top_words : int
        How many terms to list for each topic.

    Each line has the form ``Topic #<idx> (prior: <alpha>): <terms>`` with
    the terms ordered from largest to smallest weight. A blank line is
    printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        prior = model.alpha_[topic_idx]
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        header = "Topic #%d (prior: %.3f): " % (topic_idx, prior)
        print(header + " ".join(feature_names[i] for i in ranked))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# fewer than 5 documents or in more than 80% of the documents are removed
# (see min_df / max_df on the vectorizer below).

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=2,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.8, min_df=5,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

print("Fitting TensorLDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

lda = TensorLDA(n_components=n_components, alpha0=.1)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# Transform the first two documents and print the first row of the result
# followed by the raw text of document 0.
doc_topics = lda.transform(tf[0:2, :])
print(doc_topics[0, :])
print(data_samples[0])

Loading dataset...
done in 1.131s.
Extracting tf features for LDA...
done in 1.175s.

Fitting TensorLDA models with tf features, n_samples=10000 and n_features=1000...
done in 42.296s.

Topics in LDA model:
Topic #0 (prior: 0.000): men women 1993 oh april say said child life thing
Topic #1 (prior: 0.000): ax max b8f g9v pl bhj giz wm bxn sl
Topic #2 (prior: 0.000): 00 new sale 01 thank 20 50 30 shipping dos
Topic #3 (prior: 0.000): window use way windows place card program problem display does
Topic #4 (prior: 0.000): groups new group need israel doesn program windows jewish mail
Topic #5 (prior: 0.000): com try list article said dave think david source sun
Topic #6 (prior: 0.000): edu article soon israel university probably com news question uk
Topic #7 (prior: 0.001): address does send mail email ve thanks post looking uk
Topic #8 (prior: 0.001): car need like really cars looks worth makes guess end
Topic #9 (prior: 0.001): key keys chip use public number know bit used using
Topic #1

In [6]:
# Show the repr of the matrix returned by tf_vectorizer.fit_transform(data_samples).
tf

<10000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 254723 stored elements in Compressed Sparse Row format>

In [17]:
# Length of the first entry in the loaded dataset (presumably characters of
# the raw post text; the same entry is printed as data_samples[0] above).
len(dataset['data'][0])

426