In [3]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [4]:
n_samples = 2000
n_features = 10000
n_topics = 10
n_top_words = 20


In [5]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [9]:
print("Loading dataset...")
#t0 = time()
from xml.etree import ElementTree
tree = ElementTree.parse(".\\Simplex\\train.xml")
page = tree.findall(".//page")
dataset=[]
print (tree)
for page_tag in tree.findall('page'):
    dataset.append(ElementTree.tostring(page_tag, encoding='utf8', method='xml'))


print (len(dataset))
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
#t0 = time()
tf = tf_vectorizer.fit_transform(dataset)
#print("done in %0.3fs." % (time() - t0))

Loading dataset...
<xml.etree.ElementTree.ElementTree object at 0x0000022BBC0AE7B8>
0
Extracting tf features for LDA...


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print (tf_feature_names)
print_top_words(lda, tf_feature_names, n_top_words)