This is the sample code taken from the scikit-learn documentation, ["Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation"](http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py). The only changes made here are replacing the data source, `from sklearn.datasets import fetch_20newsgroups`, with the TED talks data and removing all the timing functionality.

In [1]:
from time import time
import pandas
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Settings shared by the cells below: the max_features cap handed to both
# vectorizers, the number of topics each model extracts, and how many
# top-weighted words are printed per topic.
n_features = 5000
n_components = 40
n_top_words = 10

# Read the custom stopword list: whitespace-separated words, lower-cased.
# The context manager closes the file handle, and str.split() with no
# argument splits on runs of whitespace without producing empty entries when
# the file starts or ends with whitespace.
with open('../data/stopwords_all.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().lower().split()


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every topic of a model.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <word> <word> ...``, with words ordered from the largest
    weight to the smallest. A blank line is printed after the last topic.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's weights and must
        support ``argsort()``.
    feature_names : sequence
        Feature names, indexable by the positions returned by ``argsort()``.
    n_top_words : int
        Number of words to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        ranked = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

# Load the talks CSV and pull the text column out as a plain list of
# documents, timing the whole step.
print("Loading dataset...")
t0 = time()
# Column names are supplied explicitly via ``names=``; with no ``header``
# argument pandas then treats every row of the file as data.
colnames = [
    'author',
    'title',
    'date',
    'length',
    'text',
]
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)
# One list entry per row of the 'text' column.
talks = df['text'].tolist()
print("done in {:0.3f} seconds.".format(time() - t0))


# Both vectorizers take exactly the same keyword arguments, so they are
# written once here and unpacked into each constructor.
shared_vectorizer_options = {
    'max_df': 0.95,
    'min_df': 2,
    'max_features': n_features,
    'stop_words': stopwords,
}

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_options)
# Only fit_transform is timed; the vectorizer is constructed beforehand.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(talks)
print("done in {:0.3f} seconds.".format(time() - t0))


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(**shared_vectorizer_options)
t0 = time()
tf = tf_vectorizer.fit_transform(talks)
print("done in {:0.3f} seconds.".format(time() - t0))
print()

# Fit the NMF model on the tf-idf matrix and print each topic's top words.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples={} and n_features={}...".format(len(talks), n_features))
t0 = time()
# NOTE(review): the `alpha` keyword here and `get_feature_names()` below were
# deprecated in scikit-learn 1.0 -- confirm against the installed version
# before upgrading.
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)
print("done in {:0.3f} seconds.".format(time() - t0))


print("\nTopics in NMF model (Frobenius norm):")
# Names come from the tf-idf vectorizer that produced the matrix the model
# was fitted on.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


# Fit the LDA model on the raw term-count matrix and print each topic's top
# words. The progress message labels the sample count as "n_samples=", like
# the NMF progress messages in this notebook.
print("Fitting LDA models with tf features, "
      "n_samples={:d} and n_features={:d}...".format(len(talks), n_features))
lda = LatentDirichletAllocation(n_components=n_components,
                                max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Only the fit itself is timed; the estimator is constructed beforehand.
t0 = time()
lda.fit(tf)
print("done in {:0.3f} seconds.".format((time() - t0)))

print("\nTopics in LDA model:")
# NOTE(review): `get_feature_names()` was deprecated in scikit-learn 1.0 in
# favour of `get_feature_names_out()` -- confirm against the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.288 seconds.
Extracting tf-idf features for NMF...
done in 3.215 seconds.
Extracting tf features for LDA...
done in 3.156 seconds.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2069 and n_features=5000...
done in 4.889 seconds.

Topics in NMF model (Frobenius norm):
Topic #0: people just know think see time years life first back
Topic #1: bacteria molecule antibiotics molecules organisms bacterial viruses immune sensing tumors
Topic #2: percent countries money dollars growth economic economy people business billion
Topic #3: patients health patient care medical doctors hospital doctor surgery medicine
Topic #4: ice pole antarctica climate glacier snow south polar north expedition
Topic #5: universe galaxies galaxy space dark theory quantum physics higgs particles
Topic #6: women men woman gender pm violence female feminist equality stories
Topic #7: brain neurons brains cortex neuron activity arm neuroscience disorders memory
Topic

In [None]:
# Refit NMF on the same tf-idf matrix, this time with the generalized
# Kullback-Leibler loss and the multiplicative-update ('mu') solver.
# This rebinds the name `nmf`.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples={} and n_features={}...".format(len(talks), n_features))

t0 = time()
# NOTE(review): the `alpha` keyword here and `get_feature_names()` below were
# deprecated in scikit-learn 1.0 -- confirm against the installed version
# before upgrading.
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)