In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd

In [10]:
# Tunable settings for the topic-modelling cells below.
# n_samples is not referenced by any of the cells visible in this notebook.
n_samples = 2000
# Passed as max_features to both the TF-IDF and the count vectorizer.
n_features = 1000
# Passed as n_components to the NMF and LDA models.
n_components = 10
# Passed to print_top_words as the number of terms listed per topic.
n_top_words = 20

In [11]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must provide ``argsort()``.
    feature_names : sequence
        Indexed with the positions returned by ``argsort()``; the selected
        entries are joined with spaces.
    n_top_words : int
        Number of terms listed per topic.

    One ``Topic #<k>: ...`` line is written per row of ``components_``,
    followed by a single blank line after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        terms = [feature_names[position] for position in strongest]
        print("Topic #%d: " % topic_number + " ".join(terms))
    print()

In [12]:
# Load the "Body" text column; this list is what both vectorizers are fitted on.
# Empty cells are parsed by pandas as NaN, which scikit-learn's text
# vectorizers reject, so drop them and make sure every document is a str.
data = (
    pd.read_csv("./data/dataset.csv", encoding="utf_8")["Body"]
    .dropna()
    .astype(str)
    .tolist()
)

In [13]:
# TF-IDF representation of the corpus; this matrix is the input to the NMF models.
# Vocabulary is capped at n_features terms, English stop words are removed,
# and terms are filtered by document frequency (min_df / max_df).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(data)

In [14]:
# Raw term-count representation of the corpus; this matrix is the input to LDA.
# Same vocabulary settings as the TF-IDF vectorizer.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(data)

In [15]:
# NMF on the TF-IDF matrix with the default (Frobenius) loss.
# The single `alpha` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2. Its replacements, alpha_W / alpha_H, are multiplied internally by
# n_features / n_samples, so divide by those to keep the original unscaled
# penalty of 0.1.
try:
    nmf = NMF(n_components=n_components, random_state=1,
              alpha_W=.1 / tfidf.shape[1], alpha_H=.1 / tfidf.shape[0],
              l1_ratio=.5)
except TypeError:
    # scikit-learn < 1.0: only the combined `alpha` keyword exists.
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5)
nmf = nmf.fit(tfidf)

In [16]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
# (The replacement returns an array instead of a list; print_top_words only
# indexes into it, so either works.)
try:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0: question answer questions answers topic think opinion ask caffeine like based don coffee good types better community know asked research
Topic #1: coffee iced grinding brewing people beans like idea variation love bean advice yes questions consuming glass shop drink seasoned talking
Topic #2: tag tags drip use wiki posts equipment moka pour related process uk word having brew synonym meta filter machines method
Topic #3: site questions beta stack users sites new exchange need community people just meta ve post like good edit time make
Topic #4: product recommendations recommendation spam topic specific certain bad products based post hard subjective want personal problem just asker spammers espresso
Topic #5: knowledge se coffee yes useful serving official 2015 suggestions say related challenges traffic getting cw help fact famous fairly fair
Topic #6: hats winter bash earn public viewed click soon link forward opt year network user improve available secret virtual certain pa

In [17]:
# NMF on the TF-IDF matrix with the Kullback-Leibler loss and the
# multiplicative-update solver; this rebinds `nmf` from the previous model.
# The single `alpha` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2. Its replacements, alpha_W / alpha_H, are multiplied internally by
# n_features / n_samples, so divide by those to keep the original unscaled
# penalty of 0.1.
try:
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha_W=.1 / tfidf.shape[1], alpha_H=.1 / tfidf.shape[0],
              l1_ratio=.5)
except TypeError:
    # scikit-learn < 1.0: only the combined `alpha` keyword exists.
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha=.1, l1_ratio=.5)
nmf = nmf.fit(tfidf)

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
# (The replacement returns an array instead of a list; print_top_words only
# indexes into it, so either works.)
try:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0: questions question coffee answers answer topic community think site good caffeine ask like asked does related sources don people health
Topic #1: coffee knowledge grinding iced se beans shop yes bean love need grind original flavored related ground drink grinder talking coffees
Topic #2: tag tags drip use different wiki flavor having equipment machine pour synonym specific machines method uk filter descaler luwak question
Topic #3: site meta ve users time new edit post sites people se getting participation help beta community looking traffic need ads
Topic #4: product questions recommendations opinion based recommendation site don topic thing subjective just asking flavor types open specific want spam post
Topic #5: answer bounty just yes high better se type work look information rest feel long share machine challenges cw fine job
Topic #6: hats stack exchange like posts sorry public beta question user coffee flags users bash winter tried click tasks issue year
Topic #7: brew

In [19]:
# LDA configured for online learning; max_iter=5 caps the number of training
# iterations and random_state pins the result. The model is fitted in the next cell.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)

In [20]:
# Fit LDA on the raw term-count matrix `tf` (CountVectorizer output), not on TF-IDF.
lda.fit(tf)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, random_state=0)

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
# (The replacement returns an array instead of a list; print_top_words only
# indexes into it, so either works.)
try:
    tf_feature_names = tf_vectorizer.get_feature_names_out()
except AttributeError:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: coffee question questions site answer think like answers topic se don just community people tag new good make sites know
Topic #1: chat room community members area 51 moderators created proposal percolator site don stackexchange beverages rooms main https com barista learning
Topic #2: grounds questions grinds correct spent refer left terminology make chat sure seen question london brewing site person discussing vote isn
Topic #3: question grind text roast community sources generally future spurred products case somewhat color light search potentially communities let grounds users
Topic #4: grind coffee water grounds percolator moka grinding like beans espresso separate question brewed storage disclaimer stovetop awesome think beginning grinds
Topic #5: question answer coffee questions know think don bounty example votes like topic caffeine half related closed extent zero didn wikis
Topic #6: answer question hats user site posts users stack exchange questions queue like tasks