In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd
import numpy as np

In [2]:
n_samples = 2000
n_features = 1000
n_components = None
n_top_words = 20

In [3]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [4]:
data = list(pd.read_csv("./data/dataset.csv", encoding="utf_8")["Body"])

In [5]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
#                                    max_features=n_features,
#                                    stop_words='english')
# tfidf = tfidf_vectorizer.fit_transform(data)

In [6]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data)

In [7]:
# nmf = NMF(n_components=n_components, random_state=1,
#           alpha=.1, l1_ratio=.5).fit(tfidf)

In [8]:
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# print_top_words(nmf, tfidf_feature_names, n_top_words)

In [9]:
# nmf = NMF(n_components=n_components, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(tfidf)

In [10]:
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# print_top_words(nmf, tfidf_feature_names, n_top_words)

In [11]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [12]:
lda.fit(tf)

TypeError: '<=' not supported between instances of 'NoneType' and 'int'

In [None]:
lda_var_ratios = lda.explained_variance_ratio_

In [None]:
# Create a function
def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    
    # Set initial number of features
    n_components = 0
    
    # For the explained variance of each feature:
    for explained_variance in var_ratio:
        
        # Add the explained variance to the total
        total_variance += explained_variance
        
        # Add one to the number of components
        n_components += 1
        
        # If we reach our goal level of explained variance
        if total_variance >= goal_var:
            # End the loop
            break
            
    # Return the number of components
    return n_components

In [None]:
# Run function
select_n_components(lda_var_ratios, 0.95)

In [58]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: coffee don flavor think tags questions topic want sites meta flavored hold little duplicate suggestions good needs just answers different
Topic #1: run reputation coffeemaker species site sa variation decide recommendations size proposals award improve removed beverage users including project excellent hoc_age
Topic #2: questions question site explanations ingredients stackexchange key possibly did matter merged happen app drug soon situation yes unless existing thinking
Topic #3: products uk just term japan drip spurred case future color main light communities edited refers kinds amortize coffee use links
Topic #4: topic question product apply bean expect opinion long answers recommendations answer specific bad se interface recommendation disclaimer storage flagged wait
Topic #5: differences growing zero line question coffee written worded does related criteria closed drug didn extent maintain suggested wikis list subject
Topic #6: site work given welcome time just rest sear

In [59]:
test = tf_vectorizer.transform(
    ["While answering a few of EdChum's questions I discovered that what I/we in the USA call pour over coffee is referred to as drip coffee in the UK. I added the pour-over tag to both questions I encountered but figured we should decide as a community which tag to use to describe this brewing process and then properly document it because drip-coffee means something different in the US (which is apparently referred to as filter-cofee in the UK). For clarification the method in question is shown in the image below. ",
    "Being newly created we have zero feeds appearing in our main chat right now. What blogs, news sites, or other important coffee related things should appear in our main chat room's feed? Post your suggestions/submissions.  "])

In [60]:
doc_topic_dist_unnormalized = np.matrix(lda.transform(test))

In [61]:
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [62]:
print(doc_topic_dist.argmax(axis=1))

[[74]
 [74]]


In [90]:
doc_topic_dist[1,74]

0.9478947368420876