In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from summa import keywords

import requests

In [2]:
# Notebook-wide settings read by the helper functions and analysis cells below.
n_features = 1000  # passed as max_features to both vectorizers (vocabulary size cap)
n_top_words = 2  # how many terms print_top_words shows for each topic
n_components = 5  # number of NMF/LDA topics; print_most_frequent also uses it as its list length

def read_text(url, timeout=30):
    """Download a plain-text document and return its non-blank lines.

    Parameters
    ----------
    url : str
        Address of the text file, fetched with an HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled server would block the notebook
        indefinitely.

    Returns
    -------
    list of str
        Each line of the response body with surrounding whitespace
        removed; lines that are empty after stripping are dropped.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status, so that an error
        page is never silently analysed as if it were the corpus.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Normalise CRLF to LF before splitting so that files using either
    # line-ending convention are broken into lines (splitting on '\r\n'
    # alone turns an LF-only file into one giant document).
    raw_lines = response.text.replace('\r\n', '\n').split('\n')

    stripped_lines = (line.strip() for line in raw_lines)
    return [line for line in stripped_lines if line]

def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    ``model`` must expose ``components_`` (one row of term weights per
    topic) and ``feature_names`` maps a column index to its term. Terms
    are printed from heaviest to lightest, and a single blank line
    follows the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        heaviest_columns = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[column] for column in heaviest_columns]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

def print_most_frequent(vec, bag_of_words, n_top=None):
    """Print the terms with the largest column totals of a document-term matrix.

    Parameters
    ----------
    vec : fitted vectorizer
        Only its ``vocabulary_`` mapping (term -> column index) is read.
    bag_of_words : matrix
        Document-term matrix. ``bag_of_words.sum(axis=0)`` must return a
        two-dimensional result that can be indexed as ``[0, column]``.
    n_top : int, optional
        Number of ``(term, total)`` pairs to print. When omitted, the
        notebook-level ``n_components`` is used, which is what this
        function always did before the parameter existed.

    The pairs are printed as a single list, highest total first, followed
    by a blank line. Nothing is returned.
    """
    if n_top is None:
        # Backward-compatible default: historically the list length was
        # tied to the topic-count setting.
        n_top = n_components

    column_totals = bag_of_words.sum(axis=0)
    words_freq = [(word, column_totals[0, idx]) for word, idx in vec.vocabulary_.items()]
    # Stable sort, so terms with equal totals keep their vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    print(words_freq[:n_top])
    print()

def get_tf_idf_data(txt, ngram_range=(1,1), print_frequencies=False):
    """Fit a TF-IDF vectorizer on ``txt`` and return it with the weighted matrix.

    Parameters
    ----------
    txt : iterable of str
        The documents to vectorize.
    ngram_range : tuple of int, optional
        Forwarded unchanged to ``TfidfVectorizer``.
    print_frequencies : bool, optional
        When true, the terms with the largest summed weights are printed
        through ``print_most_frequent``.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, document_term_matrix)``.
    """
    vectorizer_options = dict(
        max_df=0.95,
        min_df=2,
        ngram_range=ngram_range,
        max_features=n_features,  # notebook-level vocabulary cap
        stop_words='english',
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)
    weighted_matrix = vectorizer.fit_transform(txt)

    if print_frequencies:
        print_most_frequent(vectorizer, weighted_matrix)

    return vectorizer, weighted_matrix
    
def get_tf_data(txt, ngram_range=(1,1), print_frequencies=False):
    """Fit a count vectorizer on ``txt`` and return it with the count matrix.

    Parameters
    ----------
    txt : iterable of str
        The documents to vectorize.
    ngram_range : tuple of int, optional
        Forwarded unchanged to ``CountVectorizer``.
    print_frequencies : bool, optional
        When true, the terms with the largest total counts are printed
        through ``print_most_frequent``.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, document_term_matrix)``.
    """
    counter = CountVectorizer(
        max_df=0.95,
        min_df=2,
        ngram_range=ngram_range,
        max_features=n_features,  # notebook-level vocabulary cap
        stop_words='english',
    )
    counts = counter.fit_transform(txt)

    # Printing is optional; the fitted objects are returned either way.
    if print_frequencies:
        print_most_frequent(counter, counts)

    return counter, counts

In [3]:
# Download the book once; the resulting list of non-blank text segments is the
# corpus that every analysis cell below passes to the vectorizers.
txt = read_text("http://www.glozman.com/TextPages/Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt")

### Term frequency

In [4]:
%%time

# Most frequent bigrams by raw count. Only the printout is of interest, so the
# returned (vectorizer, matrix) pair is assigned to a throwaway name.
_ = get_tf_data(txt, ngram_range=(2, 2), print_frequencies=True)

[('said harry', 143), ('said ron', 111), ('uncle vernon', 108), ('professor mcgonagall', 95), ('said hagrid', 89)]

CPU times: user 192 ms, sys: 122 µs, total: 192 ms
Wall time: 442 ms


### TF-IDF

In [5]:
%%time

# Bigrams ranked by summed TF-IDF weight. Only the printout is of interest, so
# the returned (vectorizer, matrix) pair is assigned to a throwaway name.
_ = get_tf_idf_data(txt, ngram_range=(2, 2), print_frequencies=True)

[('said harry', 87.16511421562596), ('said ron', 59.78064567759409), ('uncle vernon', 57.043435884277244), ('said hagrid', 51.44077989128559), ('professor mcgonagall', 49.124548569059336)]

CPU times: user 191 ms, sys: 18 µs, total: 191 ms
Wall time: 487 ms


### Non-Negative Matrix Factorization and Frobenius norm

In [6]:
%%time

# Factorise the unigram TF-IDF matrix with NMF using its default loss.
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` argument and
# the vectorizer's `get_feature_names()` method -- confirm against the
# installed version before upgrading.
tf_idf_vectorizer, tfidf = get_tf_idf_data(txt)
# random_state pins the seed so Restart & Run All is repeatable, matching the
# LDA cell, which already fixes its seed.
nmf = NMF(n_components=n_components, alpha=.1, l1_ratio=.5, random_state=0).fit(tfidf)
print_top_words(nmf, tf_idf_vectorizer.get_feature_names(), n_top_words)

Topic #0: said dumbledore
Topic #1: harry potter
Topic #2: ron hermione
Topic #3: hagrid looked
Topic #4: don know

CPU times: user 174 ms, sys: 1.21 ms, total: 175 ms
Wall time: 405 ms


### Non-Negative Matrix Factorization and Kullback-Leibler divergence

In [7]:
%%time

# Same factorisation as the previous section, but minimising the
# Kullback-Leibler divergence with the multiplicative-update solver.
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` argument and
# the vectorizer's `get_feature_names()` method -- confirm against the
# installed version before upgrading.
tf_idf_vectorizer, tfidf = get_tf_idf_data(txt)
nmf = NMF(n_components=n_components,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
          alpha=.1,
          l1_ratio=.5,
          # Pin the seed so Restart & Run All is repeatable, matching the
          # LDA cell, which already fixes its seed.
          random_state=0).fit(tfidf)
print_top_words(nmf, tf_idf_vectorizer.get_feature_names(), n_top_words)

Topic #0: said harry
Topic #1: harry vernon
Topic #2: ron hermione
Topic #3: hagrid looked
Topic #4: know professor

CPU times: user 536 ms, sys: 379 µs, total: 536 ms
Wall time: 1.16 s


### Latent Dirichlet Allocation

In [8]:
%%time

# Fit LDA on the raw term counts produced by get_tf_data, then print the
# top terms of each topic.
tf_vectorizer, tf = get_tf_data(txt)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method='online',
    learning_offset=50.0,
    random_state=0,  # fixed seed keeps the topics repeatable between runs
).fit(tf)
print_top_words(lda, tf_vectorizer.get_feature_names(), n_top_words)

Topic #0: said know
Topic #1: harry uncle
Topic #2: harry looked
Topic #3: neville malfoy
Topic #4: ron hermione

CPU times: user 5.67 s, sys: 6.31 ms, total: 5.67 s
Wall time: 11.7 s


### TextRank

In [9]:
%%time

# The keyword extractor is handed one string, so glue the corpus segments back
# together with ". " between them.
# NOTE: the recorded run of this cell took over three minutes of wall time,
# far longer than any other cell in the notebook.
joined = '. '.join(txt)
# With scores=True the result is a list of (keyword, score) pairs, as the
# printed output shows.
kwd = keywords.keywords(joined, scores=True,)
# Show only the first five entries.
print(kwd[:5])

[('harry', 0.5998025983119938), ('said', 0.3081132992291062), ('hagrid', 0.2109997818116515), ('ron', 0.20964010372737452), ('look', 0.15713284677327347)]
CPU times: user 1min 39s, sys: 3.21 s, total: 1min 42s
Wall time: 3min 21s
