In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
import numpy as np

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

# Read Documents

Simple Wiki's abstracts

In [5]:
all_docs = !xmllint --shell /Users/jencirlee/Workbench/data/simplewiki-latest-abstract.xml.gz <<< "cat //doc/abstract"

In [6]:
all_docs = all_docs[1::2]
len(all_docs)

130870

# Topic Modelling

In [7]:
n_features = 20000
n_components = 10

In [9]:
def print_top_words_factors(factors, feature_names, n_top_words):
    ''' Print top words of each topic '''
    for topic_idx, topic in enumerate(factors):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

def print_top_words(model, feature_names, n_top_words):
    ''' Print top words of each topic '''
    print_top_words_factors(model.components_, feature_names, n_top_words)

## NMF

In [46]:
tfidf_vectorizer = TfidfVectorizer(max_df=.95, min_df=2,
                                   max_features=n_features,
                                   lowercase=False,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(all_docs)

In [47]:
docs = tfidf_vectorizer.transform([s for s in all_docs
                                   if 'China' in s])
docs.shape

(616, 20000)

In [48]:
nmf = NMF(n_components=20, init='random', alpha=.1, l1_ratio=1e-6)
nmf.fit(docs)

NMF(alpha=0.1, beta_loss='frobenius', init='random', l1_ratio=1e-06,
  max_iter=200, n_components=20, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [49]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), n_top_words=20)

Topic #0: China birth_place Guangzhou national Hebei Guangdong team Tianjin The Hunan Jilin death_place states Hainan Wuhan list noted thumb football It
Topic #1: Chinese China Traditional Pinyin Simplified traditional born Year ethnic Mandarin He national The Li chess Hakka people vase association Northeast
Topic #2: Taiwan China Republic Taipei Taoyuan mainland Typhoon City Philippines typhoon including Japan islands damage cities people Kaohsiung caused countries parade
Topic #3: Henan prefecture level city China province spelled lived 2010 northern In eastern central people western southern It Its 707 Eagle
Topic #4: Hong Kong mosque Mosque China Islamic Kowloon Mainland Wan tall floors It skyscraper meters Centre feet Macau center parts Territories
Topic #5: Xi Shaanxi pinyin Qū Xiàn China District county district County It Municipality center meaning southeast middle south Province east eastern
Topic #6: Liaoning birth_place northeastern port Peninsula important Pinyin located La

## Latent Dirichlet Allocation

Variational Inference for LDA

In [10]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=n_features,
                                lowercase=False,
                                stop_words='english')
tf_vectorizer.fit(all_docs)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.95, max_features=20000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [55]:
docs = tf_vectorizer.transform([s for s in all_docs
                                if 'China' in s])
docs.shape

(616, 20000)

In [70]:
lda = LatentDirichletAllocation(n_components=20,
                                doc_topic_prior=.25,
                                topic_word_prior=.5,
                                max_iter=20,
                                learning_method='online',
                                learning_offset=50.)
lda.fit(docs)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.25,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_components=20, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=0.5,
             total_samples=1000000.0, verbose=0)

In [71]:
print_top_words(lda, tf_vectorizer.get_feature_names(), n_top_words=20)

Topic #0: China The It Chinese Republic province People city known Taiwan birth_place He people level born Beijing Province genus county Anhui
Topic #1: pear Tian China type It countries Japan Gate named Forbidden death food northern plaza Ya Chinese Beijing Test nearest north
Topic #2: BC Soviet China Union 195 Han Cold He Republic Dynasty time Sino War ruled Pay 256 connects Slovenian founder relations
Topic #3: China Japanese He The United States Emperor Chinese War It Sea years Manga Dynasty Han Japan Ming ancient North Liu
Topic #4: China The stars star workers big June wall soldiers Square small background protests citizens Flag countries communist powdered ancient 183
Topic #5: China Silk trade Mediterranean east Asia way belief thinking deaths genus The old tree species This Middle Sea weaving southeast
Topic #6: China Taiwan The BC Asian Japan Russian countries Chinese birth_place Trans Siberian Republic Moscow East It dynasty Vietnam covers European
Topic #7: China The dynast

## Spectral LDA

In [12]:
import sys
sys.path.insert(0, '../SpectralLDA')

from spectral_lda import spectral_lda

Using mxnet backend.
  from ._conv import register_converters as _register_converters
Using numpy backend.


In [75]:
alpha, beta = spectral_lda(docs, 5, 20, eps=.80)

# docs: 616	# valid: 600


RuntimeError: Invalid results from CPDecomp.

In [73]:
alpha, alpha.sum()

(array([0.05925362, 0.18486831, 0.77896131, 0.14335609, 1.14590444,
        0.08582096, 0.49797297, 0.37264371, 0.37460232, 0.1789797 ,
        0.17568921, 0.02837142, 0.14877573, 0.12546624, 0.09682436,
        0.05977945, 0.17551349, 0.00987483, 0.00896048, 0.0025466 ]),
 4.654165257546078)

In [61]:
print(beta.T[0].max())

0.15638433987404624


In [62]:
print_top_words_factors(beta.T, tf_vectorizer.get_feature_names(), 20)

Topic #0: China province Republic The It capital People city Province Zhejiang County Henan east southeast Shandong Jiangxi called Municipality coast largest
Topic #1: China birth_place Shanghai Beijing death_place Liaoning District Origin Republic Anhui Guangzhou Hunan Tianjin Settlement Hainan Jilin Guangdong Jiangxi International train
Topic #2: Republic Shanghai Chinese Anhui It thumb Liaoning known Japan residence species list Jilin Xi Sea Korea born southeast Li Jiangsu
Topic #3: city county national district University team football born birth_place Dynasty Shanghai Chinese dynasty National situated Nanjing Kong He Hong tall
Topic #4: China Beijing death_place University university Origin Normal normal Hebei prestigious It universities Nanjing 1898 museum prefecture created Summer symbol oldest
Topic #5: Beijing It Chinese birth_place India Wuhan He Hebei Nanjing birthplace National Japan city mountain Dynasty Communist province dynasty Party BC
Topic #6: China dynasty The This 