# Inititial Setup

In [18]:
import sys
sys.path.insert(0, "../..")

In [19]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
PATH="/diskA/jethro/nips-papers"

In [21]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [22]:
import pandas as pd

In [23]:
from utils.timeit import timeit

# Loading Dictionary and Corpus

In [24]:
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

In [25]:
dct = Dictionary.load(f'{PATH}/dictionary.pkl')

2018-03-10 21:53:05,706 : INFO : loading Dictionary object from /diskA/jethro/nips-papers/dictionary.pkl
2018-03-10 21:53:05,719 : INFO : loaded /diskA/jethro/nips-papers/dictionary.pkl


In [26]:
corpus = MmCorpus(f'{PATH}/corpus.mm')

2018-03-10 21:53:05,768 : INFO : loaded corpus index from /diskA/jethro/nips-papers/corpus.mm.index
2018-03-10 21:53:05,769 : INFO : initializing corpus reader from /diskA/jethro/nips-papers/corpus.mm
2018-03-10 21:53:05,770 : INFO : accepted corpus with 7241 documents, 54254 features, 2350382 non-zero entries


# Training the Model

In [1]:
NUM_TOPICS=30

In [2]:
NUM_PASSES=200

In [28]:
CHUNKSIZE=1500

In [29]:
NUM_ITERATIONS=200

In [30]:
from gensim.models.ldamodel import LdaModel

In [31]:
@timeit
def train_model(corpus, dct):
    return LdaModel(corpus, id2word=dct, num_topics=NUM_TOPICS, chunksize=CHUNKSIZE, iterations=NUM_ITERATIONS, alpha='auto') 

In [32]:
lda = train_model(corpus, dct) 

2018-03-10 21:53:07,168 : INFO : using autotuned alpha, starting with [0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335]
2018-03-10 21:53:07,169 : INFO : using symmetric eta at 0.03333333333333333
2018-03-10 21:53:07,179 : INFO : using serial LDA version on this node
2018-03-10 21:53:21,665 : INFO : running online (single-pass) LDA training, 30 topics, 1 passes over the supplied corpus of 7241 documents, updating model once every 500 documents, evaluating perplexity every 5000 documents, iterating 200x with a convergence threshold of 0.001000
2018-03-10 21:53:21,846 : INFO : PROGRESS: pass 0, at document #500/7241
2018-03-10 21:53:24,904 : INFO : optim

In [33]:
lda.save(f'{PATH}/lda.gensim')

2018-03-10 21:54:07,260 : INFO : saving LdaState object under /diskA/jethro/nips-papers/lda.gensim.state, separately None
2018-03-10 21:54:07,293 : INFO : saved /diskA/jethro/nips-papers/lda.gensim.state
2018-03-10 21:54:07,316 : INFO : saving LdaModel object under /diskA/jethro/nips-papers/lda.gensim, separately ['expElogbeta', 'sstats']
2018-03-10 21:54:07,317 : INFO : storing np array 'expElogbeta' to /diskA/jethro/nips-papers/lda.gensim.expElogbeta.npy
2018-03-10 21:54:07,325 : INFO : not storing attribute dispatcher
2018-03-10 21:54:07,325 : INFO : not storing attribute state
2018-03-10 21:54:07,326 : INFO : not storing attribute id2word
2018-03-10 21:54:07,328 : INFO : saved /diskA/jethro/nips-papers/lda.gensim


# Visualizing the Results

In [35]:
topics = lda.show_topics(formatted=False, num_words=6, num_topics=NUM_TOPICS)
d = dict()
for topic_id, words in topics:
    d[f'topic_{topic_id}'] = [word for rank, (word, prob) in enumerate(words)]

In [36]:
pd.DataFrame(d).transpose()

Unnamed: 0,0,1,2,3,4,5
topic_0,policy,action,reward,agent,decision,regret
topic_1,layer,architecture,unit,activation,preprint,mnist
topic_10,bound,theorem,proof,log,lemma,complexity
topic_11,cluster,clustering,kmeans,gene,center,distance
topic_12,latent,gradient,tensor,sparsity,update,sparse
topic_13,decoder,code,price,option,correlation,spike
topic_14,event,particle,measurement,recovery,episode,signal
topic_15,face,detection,diffusion,frame,recognition,person
topic_16,classifier,optimization,query,search,score,accuracy
topic_17,component,noise,source,pca,causal,subspace
