# Initial Setup

In [20]:
import sys

# Put the directory two levels up on the import path — presumably so that
# project-local packages (e.g. `utils`, imported further down) resolve.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate "../.." entries at the front of sys.path.
if "../.." not in sys.path:
    sys.path.insert(0, "../..")

In [21]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import os

# Directory holding the dictionary, corpus, time-slice list and the saved model.
# Defaults to the original location; set NIPS_PAPERS_PATH to run the notebook
# on a machine where the data lives elsewhere.
PATH = os.environ.get("NIPS_PAPERS_PATH", "/diskA/jethro/nips-papers")

In [23]:
import logging

# Show INFO-level records as "<timestamp> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [24]:
from utils.timeit import timeit

In [25]:
import pandas as pd

# Loading the Dictionary and Corpus

In [26]:
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

In [27]:
# Load the gensim Dictionary stored at {PATH}/dictionary.pkl; it is passed as
# `id2word` when the model is trained.
dct = Dictionary.load(f'{PATH}/dictionary.pkl')

2018-03-11 08:57:10,463 : INFO : loading Dictionary object from /diskA/jethro/nips-papers/dictionary.pkl
2018-03-11 08:57:10,477 : INFO : loaded /diskA/jethro/nips-papers/dictionary.pkl


In [28]:
# Open the Matrix Market corpus at {PATH}/corpus.mm (the log output of this cell
# reports 7241 documents and 54254 features).
corpus = MmCorpus(f'{PATH}/corpus.mm')

2018-03-11 08:57:11,474 : INFO : loaded corpus index from /diskA/jethro/nips-papers/corpus.mm.index
2018-03-11 08:57:11,474 : INFO : initializing corpus reader from /diskA/jethro/nips-papers/corpus.mm
2018-03-11 08:57:11,475 : INFO : accepted corpus with 7241 documents, 54254 features, 2350382 non-zero entries


In [29]:
import pickle

# Read the pickled time-slice list that is later passed to DtmModel as `time_slices`.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open(f'{PATH}/timeseq.lst', mode='rb') as timeseq_file:
    time_seq = pickle.load(timeseq_file)

In [11]:
from subprocess import call, check_call
import os.path

# Fetch the prebuilt DTM binary once; later runs reuse the local copy.
if not os.path.isfile("dtm-linux64"):
    # Download under a temporary name and use check_call (raises
    # CalledProcessError on a non-zero exit status) so that a failed or
    # interrupted wget cannot leave behind a truncated "dtm-linux64" that the
    # isfile() guard above would accept on the next run.
    check_call(["wget", "-O", "dtm-linux64.part",
                "https://github.com/magsilva/dtm/raw/master/bin/dtm-linux64"])
    check_call(["chmod", "+x", "dtm-linux64.part"])
    # Move the binary into place only after it is fully downloaded and executable.
    os.replace("dtm-linux64.part", "dtm-linux64")


In [12]:
# Path to the DTM binary; passed as the first argument to DtmModel in train_model.
DTM_EXECUTABLE = "./dtm-linux64"

# Training the Model

In [30]:
from gensim.models.wrappers import DtmModel
from gensim.models import LdaSeqModel

In [14]:
# Number of topics requested from DtmModel (shows up as --ntopics=30 in the
# logged DTM command line).
NUM_TOPICS = 30

In [15]:
@timeit
def train_model(corpus, dct, time_slices=None, num_topics=None):
    """Fit a dynamic topic model by running the external DTM binary via gensim.

    Args:
        corpus: Corpus handed to ``DtmModel``.
        dct: Word-id mapping, passed to ``DtmModel`` as ``id2word``.
        time_slices: Time-slice specification passed to ``DtmModel``.
            ``None`` (the default) falls back to the notebook-level ``time_seq``.
        num_topics: Number of topics to fit. ``None`` (the default) falls back
            to the notebook-level ``NUM_TOPICS``.

    Returns:
        The ``DtmModel`` instance (as passed through by the ``timeit``
        decorator — presumably unchanged, since the caller goes on to call
        ``.save`` on the result).
    """
    # Explicit arguments win; the globals are only a fallback so the existing
    # two-argument call keeps working.
    if time_slices is None:
        time_slices = time_seq
    if num_topics is None:
        num_topics = NUM_TOPICS
    return DtmModel(
        DTM_EXECUTABLE,
        corpus,
        id2word=dct,
        time_slices=time_slices,
        num_topics=num_topics,
    )

In [16]:
# Train the model. This is slow: in the recorded run the DTM command was logged
# at 22:00 and the following save cell logged at 01:54, i.e. close to four hours.
dtm = train_model(corpus, dct)

2018-03-10 22:00:09,485 : INFO : serializing temporary corpus to /tmp/70040e_train-mult.dat
2018-03-10 22:00:09,485 : INFO : no word id mapping provided; initializing from corpus
2018-03-10 22:00:12,935 : INFO : storing corpus in Blei's LDA-C format into /tmp/70040e_train-mult.dat
2018-03-10 22:00:16,735 : INFO : saving vocabulary of 54254 words to /tmp/70040e_train-mult.dat.vocab
2018-03-10 22:00:16,778 : INFO : training DTM with args --ntopics=30 --model=dtm  --mode=fit --initialize_lda=true --corpus_prefix=/tmp/70040e_train --outname=/tmp/70040e_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6  --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 
2018-03-10 22:00:16,778 : INFO : Running command ['./dtm-linux64', '--ntopics=30', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/70040e_train', '--outname=/tmp/70040e_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--

In [17]:
# Persist the trained model under PATH; the visualization section reloads it
# from this same file.
dtm.save(f'{PATH}/dtm.gensim')

2018-03-11 01:54:13,873 : INFO : saving DtmModel object under /diskA/jethro/nips-papers/dtm.gensim, separately None
2018-03-11 01:54:13,873 : INFO : storing np array 'lambda_' to /diskA/jethro/nips-papers/dtm.gensim.lambda_.npy
2018-03-11 01:54:14,707 : INFO : storing np array 'obs_' to /diskA/jethro/nips-papers/dtm.gensim.obs_.npy
2018-03-11 01:54:15,566 : INFO : saved /diskA/jethro/nips-papers/dtm.gensim


# Visualizing the Results

In [46]:
# Value passed as `times` to dtm.show_topics below.
NUM_TIMES = 3

In [32]:
# Load the model from the same path the training section saved it to, so the
# cells below can run without retraining.
dtm = DtmModel.load(f'{PATH}/dtm.gensim')

2018-03-11 08:57:39,415 : INFO : loading DtmModel object from /diskA/jethro/nips-papers/dtm.gensim
2018-03-11 08:57:39,553 : INFO : loading id2word recursively from /diskA/jethro/nips-papers/dtm.gensim.id2word.* with mmap=None
2018-03-11 08:57:39,553 : INFO : loading lambda_ from /diskA/jethro/nips-papers/dtm.gensim.lambda_.npy with mmap=None
2018-03-11 08:57:39,615 : INFO : loading obs_ from /diskA/jethro/nips-papers/dtm.gensim.obs_.npy with mmap=None
2018-03-11 08:57:39,677 : INFO : loaded /diskA/jethro/nips-papers/dtm.gensim


In [44]:
# Unformatted top-6 word lists for NUM_TIMES time slices; num_topics=-1
# presumably selects every topic (TODO confirm against the gensim version in use).
topics = dtm.show_topics(
    num_topics=-1,
    times=NUM_TIMES,
    num_words=6,
    formatted=False,
)



In [45]:
# Per gensim's DtmModel wrapper (confirm against the installed version),
# show_topics(formatted=False) returns a flat list with one entry per
# (time slice, topic) pair — time slices in the outer loop, topics in the inner
# one — and every entry is a list of (probability, word) tuples. The entries
# therefore cannot be unpacked as (topic_id, words); both indices are recovered
# from the flat position instead, and the word is the SECOND tuple element.
d = dict()
for position, topic_words in enumerate(topics):
    time_id, topic_id = divmod(position, dtm.num_topics)
    # Key by topic and time slice so the slices of one topic do not overwrite each other.
    d[f'topic_{topic_id}_time_{time_id}'] = [word for _prob, word in topic_words]

90

In [19]:
# pd.DataFrame(d).transpose()