# Initial Setup

In [1]:
import sys
# Put the directory two levels above the notebook at the front of the module
# search path. Presumably this is what makes project-local packages (such as
# `utils`, imported further down) resolvable — confirm against the repo layout.
sys.path.insert(0, "../..")

In [2]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
import os

# Directory holding the preprocessed artifacts read below (dictionary.pkl,
# corpus.mm, timeseq.lst). The original absolute location stays the default;
# set the NIPS_PAPERS_PATH environment variable to run the notebook against a
# different directory without editing this cell. An empty value falls back to
# the default as well.
PATH = os.environ.get("NIPS_PAPERS_PATH") or "/diskA/jethro/nips-papers"

In [4]:
import logging

# Emit timestamp, severity and message for each record at INFO level and above;
# this is what produces the INFO lines shown in the cell outputs below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [5]:
from utils.timeit import timeit

In [6]:
import pandas as pd

# Loading the Dictionary and Corpus

In [7]:
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

2018-03-10 17:14:32,153 : INFO : 'pattern' package not found; tag filters are not available for English


In [8]:
# Restore the Dictionary object previously saved under PATH.
dictionary_file = f'{PATH}/dictionary.pkl'
dct = Dictionary.load(dictionary_file)

2018-03-10 17:14:33,041 : INFO : loading Dictionary object from /diskA/jethro/nips-papers/dictionary.pkl
2018-03-10 17:14:33,056 : INFO : loaded /diskA/jethro/nips-papers/dictionary.pkl


In [9]:
# Open the serialized corpus stored under PATH; the log output of this cell
# reports how many documents, features and non-zero entries it contains.
corpus_file = f'{PATH}/corpus.mm'
corpus = MmCorpus(corpus_file)

2018-03-10 17:14:33,432 : INFO : loaded corpus index from /diskA/jethro/nips-papers/corpus.mm.index
2018-03-10 17:14:33,432 : INFO : initializing corpus reader from /diskA/jethro/nips-papers/corpus.mm
2018-03-10 17:14:33,433 : INFO : accepted corpus with 7241 documents, 54254 features, 2350382 non-zero entries


In [10]:
import pickle

# Load the pickled time-slice sequence that is later passed to the model as
# `time_slices`.
# NOTE(review): pickle.load can execute arbitrary code — only read a
# timeseq.lst that comes from a trusted source.
timeseq_file = f'{PATH}/timeseq.lst'
with open(timeseq_file, 'rb') as handle:
    time_seq = pickle.load(handle)

In [13]:
from subprocess import call, check_call
import os.path

# Fetch the prebuilt DTM binary once and make it executable.
#
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# download stops here instead of surfacing later as a confusing error when the
# model is trained. The file is downloaded under a temporary name and renamed
# only after both steps succeed; an interrupted download therefore never leaves
# a partial "dtm-linux64" behind that the isfile() guard would accept next time.
if not os.path.isfile("dtm-linux64"):
    check_call(["wget", "-O", "dtm-linux64.part",
                "https://github.com/magsilva/dtm/raw/master/bin/dtm-linux64"])
    check_call(["chmod", "+x", "dtm-linux64.part"])
    os.replace("dtm-linux64.part", "dtm-linux64")


In [13]:
# Path to the DTM binary that is handed to DtmModel when the model is trained.
DTM_EXECUTABLE = "./dtm-linux64"

# Training the Model

In [14]:
from gensim.models.wrappers import DtmModel

In [15]:
# Number of topics to fit; also used when listing topics for display.
NUM_TOPICS = 30

In [18]:
@timeit
def train_model(corpus, dct):
    """Build a DtmModel for ``corpus`` using ``dct`` as the id-to-word mapping.

    The executable path, the time slices and the number of topics are taken
    from the notebook-level ``DTM_EXECUTABLE``, ``time_seq`` and ``NUM_TOPICS``.

    Args:
        corpus: the corpus passed to ``DtmModel``.
        dct: the dictionary passed as ``id2word``.

    Returns:
        The constructed ``DtmModel`` instance.
    """
    model = DtmModel(
        DTM_EXECUTABLE,
        corpus,
        time_slices=time_seq,
        num_topics=NUM_TOPICS,
        id2word=dct,
    )
    return model

In [None]:
# Fit the model on the loaded corpus. Per the log output of this cell, the
# wrapper serializes the corpus to a temporary file and runs the external
# ./dtm-linux64 binary with --ntopics=30.
dtm = train_model(corpus, dct)

2018-03-10 16:44:51,651 : INFO : serializing temporary corpus to /tmp/42663e_train-mult.dat
2018-03-10 16:44:51,652 : INFO : no word id mapping provided; initializing from corpus
2018-03-10 16:44:55,098 : INFO : storing corpus in Blei's LDA-C format into /tmp/42663e_train-mult.dat
2018-03-10 16:44:58,864 : INFO : saving vocabulary of 54254 words to /tmp/42663e_train-mult.dat.vocab
2018-03-10 16:44:58,909 : INFO : training DTM with args --ntopics=30 --model=dtm  --mode=fit --initialize_lda=true --corpus_prefix=/tmp/42663e_train --outname=/tmp/42663e_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6  --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 
2018-03-10 16:44:58,909 : INFO : Running command ['./dtm-linux64', '--ntopics=30', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/42663e_train', '--outname=/tmp/42663e_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--

# Visualizing the Results

In [None]:
# DtmModel.show_topics(formatted=False) yields one list of (probability, word)
# pairs per topic *per time slice* — not (topic_id, words) tuples as LdaModel
# does — so unpacking its items as `topic_id, words` fails. Query every topic
# explicitly at a single time slice instead.
TOPIC_TIME_SLICE = 0  # index of the time slice whose top words are shown
NUM_TOP_WORDS = 6     # how many words to list per topic

# The third argument is passed positionally: it is the "number of words"
# parameter, whose keyword name differs between gensim releases.
topics = [
    dtm.show_topic(topic_id, TOPIC_TIME_SLICE, NUM_TOP_WORDS)
    for topic_id in range(NUM_TOPICS)
]

# Map "topic_<id>" to its top words, in the order show_topic returned them.
d = {
    f'topic_{topic_id}': [word for _prob, word in words]
    for topic_id, words in enumerate(topics)
}

In [None]:
# Show the mapping as a table: one row per topic, one column per word position.
pd.DataFrame(d).T