# Initial Setup

In [2]:
import sys
# Put the directory two levels above the working directory at the front of the
# module search path. Presumably this is what makes the project-local `utils`
# package (imported in later cells) resolvable — verify against the repo layout.
sys.path.insert(0, "../..")

In [3]:
# Load the autoreload extension and set it to mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
import os

# Directory holding the prepared artefacts this notebook reads and writes
# (dictionary.pkl, corpus.mm, timeseq.lst, dtm.gensim).
# The location is machine-specific, so it can be overridden through the
# NIPS_PAPERS_DIR environment variable; the original absolute path stays the
# default, so behaviour is unchanged when the variable is not set.
PATH = os.environ.get("NIPS_PAPERS_DIR", "/diskA/jethro/nips-papers")

In [5]:
import logging

# Timestamped INFO-level logging on the root logger, so library progress
# messages are shown underneath the cells that trigger them.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [6]:
from utils.timeit import timeit

In [7]:
import pandas as pd

# Loading the Dictionary and Corpus

In [8]:
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

2018-03-11 14:28:30,220 : INFO : 'pattern' package not found; tag filters are not available for English


In [9]:
# Load the previously saved gensim Dictionary from the data directory;
# it is later handed to DtmModel as `id2word`.
dct = Dictionary.load(f'{PATH}/dictionary.pkl')

2018-03-11 14:28:30,249 : INFO : loading Dictionary object from /diskA/jethro/nips-papers/dictionary.pkl
2018-03-11 14:28:30,273 : INFO : loaded /diskA/jethro/nips-papers/dictionary.pkl


In [10]:
# Open the serialized corpus (`corpus.mm`) from the data directory via gensim's MmCorpus reader.
corpus = MmCorpus(f'{PATH}/corpus.mm')

2018-03-11 14:28:30,485 : INFO : loaded corpus index from /diskA/jethro/nips-papers/corpus.mm.index
2018-03-11 14:28:30,485 : INFO : initializing corpus reader from /diskA/jethro/nips-papers/corpus.mm
2018-03-11 14:28:30,486 : INFO : accepted corpus with 7241 documents, 54254 features, 2350382 non-zero entries


In [11]:
import pickle
# Load the pickled `time_seq` object; it is later passed to DtmModel as `time_slices`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only point this at a file you produced yourself.
with open(f'{PATH}/timeseq.lst', 'rb') as f:
    time_seq = pickle.load(f)

In [12]:
from subprocess import call, check_call
import os
import os.path

# Fetch the prebuilt DTM binary once; nothing happens when "dtm-linux64" is
# already present in the working directory.
if not os.path.isfile("dtm-linux64"):
    # Download under a temporary name so that a failed or interrupted transfer
    # never leaves a truncated "dtm-linux64" behind that the isfile() guard
    # above would accept on the next run.
    # check_call raises CalledProcessError on a non-zero exit status, whereas
    # plain call() silently carried on to chmod (and later to training).
    check_call(["wget", "-O", "dtm-linux64.part", "https://github.com/magsilva/dtm/raw/master/bin/dtm-linux64"])
    check_call(["chmod", "+x", "dtm-linux64.part"])
    # Publish the finished, executable file under its final name.
    os.replace("dtm-linux64.part", "dtm-linux64")


In [13]:
# Path to the DTM binary; passed to DtmModel as its first argument in train_model.
DTM_EXECUTABLE = "./dtm-linux64"

# Training the Model

In [14]:
from gensim.models.wrappers import DtmModel

In [14]:
# Number of topics the model is trained with (passed as `num_topics` in
# train_model); also used further down as the chunk size when splitting the
# show_topics output.
NUM_TOPICS = 30

In [15]:
@timeit
def train_model(corpus, dct, time_slices=None, num_topics=None):
    """Fit a dynamic topic model on `corpus` through gensim's DtmModel wrapper.

    Args:
        corpus: Corpus handed to DtmModel unchanged.
        dct: Mapping handed to DtmModel as `id2word`.
        time_slices: Value for DtmModel's `time_slices`. Defaults to the
            notebook-level `time_seq`, which is what this function always
            used before the parameter existed.
        num_topics: Number of topics to fit. Defaults to the notebook-level
            `NUM_TOPICS`.

    Returns:
        The object produced by the DtmModel constructor (as passed back by the
        project-local @timeit decorator — presumably unchanged; the notebook
        calls `.save` on the result).
    """
    # Resolve the defaults at call time (not definition time) so that
    # re-running the config cells is picked up, exactly as the previous
    # global lookups behaved.
    if time_slices is None:
        time_slices = time_seq
    if num_topics is None:
        num_topics = NUM_TOPICS
    return DtmModel(DTM_EXECUTABLE, corpus, id2word=dct, time_slices=time_slices, num_topics=num_topics)

In [16]:
import os.path

# Training shells out to the DTM binary and is slow (the recorded run's log
# spans 22:00 -> 01:54), so reuse the model saved at f'{PATH}/dtm.gensim' when
# one exists and was trained with the current NUM_TOPICS; otherwise train.
# Delete that file to force retraining, e.g. after changing the corpus or the
# time slices (those changes cannot be detected here).
dtm = None
if os.path.isfile(f'{PATH}/dtm.gensim'):
    cached_dtm = DtmModel.load(f'{PATH}/dtm.gensim')
    # getattr: a saved object without the attribute just triggers retraining.
    if getattr(cached_dtm, 'num_topics', None) == NUM_TOPICS:
        dtm = cached_dtm
if dtm is None:
    dtm = train_model(corpus, dct)

2018-03-10 22:00:09,485 : INFO : serializing temporary corpus to /tmp/70040e_train-mult.dat
2018-03-10 22:00:09,485 : INFO : no word id mapping provided; initializing from corpus
2018-03-10 22:00:12,935 : INFO : storing corpus in Blei's LDA-C format into /tmp/70040e_train-mult.dat
2018-03-10 22:00:16,735 : INFO : saving vocabulary of 54254 words to /tmp/70040e_train-mult.dat.vocab
2018-03-10 22:00:16,778 : INFO : training DTM with args --ntopics=30 --model=dtm  --mode=fit --initialize_lda=true --corpus_prefix=/tmp/70040e_train --outname=/tmp/70040e_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6  --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 
2018-03-10 22:00:16,778 : INFO : Running command ['./dtm-linux64', '--ntopics=30', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/70040e_train', '--outname=/tmp/70040e_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--

In [17]:
# Persist the trained model in the data directory; the visualization section
# below loads it back from this same path.
dtm.save(f'{PATH}/dtm.gensim')

2018-03-11 01:54:13,873 : INFO : saving DtmModel object under /diskA/jethro/nips-papers/dtm.gensim, separately None
2018-03-11 01:54:13,873 : INFO : storing np array 'lambda_' to /diskA/jethro/nips-papers/dtm.gensim.lambda_.npy
2018-03-11 01:54:14,707 : INFO : storing np array 'obs_' to /diskA/jethro/nips-papers/dtm.gensim.obs_.npy
2018-03-11 01:54:15,566 : INFO : saved /diskA/jethro/nips-papers/dtm.gensim


# Visualizing the Results

In [46]:
# Number of time slices requested from show_topics below (passed as `times`).
# The later three-way unpacking `slice0, slice1, slice2 = chunks(...)` relies
# on this being 3.
NUM_TIMES=3

In [15]:
# Reload the saved model from disk; this rebinds `dtm`, so the visualization
# cells below can run without repeating the training cell.
dtm = DtmModel.load(f'{PATH}/dtm.gensim')

2018-03-11 14:28:42,676 : INFO : loading DtmModel object from /diskA/jethro/nips-papers/dtm.gensim
2018-03-11 14:28:43,972 : INFO : loading id2word recursively from /diskA/jethro/nips-papers/dtm.gensim.id2word.* with mmap=None
2018-03-11 14:28:43,972 : INFO : loading lambda_ from /diskA/jethro/nips-papers/dtm.gensim.lambda_.npy with mmap=None
2018-03-11 14:28:55,289 : INFO : loading obs_ from /diskA/jethro/nips-papers/dtm.gensim.obs_.npy with mmap=None
2018-03-11 14:29:02,193 : INFO : loaded /diskA/jethro/nips-papers/dtm.gensim


In [44]:
# Top 6 words per topic for NUM_TIMES time slices, as raw (prob, word) pairs
# rather than formatted strings (num_topics=-1 presumably selects every topic —
# confirm against gensim's DtmModel.show_topics).
# NOTE(review): the cells below split this flat list into chunks of 30 /
# NUM_TOPICS and treat each chunk as one time slice, i.e. they assume the list
# is ordered slice by slice — confirm.
topics = dtm.show_topics(formatted=False, num_words=6, num_topics=-1, times=NUM_TIMES)



In [24]:
from utils.array import chunks

In [83]:
def slice_to_dict(slce):
    """Map every topic of one time slice to the list of its words.

    `slce` is an iterable of topics, each topic being an iterable of
    (prob, word) pairs. The result is a dict keyed 'topic_<position>' (position
    within `slce`, starting at 0) whose values are the words in their original
    order; the probabilities are dropped.
    """
    return {
        f'topic_{position}': [term for _weight, term in pairs]
        for position, pairs in enumerate(slce)
    }

In [84]:
# Split the flat show_topics list into one chunk per time slice. The chunk size
# is the number of topics, so use NUM_TOPICS instead of repeating the literal
# 30 — this matches the chunking a few cells below and stays correct if the
# topic count is changed. The three-way unpacking still requires NUM_TIMES == 3.
slice0, slice1, slice2 = chunks(topics, NUM_TOPICS)

In [85]:
# First time slice as a table: one row per topic, one column per word position.
pd.DataFrame(slice_to_dict(slice0)).T

Unnamed: 0,0,1,2,3,4,5
topic_0,node,tree,graph,message,path,link
topic_1,operator,rbf,kernel,regression,spline,product
topic_10,rule,symbol,grammar,string,generalization,population
topic_11,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_12,classifier,classification,pattern,decision,label,tree
topic_13,speech,recognition,signal,word,speaker,phoneme
topic_14,region,group,gamma,mixture,event,component
topic_15,code,transformation,rotation,translation,digit,invariance
topic_16,neuron,memory,circuit,chip,analog,connection
topic_17,prediction,risk,loss,predictor,minimization,hypothesis


In [86]:
# Second time slice as a table: one row per topic, one column per word position.
pd.DataFrame(slice_to_dict(slice1)).T

Unnamed: 0,0,1,2,3,4,5
topic_0,node,tree,graph,message,path,edge
topic_1,operator,rbf,kernel,regression,spline,product
topic_10,rule,symbol,grammar,string,generalization,population
topic_11,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_12,classifier,classification,pattern,decision,label,tree
topic_13,speech,recognition,signal,word,speaker,phoneme
topic_14,region,group,gamma,mixture,event,component
topic_15,code,transformation,rotation,translation,digit,invariance
topic_16,neuron,memory,circuit,chip,analog,voltage
topic_17,prediction,risk,loss,predictor,minimization,hypothesis


In [87]:
# Third time slice as a table: one row per topic, one column per word position.
pd.DataFrame(slice_to_dict(slice2)).T

Unnamed: 0,0,1,2,3,4,5
topic_0,node,tree,graph,path,message,edge
topic_1,operator,rbf,kernel,regression,spline,product
topic_10,rule,symbol,grammar,string,generalization,knowledge
topic_11,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_12,classifier,classification,pattern,decision,label,accuracy
topic_13,speech,recognition,signal,word,speaker,phoneme
topic_14,region,group,gamma,mixture,event,component
topic_15,code,transformation,rotation,digit,translation,invariance
topic_16,memory,neuron,circuit,chip,analog,voltage
topic_17,prediction,risk,loss,predictor,minimization,hypothesis


In [94]:
# One entry per (topic, slice) pair: key 'topic_<t>_slice_<s>' maps to the words
# of the t-th topic inside the s-th NUM_TOPICS-sized chunk of `topics`.
d = {
    f'topic_{topic_pos}_slice_{chunk_pos}': [term for _weight, term in pairs]
    for chunk_pos, block in enumerate(chunks(topics, NUM_TOPICS))
    for topic_pos, pairs in enumerate(block)
}

In [98]:
# Show the first 30 rows with the slices of each topic next to each other.
# The row labels are sorted explicitly: older pandas sorted dict keys when
# building a DataFrame, while newer versions keep insertion order (all of
# slice 0 first). Sorting gives the grouped-by-topic view on either.
pd.DataFrame(d).transpose().sort_index()[:30]

Unnamed: 0,0,1,2,3,4,5
topic_0_slice_0,node,tree,graph,message,path,link
topic_0_slice_1,node,tree,graph,message,path,edge
topic_0_slice_2,node,tree,graph,path,message,edge
topic_10_slice_0,rule,symbol,grammar,string,generalization,population
topic_10_slice_1,rule,symbol,grammar,string,generalization,population
topic_10_slice_2,rule,symbol,grammar,string,generalization,knowledge
topic_11_slice_0,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_11_slice_1,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_11_slice_2,curve,expression,eigenvalue,eigenvectors,gene,patient
topic_12_slice_0,classifier,classification,pattern,decision,label,tree


In [40]:
# Top 10 words of a single topic (num_topics=1) for up to 10 time slices, as
# raw (prob, word) pairs.
# NOTE: this rebinds `topics`, replacing the all-topics list built earlier —
# earlier cells that read `topics` must not be re-run after this one without
# recomputing it first.
topics = dtm.show_topics(formatted=False, num_words=10, num_topics=1, times=10)



In [44]:
# `topics` holds one entry per time slice for a single topic here (it was
# requested with num_topics=1), so its position in the list is the slice number.
# No chunking is needed: chunking while ignoring the chunk index would let
# every chunk after the first overwrite the same 'slice_<n>' keys.
d = {
    f'slice_{slice_id}': [word for _prob, word in words]
    for slice_id, words in enumerate(topics)
}
# One row per time slice, one column per word position.
pd.DataFrame(d).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
slice_0,node,tree,graph,message,path,link,edge,cycle,branch,parent
slice_1,node,tree,graph,message,path,edge,link,cycle,branch,parent
slice_2,node,tree,graph,path,message,edge,link,cycle,branch,parent
slice_3,node,tree,graph,path,message,edge,link,cycle,parent,branch
slice_4,node,tree,graph,path,edge,message,link,parent,cycle,level
slice_5,node,tree,graph,path,edge,message,link,parent,level,leaf
slice_6,node,tree,graph,path,edge,message,link,parent,leaf,level
slice_7,node,tree,graph,path,edge,message,link,parent,leaf,propagation
slice_8,node,tree,graph,path,edge,message,parent,propagation,belief,link
slice_9,node,tree,graph,path,edge,belief,message,propagation,parent,leaf
