In [2]:
import html
import os

import numpy as np
from IPython.core.display import HTML, display
from ipywidgets import widgets
from pandas import DataFrame

from hlda.sampler import HierarchicalLDA
from util.file_loader import FileLoader

In [3]:
# Collect every "*only.txt" file in the assembly directory and load its tokens.
fl = FileLoader()
# NOTE(review): `dir` shadows the builtin dir(); the name is kept because
# other cells may reference it.
dir = "cpp_examples/assembly/"
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting keeps the order in which files are registered (and hence,
# presumably, the document indices used further down) stable across runs.
files = sorted(os.listdir(dir))
asm_files = []  # never populated in the visible cells; retained for compatibility
for f in files:
    if f.endswith("only.txt"):
        fl.addFilename(os.path.join(dir, f))

# np.dtype(str) is the same unicode dtype as the legacy 'unicode_' alias, but
# does not rely on a spelling tied to np.unicode_ (removed in NumPy 2.0).
data_files = fl.getData(type=np.dtype(str))

In [4]:
def construct_corpus(d):
    """Turn loaded documents into an integer-encoded corpus.

    Parameters
    ----------
    d : iterable
        Each item is indexable and holds the document's token sequence at
        position 1 (position 0 is not used here).

    Returns
    -------
    corpus : list of list of int
        One list per document; every token is replaced by its vocabulary id.
    vocab : list
        All distinct tokens across the documents, in sorted order.
    word_to_id : dict
        Maps each token to its index in ``vocab``.
    docs : list
        The original token sequences, in input order (same objects as in ``d``).
    """
    # Pull out the token sequence of every entry in a single pass over `d`.
    docs = [entry[1] for entry in d]

    # Vocabulary = sorted union of every token that appears in any document.
    vocab = sorted(set().union(*docs))
    word_to_id = {token: index for index, token in enumerate(vocab)}

    # Encode each document token-by-token using the vocabulary index.
    corpus = [[word_to_id[token] for token in tokens] for tokens in docs]

    return corpus, vocab, word_to_id, docs

In [5]:
# Encode the loaded files: X is the corpus of per-document word-id lists,
# vocab the sorted list of distinct words, word_id the word -> id mapping and
# docs the original (un-encoded) word sequences, one per file.
X, vocab, word_id, docs = construct_corpus(data_files)

In [6]:
# Sampler settings and model hyperparameters passed to HierarchicalLDA / estimate().
n_samples = 500       # number of iterations for the sampler
alpha = .1           # smoothing over level distributions
gamma = 1.0           # CRP smoothing parameter; number of imaginary customers at next, as yet unused table
eta = 0.1             # smoothing over topic-word distributions
num_levels = 2        # the number of levels in the tree
display_topics = 50   # the number of iterations between printing a brief summary of the topics so far
n_words = 5           # the number of most probable words to print for each topic after model estimation
with_weights = True   # forwarded to estimate(); presumably adds the per-word counts shown in parentheses in the summaries -- TODO confirm

In [7]:
# Build the hierarchical LDA model over the encoded corpus with the
# hyperparameters configured above.
hlda = HierarchicalLDA(X, vocab, alpha=alpha, gamma=gamma, eta=eta, num_levels=num_levels)
# Run the sampler for n_samples iterations; a topic summary is printed every
# display_topics iterations (see the captured output below this cell).
hlda.estimate(n_samples, display_topics=display_topics, n_words=n_words, with_weights=with_weights)

HierarchicalLDA sampling
.................................................. 50
topic 0 (level=0, total_words=2663, documents=16): mov (418), lea (312), push (243), callq (236), pop (215), 
    topic 1 (level=1, total_words=1284, documents=14): mov (749), add (133), cltq (61), sar (46), lea (26), 
    topic 4 (level=1, total_words=452, documents=2): mov (257), add (42), sub (23), addl (20), cltq (18), 
.................................................. 100
topic 0 (level=0, total_words=2929, documents=16): mov (609), lea (354), push (249), callq (236), pop (235), 
    topic 1 (level=1, total_words=1064, documents=14): mov (596), add (132), sub (72), cltq (55), cmp (37), 
    topic 4 (level=1, total_words=406, documents=2): mov (219), add (42), sub (28), addl (20), cmp (19), 
.................................................. 150
topic 0 (level=0, total_words=3303, documents=16): mov (744), lea (321), push (269), callq (236), pop (220), 
    topic 1 (level=1, total_words=550, documents=1

In [10]:
# Colour used to render each tree level (0 = root) in the HTML output.
colour_map = {
    0: 'blue',
    1: 'black',
    2: 'green'
}

# Colour for any level that has no entry in colour_map (e.g. num_levels > 3).
fallback_colour = 'grey'


def show_doc(d=0):
    """Display document ``d`` with its topic path and per-word level colours.

    First renders one heading per node on the path from the root of the topic
    tree down to the leaf assigned to the document, showing that node's top
    words. Then renders the document itself, each word coloured according to
    the level it is assigned to in ``hlda.levels``.

    Parameters
    ----------
    d : int, default 0
        Index of the document in ``docs`` / ``data_files``.

    Returns
    -------
    None
        Output is produced through IPython's ``display``.
    """
    # Walk from the document's leaf up to the root, then flip to root-first.
    path = []
    node = hlda.document_leaves[d]
    while node is not None:
        path.append(node)
        node = node.parent
    path.reverse()

    n_words = 10
    with_weights = False
    for depth, node in enumerate(path):
        # Unknown depths fall back to a neutral colour instead of raising
        # KeyError when the tree has more levels than colour_map has entries.
        colour = colour_map.get(depth, fallback_colour)
        msg = 'Level %d Topic %d: ' % (node.level, node.node_id)
        msg += node.get_top_words(n_words, with_weights)
        # HTML only defines <h1>..<h6>; the closing tag must match the opening
        # one (the previous version always closed with </h3>).
        heading = min(depth + 1, 6)
        output = '<h%d><span style="color:%s">%s</span></h%d>' % (
            heading, colour, html.escape(msg), heading)
        display(HTML(output))

    # Escape file-derived text so characters such as '<' or '&' are shown
    # literally instead of being parsed as markup.
    title = html.escape(data_files[d][0].getFullName())
    display(HTML('<hr/><h5>Processed Document: ' + title + '</h5>'))

    doc = docs[d]
    levels = hlda.levels[d]
    # Build all spans first and join once rather than growing a string in a loop.
    spans = [
        '<span style="color:%s">%s</span> ' % (
            colour_map.get(levels[n], fallback_colour), html.escape(str(word)))
        for n, word in enumerate(doc)
    ]
    display(HTML(''.join(spans)))

In [9]:
# Slider over all document indices; the trailing ';' stops the notebook from
# echoing the returned function object below the widget.
widgets.interact(show_doc, d=(0, len(docs)-1));

interactive(children=(IntSlider(value=0, description='d', max=15), Output()), _dom_classes=('widget-interact',…

<function __main__.show_doc(d=0)>