In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
basedir = '../'
sys.path.append(basedir)

import numpy as np
from numpy.random import RandomState
import pandas as pd
from IPython.display import display

from synth_data import HldaDataGenerator
from hlda.sampler import NCRPNode

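Synthetic-data test for hierarchical LDA (hLDA) inference: generate a corpus from a known topic tree, then run the hLDA sampler on it and compare the inferred topics with the generating ones.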

# 1. Generate Vocab

In [2]:
# Build an n_rows x n_cols grid of unique word labels 'w0', 'w1', ... filled
# in row-major order.
n_rows = 20
n_cols = 5
# Use the builtin `object` dtype: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
vocab_mat = np.zeros((n_rows, n_cols), dtype=object)
word_count = 0
for i in range(n_rows):
    for j in range(n_cols):
        vocab_mat[i, j] = 'w%s' % word_count
        word_count += 1

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(vocab_mat)

[['w0' 'w1' 'w2' 'w3' 'w4']
 ['w5' 'w6' 'w7' 'w8' 'w9']
 ['w10' 'w11' 'w12' 'w13' 'w14']
 ['w15' 'w16' 'w17' 'w18' 'w19']
 ['w20' 'w21' 'w22' 'w23' 'w24']
 ['w25' 'w26' 'w27' 'w28' 'w29']
 ['w30' 'w31' 'w32' 'w33' 'w34']
 ['w35' 'w36' 'w37' 'w38' 'w39']
 ['w40' 'w41' 'w42' 'w43' 'w44']
 ['w45' 'w46' 'w47' 'w48' 'w49']
 ['w50' 'w51' 'w52' 'w53' 'w54']
 ['w55' 'w56' 'w57' 'w58' 'w59']
 ['w60' 'w61' 'w62' 'w63' 'w64']
 ['w65' 'w66' 'w67' 'w68' 'w69']
 ['w70' 'w71' 'w72' 'w73' 'w74']
 ['w75' 'w76' 'w77' 'w78' 'w79']
 ['w80' 'w81' 'w82' 'w83' 'w84']
 ['w85' 'w86' 'w87' 'w88' 'w89']
 ['w90' 'w91' 'w92' 'w93' 'w94']
 ['w95' 'w96' 'w97' 'w98' 'w99']]


In [3]:
# Flatten the word grid (row by row) into the flat vocabulary list.
vocab = vocab_mat.flatten().tolist()
# Call form of print: identical output on Python 2, valid on Python 3.
print(vocab)

['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7', 'w8', 'w9', 'w10', 'w11', 'w12', 'w13', 'w14', 'w15', 'w16', 'w17', 'w18', 'w19', 'w20', 'w21', 'w22', 'w23', 'w24', 'w25', 'w26', 'w27', 'w28', 'w29', 'w30', 'w31', 'w32', 'w33', 'w34', 'w35', 'w36', 'w37', 'w38', 'w39', 'w40', 'w41', 'w42', 'w43', 'w44', 'w45', 'w46', 'w47', 'w48', 'w49', 'w50', 'w51', 'w52', 'w53', 'w54', 'w55', 'w56', 'w57', 'w58', 'w59', 'w60', 'w61', 'w62', 'w63', 'w64', 'w65', 'w66', 'w67', 'w68', 'w69', 'w70', 'w71', 'w72', 'w73', 'w74', 'w75', 'w76', 'w77', 'w78', 'w79', 'w80', 'w81', 'w82', 'w83', 'w84', 'w85', 'w86', 'w87', 'w88', 'w89', 'w90', 'w91', 'w92', 'w93', 'w94', 'w95', 'w96', 'w97', 'w98', 'w99']


# 2. Assign Documents to Tree

In [6]:
# Reset the class-level counters before building a new tree (presumably so
# that node ids restart from 0 when this cell is re-run -- confirm against
# NCRPNode).
NCRPNode.total_nodes = 0
NCRPNode.last_node_id = 0
num_levels = 3
gamma = 1
num_docs = 100

root_node = NCRPNode(num_levels, vocab)
document_path = {}
unique_nodes = set()
unique_nodes.add(root_node)
for d in range(num_docs):

    # populate nodes into the path of this document
    # (builtin `object` replaces the `np.object` alias removed in NumPy 1.24)
    path = np.zeros(num_levels, dtype=object)
    path[0] = root_node
    root_node.customers += 1 # always add to the root node first
    for level in range(1, num_levels):
        # at each level, a node is selected by its parent node based on the CRP prior
        parent_node = path[level-1]
        level_node = parent_node.select(gamma)
        level_node.customers += 1
        path[level] = level_node
        unique_nodes.add(level_node)

    # record the full root-to-leaf path of this document
    document_path[d] = path

# Order the visited nodes by id so they can be addressed by position later.
unique_nodes = sorted(unique_nodes, key=lambda x: x.node_id)
print(len(unique_nodes))
    
def print_node(node, indent, node_topic):
    """Print `node` and, recursively, all of its descendants as an indented tree.

    Parameters
    ----------
    node : object
        Must expose `node_id`, `level`, `customers` and an iterable `children`.
    indent : int
        Nesting depth; each level is rendered as four leading spaces.
    node_topic : dict
        Maps a node to a `(probs, words)` pair. When `node` is a key, its
        words are appended to the printed line, separated by spaces.
    """
    out = '    ' * indent
    out += 'node %d (level=%d, documents=%d): ' % (node.node_id, node.level, node.customers)
    if node in node_topic:
        _, words = node_topic[node]  # the probabilities are not displayed
        out += ' '.join(words)
    # Call form of print: identical output on Python 2, valid on Python 3.
    print(out)
    for child in node.children:
        print_node(child, indent+1, node_topic)

# With an empty node -> topic mapping, only the bare tree structure is printed.
node_topic = dict()
print_node(root_node, indent=0, node_topic=node_topic)

10
node 0 (level=0, documents=100): 
    node 1 (level=1, documents=97): 
        node 2 (level=2, documents=42): 
        node 3 (level=2, documents=22): 
        node 4 (level=2, documents=30): 
        node 5 (level=2, documents=2): 
        node 9 (level=2, documents=1): 
    node 6 (level=1, documents=3): 
        node 7 (level=2, documents=2): 
        node 8 (level=2, documents=1): 


# 3. Assign Each Node Along the Tree to a Topic

In [7]:
def get_words(vocab_mat, eta, pos, dim):
    """Select one row or column of `vocab_mat` and draw a word distribution over it.

    Parameters
    ----------
    vocab_mat : 2-D array
        Grid of words.
    eta : float
        Symmetric Dirichlet concentration, repeated once per selected word.
    pos : int
        Index of the row or column to select.
    dim : {'row', 'col'}
        Whether `pos` indexes a row or a column.

    Returns
    -------
    tuple
        `(probs, words)`: `probs` is a single Dirichlet draw with one entry
        per selected word, `words` is the selected slice of `vocab_mat`.

    Raises
    ------
    ValueError
        If `dim` is neither 'row' nor 'col'. (Previously this fell through
        and failed with an UnboundLocalError on `words`.)
    """
    if dim == 'row':
        words = vocab_mat[pos]
    elif dim == 'col':
        words = vocab_mat[:, pos]
    else:
        raise ValueError("dim must be 'row' or 'col', got %r" % (dim,))

    # Keep the scalar `eta` intact instead of rebinding it to a list.
    concentration = [eta] * len(words)
    probs = np.random.dirichlet(concentration)
    return probs, words
    
# Sanity check: draw a distribution over the first column of the grid and
# confirm that the probabilities sum to 1.
pos = 0
eta = 1
probs, words = get_words(vocab_mat, eta, pos, 'col')
# Call form of print: identical output on Python 2, valid on Python 3.
print(probs)
print(words)
print(np.sum(probs))

[ 0.07877895  0.06869983  0.03838261  0.06514887  0.00435754  0.06984148
  0.03655306  0.01200188  0.17818882  0.02428382  0.00138845  0.01088763
  0.01633268  0.05047367  0.03911636  0.0899958   0.0893677   0.0110616
  0.04605451  0.06908474]
['w0' 'w5' 'w10' 'w15' 'w20' 'w25' 'w30' 'w35' 'w40' 'w45' 'w50' 'w55'
 'w60' 'w65' 'w70' 'w75' 'w80' 'w85' 'w90' 'w95']
1.0


In [8]:
# Give every tree node its own row of the vocabulary grid as its topic:
# the i-th node (in node-id order) gets row i. The tree is sampled randomly,
# so the number of nodes varies between runs; iterate over whatever nodes
# exist instead of hard-coding ten indices.
if len(unique_nodes) > vocab_mat.shape[0]:
    raise ValueError('%d tree nodes but only %d vocabulary rows available'
                     % (len(unique_nodes), vocab_mat.shape[0]))

node_topic = {}
for row, node in enumerate(unique_nodes):
    node_topic[node] = get_words(vocab_mat, eta, row, 'row')
print(len(node_topic))

10


In [9]:
# Show the tree again, now with the words assigned to each node.
print_node(root_node, indent=0, node_topic=node_topic)

node 0 (level=0, documents=100): w0 w1 w2 w3 w4
    node 1 (level=1, documents=97): w5 w6 w7 w8 w9
        node 2 (level=2, documents=42): w10 w11 w12 w13 w14
        node 3 (level=2, documents=22): w15 w16 w17 w18 w19
        node 4 (level=2, documents=30): w20 w21 w22 w23 w24
        node 5 (level=2, documents=2): w25 w26 w27 w28 w29
        node 9 (level=2, documents=1): w45 w46 w47 w48 w49
    node 6 (level=1, documents=3): w30 w31 w32 w33 w34
        node 7 (level=2, documents=2): w35 w36 w37 w38 w39
        node 8 (level=2, documents=1): w40 w41 w42 w43 w44


# 4. Generate Words in a Document Based on Its Path

In [10]:
def generate_document(topics, theta, doc_len):
    """Sample a document of `doc_len` words from a mixture of topics.

    Parameters
    ----------
    topics : sequence of (probs, words) pairs
        One entry per topic: a probability vector and the words it ranges over.
    theta : sequence of float
        Mixing proportions over `topics`.
    doc_len : int
        Number of words to generate.

    Returns
    -------
    list
        The sampled words, in generation order.
    """
    def draw_word():
        # First choose which topic emits this word ...
        topic_idx = np.argmax(np.random.multinomial(1, theta))
        word_probs, topic_words = topics[topic_idx]
        # ... then choose the word from that topic's distribution.
        word_idx = np.argmax(np.random.multinomial(1, word_probs))
        return topic_words[word_idx]

    return [draw_word() for _ in range(doc_len)]

In [12]:
# Dirichlet parameter for each document's mixing proportions over the nodes
# of its path (one entry per level).
# alternative: alpha = [2.0, 1.0, 0.5]
alpha = [1.0, 1.0, 1.0]
doc_len = 50

corpus = []
for d in range(num_docs):
    path = document_path[d]
    # (probs, words) of every node on this document's path, root first
    topics = [node_topic[node] for node in path]
    # Use the public np.random.dirichlet (as get_words does) rather than
    # reaching into the np.random.mtrand implementation module.
    theta = np.random.dirichlet(alpha)
    doc = generate_document(topics, theta, doc_len)
    corpus.append(doc)

In [13]:
import os

# Output directory for the generated documents. The default is the original
# hard-coded location; set the HLDA_SYNTH_OUTDIR environment variable to write
# elsewhere without editing the notebook.
outdir = os.environ.get('HLDA_SYNTH_OUTDIR',
                        '/Users/joewandy/Dropbox/Analysis/hLDA/data/synthetic/')
# Create the directory up front so the first open() cannot fail on a missing path.
if not os.path.isdir(outdir):
    os.makedirs(outdir)

# One file per document, holding a single line of space-separated words.
for d, doc in enumerate(corpus):
    file_name = 'doc_%d.txt' % d
    file_path = os.path.join(outdir, file_name)
    with open(file_path, 'w') as f:
        f.write("%s\n" % ' '.join(doc))

# 5. Run hLDA

In [14]:
# Vocabulary size, number of documents, and length of the first document.
# A format string keeps the space-separated output identical on Python 2 and 3.
print('%d %d %d' % (len(vocab), len(corpus), len(corpus[0])))

100 100 50


Convert the words of each document into their vocabulary indices.

In [15]:
# Build the word -> index table once instead of scanning `vocab` with
# list.index() for every token. setdefault keeps the first position of a
# word, matching what list.index() returns.
word_to_idx = {}
for idx, vocab_word in enumerate(vocab):
    word_to_idx.setdefault(vocab_word, idx)

# Same documents as `corpus`, with every word replaced by its vocabulary index.
# A word missing from `vocab` now raises KeyError (list.index raised ValueError).
new_corpus = []
for doc in corpus:
    new_corpus.append([word_to_idx[word] for word in doc])

In [16]:
# Compare the first document in word form and in index form.
# The format string keeps the output identical on Python 2 and 3.
print('%d %d' % (len(vocab), len(new_corpus)))
print(corpus[0])
print(new_corpus[0])

100 100
['w0', 'w8', 'w13', 'w0', 'w13', 'w10', 'w3', 'w0', 'w0', 'w11', 'w5', 'w11', 'w11', 'w1', 'w11', 'w0', 'w3', 'w0', 'w13', 'w14', 'w3', 'w13', 'w3', 'w3', 'w5', 'w3', 'w3', 'w1', 'w1', 'w0', 'w11', 'w1', 'w3', 'w11', 'w12', 'w3', 'w3', 'w1', 'w3', 'w11', 'w3', 'w5', 'w0', 'w0', 'w12', 'w13', 'w13', 'w8', 'w1', 'w11']
[0, 8, 13, 0, 13, 10, 3, 0, 0, 11, 5, 11, 11, 1, 11, 0, 3, 0, 13, 14, 3, 13, 3, 3, 5, 3, 3, 1, 1, 0, 11, 1, 3, 11, 12, 3, 3, 1, 3, 11, 3, 5, 0, 0, 12, 13, 13, 8, 1, 11]


In [18]:
from hlda.sampler import HierarchicalLDA

In [19]:
# Hyperparameters used to generate the synthetic data.
# The format string keeps the space-separated output identical on Python 2 and 3.
print('%s %s %s' % (alpha, gamma, eta))

[1.0, 1.0, 1.0] 1 1


In [20]:
n_samples = 100

# The hyperparameters are passed as literals rather than through the
# `alpha` / `gamma` / `eta` variables used for data generation.
# NOTE(review): `alpha` is the scalar 1 here, whereas generation used the
# list [1.0, 1.0, 1.0] -- confirm which form the sampler expects.
hlda = HierarchicalLDA(
    new_corpus,
    vocab,
    alpha=1,
    gamma=1.0,
    eta=1.0,
    num_levels=3,
)
hlda.estimate(
    n_samples,
    display_topics=10,
    n_words=10,
    with_weights=False,
)

HierarchicalLDA sampling
.......... 10
topic 0 (level=0, total_words=1741, documents=100): w1, w3, w0, w4, w8, w13, w7, w5, w11, w14, 
    topic 1 (level=1, total_words=1581, documents=96): w5, w8, w7, w6, w9, w3, w4, w1, w21, w25, 
        topic 2 (level=2, total_words=750, documents=42): w11, w13, w14, w10, w12, w5, w7, w8, w1, w6, 
        topic 3 (level=2, total_words=429, documents=27): w22, w21, w24, w23, w20, w1, w3, w0, w4, w5, 
        topic 6 (level=2, total_words=300, documents=19): w17, w15, w19, w16, w18, w7, w4, w9, w6, w8, 
        topic 16 (level=2, total_words=30, documents=1): w25, w29, w27, w26, w28, w36, w30, w31, w32, w33, 
        topic 18 (level=2, total_words=28, documents=4): w49, w47, w46, w1, w4, w45, w39, w37, w34, w27, 
        topic 19 (level=2, total_words=32, documents=3): w29, w25, w27, w28, w24, w21, w20, w26, w41, w40, 
    topic 7 (level=1, total_words=46, documents=4): w7, w8, w5, w6, w1, w3, w9, w33, w36, w35, 
        topic 20 (level=2, total_word