# LDA using Gibb's sampling

Op basis van uitleg in http://u.cs.biu.ac.il/~89-680/darling-lda.pdf

Using Gensim functionality

Dus we mogen ervan uitgaan dat we een dictionary hebben en bow vectors voor de documenten.
Het een en ander aan counts komt dan uit gensim. Maar hoe?

Wat hebben we nodig?

* The number of words assigned to topic k in document d: $$n_{d;k}$$
* The number of times word w is assigned to topic k: $$n_{k;w}$$
* The total number of times any word is assigned to topic k: $$n_k$$
* Array z which will contain the current topic assignment for each of the N words in the corpus.


In [1]:
# Generate documents
import numpy as np

length = 100
num_topics = 3
vocabulary = np.array(['zon', 'ijs', 'strand', 'vanille', 'chocola', 'broccoli', 'wortel'])

real_theta = np.array([[0.5, 0.0, 0.5],
                       [0.3, 0.3, 0.4],
                       [0.1, 0.8, 0.1],
                       [0.7, 0.3, 0.0],
                       [0.4, 0.4, 0.2]])
real_phi = np.array([[0.4, 0.2, 0.4, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.3, 0.0, 0.35, 0.35, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5]])

In [2]:
# generate the corpus
corpus = []

for m, tm in enumerate(real_theta):
    doc = []
    for i in range(length):
        # sample topic
        topic = np.random.multinomial(1, tm)
        topic_index = np.where(topic==1)[0][0]
        word = np.random.multinomial(1, real_phi[topic_index])
        doc.append(vocabulary[np.where(word==1)[0][0]])
    corpus.append(doc)
    
for i, doc in enumerate(corpus):
    print 'document', i+1, doc



document 1 ['wortel', 'wortel', 'zon', 'wortel', 'zon', 'strand', 'zon', 'ijs', 'zon', 'strand', 'wortel', 'strand', 'zon', 'strand', 'wortel', 'broccoli', 'ijs', 'wortel', 'strand', 'ijs', 'strand', 'wortel', 'strand', 'broccoli', 'strand', 'wortel', 'broccoli', 'ijs', 'wortel', 'strand', 'strand', 'broccoli', 'strand', 'broccoli', 'zon', 'wortel', 'wortel', 'strand', 'broccoli', 'broccoli', 'zon', 'broccoli', 'ijs', 'strand', 'zon', 'ijs', 'broccoli', 'zon', 'wortel', 'broccoli', 'strand', 'zon', 'zon', 'ijs', 'broccoli', 'zon', 'wortel', 'strand', 'broccoli', 'wortel', 'wortel', 'wortel', 'broccoli', 'broccoli', 'wortel', 'wortel', 'zon', 'wortel', 'strand', 'broccoli', 'zon', 'broccoli', 'wortel', 'wortel', 'wortel', 'ijs', 'wortel', 'ijs', 'ijs', 'wortel', 'strand', 'broccoli', 'strand', 'wortel', 'wortel', 'wortel', 'broccoli', 'zon', 'wortel', 'wortel', 'strand', 'strand', 'broccoli', 'broccoli', 'ijs', 'zon', 'wortel', 'strand', 'wortel', 'wortel']
document 2 ['broccoli', 'zon'

In [3]:
from gensim import corpora

d = corpora.Dictionary(corpus)

In [4]:
# initialize z
import random
from collections import Counter

z = {}
ndk = np.zeros((len(corpus), len(real_phi)), dtype=np.int)
nkw = np.zeros((len(real_phi), len(vocabulary)), dtype=np.int)
nk = Counter()

for d, doc in enumerate(corpus):
    for i, word in enumerate(doc):
        t = random.randint(0,2)
        z[(d, i)] = t
        ndk[d][t] += 1
        word_index = np.where(vocabulary==word)[0][0]
        nkw[t][word_index] += 1
        nk[t] += 1

print z
print ndk
print np.sum(ndk)
print nkw
print np.sum(nkw)
print nk

{(0, 86): 2, (3, 35): 0, (4, 36): 0, (2, 84): 1, (0, 76): 0, (4, 66): 0, (1, 64): 0, (2, 78): 2, (0, 98): 2, (3, 86): 1, (0, 17): 2, (1, 28): 1, (2, 27): 1, (0, 55): 0, (3, 2): 2, (1, 54): 1, (4, 5): 1, (2, 53): 1, (0, 45): 2, (3, 40): 0, (1, 40): 1, (4, 35): 2, (2, 47): 0, (0, 67): 2, (4, 73): 1, (1, 89): 0, (2, 73): 1, (3, 95): 2, (4, 87): 2, (2, 99): 1, (1, 21): 1, (2, 18): 2, (0, 14): 1, (3, 11): 0, (1, 15): 0, (4, 12): 2, (2, 12): 1, (0, 36): 0, (3, 17): 1, (1, 33): 2, (4, 42): 0, (2, 38): 1, (0, 90): 0, (3, 55): 1, (4, 48): 2, (1, 82): 0, (2, 64): 1, (3, 68): 1, (4, 94): 0, (0, 5): 1, (1, 0): 1, (4, 11): 1, (2, 7): 0, (0, 59): 0, (3, 22): 0, (1, 58): 2, (4, 17): 0, (2, 33): 2, (0, 81): 2, (3, 60): 0, (4, 63): 1, (2, 91): 0, (3, 77): 0, (4, 69): 1, (1, 77): 2, (3, 83): 1, (4, 99): 2, (0, 28): 0, (1, 25): 0, (2, 30): 2, (0, 50): 0, (3, 31): 0, (1, 51): 0, (4, 24): 1, (2, 56): 0, (0, 40): 1, (3, 37): 2, (4, 38): 0, (2, 82): 0, (0, 78): 2, (4, 76): 0, (1, 70): 0, (2, 76): 0, (3, 88):

In [5]:
# iterate

def p_z(alpha, beta, num_topics, num_words, d, topic, word_index):
    #print alpha, beta, num_topics, num_words, d, topic, word_index
    #print ndk[d][topic], nkw[topic][word_index], nk[topic]
    return (ndk[d][topic]*alpha)*(nkw[topic][word_index]+beta)/(nk[topic]+beta*num_words)

def normalize(p):
    minimum = np.min(p)
    maximum = np.max(p)
    
    if minimum < 0:
        return normalize((p - minimum)/(maximum - minimum))
    return p/sum(p)

num_iter = 100

alpha = 0.02
beta = 0.02

theta = np.zeros((num_iter,  len(corpus), num_topics))
phi = np.zeros((num_iter, num_topics, len(vocabulary)))

for t in range(num_iter):
    for d, doc in enumerate(corpus):
        for i, w in enumerate(doc):
            word = (d, i)
            topic = z[word]
                        
            word_index = np.where(vocabulary==w)[0][0]
                
            #if ndk[d][topic] > 0 and nkw[topic][word_index] > 0 and nk[topic] > 0:
            if True:
                ndk[d][topic] -= 1
                nkw[topic][word_index] -= 1
                nk[topic] -= 1
            
                p = [p_z(alpha, beta, len(real_phi), len(vocabulary), d, j, word_index) for j in range(len(real_phi))]
                #print p
                # normalize
                p = normalize(p)
                #print p, sum(p)
            
                to = np.random.multinomial(1, p)
                topic = np.where(to==1)[0][0]
                
                z[word] = topic
            
                ndk[d][topic] += 1
                nkw[topic][word_index] += 1
                nk[topic] += 1
    # calculate theta and phi
    theta[t] = (ndk+float(alpha))/(np.sum(ndk, axis=1, keepdims=True)+num_topics*alpha)
    phi[t] = (nkw+float(beta))/(np.sum(nkw, axis=1, keepdims=True)+len(vocabulary)*beta)
            #print 'new topic', topic
print ndk
print nkw
print nk

[[  0 100   0]
 [ 40  60   0]
 [  0  28  72]
 [100   0   0]
 [ 48  52   0]]
[[39 40 43 29 37  0  0]
 [38 42 40  0  0 60 60]
 [ 0 11  0 25 35  0  1]]
Counter({1: 240, 0: 188, 2: 72})


In [6]:
print 'theta'
print 'found'
print np.mean(theta, axis=0)
print 'real'
print real_theta
print
print 'phi'
print 'found'
print np.mean(phi, axis=0)
print 'real'
print real_phi
print
print 'topics found:'
indexes = np.mean(phi, axis=0) > 0.01
for index in indexes:
    print vocabulary[index]
print
print 'topics real'
indexes = real_phi > 0.01
for index in indexes:
    print vocabulary[index]

theta
found
[[ 0.02118729  0.96851889  0.01029382]
 [ 0.46162303  0.53587847  0.0024985 ]
 [ 0.0089946   0.20987408  0.78113132]
 [ 0.97351589  0.00709574  0.01938837]
 [ 0.57655407  0.35078953  0.07265641]]
real
[[ 0.5  0.   0.5]
 [ 0.3  0.3  0.4]
 [ 0.1  0.8  0.1]
 [ 0.7  0.3  0. ]
 [ 0.4  0.4  0.2]]

phi
found
[[ 0.22029314  0.24122749  0.24135615  0.1311584   0.15851712  0.00306462
   0.00438308]
 [ 0.14848169  0.11374624  0.160112    0.00376607  0.00869966  0.28049288
   0.28470146]
 [ 0.01052358  0.22286347  0.00354163  0.30618113  0.43568009  0.01200256
   0.00920754]]
real
[[ 0.4   0.2   0.4   0.    0.    0.    0.  ]
 [ 0.    0.3   0.    0.35  0.35  0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.5   0.5 ]]

topics found:
['zon' 'ijs' 'strand' 'vanille' 'chocola']
['zon' 'ijs' 'strand' 'broccoli' 'wortel']
['zon' 'ijs' 'vanille' 'chocola' 'broccoli']

topics real
['zon' 'ijs' 'strand']
['ijs' 'vanille' 'chocola']
['broccoli' 'wortel']
