# PGF for LDA

In [1]:
import algopy
import numpy as np
import numpy.random as rn
from algopy import UTPM
from scipy.stats import binom, multinomial

### Define dummy data

In [2]:
# Fixed alternative to the random draws below (kept for reference):
# phi = np.array([[0.1, 0.8, 0.1], [0.5, 0.2, 0.3]]) # distribution over words
# theta = np.array([0.3, 0.7])                       # distribution over topics
# K, V = phi.shape      # K = number of topics, V = size of vocab
# N = 4                 # number of tokens in a document

# Seed the global NumPy RNG so that a fresh "Restart & Run All" always draws
# the same phi and theta (and therefore prints the same probabilities).
SEED = 0
rn.seed(SEED)

N = 4   # number of tokens in a document
K = 2   # number of topics
V = 3   # number of unique word types

concen1 = 0.1 # concentration param. for topics.  when 0 < alpha < 1
              # topics are low entropy (i.e., peaked around a single val)
              # when alpha > 1, topics are high entropy
# One Dirichlet draw per topic: row k of phi is topic k's distribution over words.
phi = rn.dirichlet(np.ones(V) * concen1, size=K)
assert phi.shape == (K, V)

concen2 = 1.  # concentration param. for document dist over topics
# A single Dirichlet draw: the document's distribution over the K topics.
theta = rn.dirichlet(np.ones(K) * concen2)
assert theta.shape == (K,)

### True single-word marginals
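Let $Y_v$ be the count of word type $w_v$ in a document, where $v$ is the word index. With $N$ tokens per document, $Y_v \sim \mathrm{Binomial}(N, p_v)$ with $p_v = (\boldsymbol{\theta}^T \mathbf{\Phi})_v$. Compute the true $P(Y_v = y_v)$ for $y_v = 0, 1, \ldots, N$.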

In [3]:
# Per-word probabilities p_v = sum_k theta[k] * phi[k, v], kept as a (V, 1)
# column so that each row of the count grid below is paired with its own p_v.
p = np.dot(theta, phi).reshape(V, 1)
# Candidate counts 0..N, repeated once per word type.
x = [range(N + 1)] * V
# Binomial(N, p_v) pmf evaluated on the whole grid.
true_marginals = binom.pmf(x, N, p) # shape = (V, N + 1)

### PGF single-word marginals

In [4]:
def pgf_marginal(v, y_v, phi, theta):
    """Compute P(Y_v = y_v) = P(count(w_v) = y_v) from the document PGF.

    The PGF is evaluated with the variable of word ``v`` replaced by a
    truncated Taylor variable and every other word variable fixed at 1; the
    requested probability is the Taylor coefficient of degree ``y_v``.

    Parameters
    ----------
    v : int
        Index of the word type.
    y_v : int
        Count whose probability is requested (non-negative).
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Document distribution over topics.

    Returns
    -------
    The degree-``y_v`` coefficient of the PGF expanded in u_v around 0.

    Notes
    -----
    Reads the notebook-level ``N`` (number of tokens in a document).
    """
    # Vocabulary size comes from phi itself rather than from a notebook global,
    # so the function stays correct if phi is swapped for another matrix.
    V = phi.shape[1]

    # Taylor variable u_v around 0 with D = y_v + 1 coefficients and a single
    # direction: constant term 0, first-order term 1 (when there is room for it).
    D = y_v + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        u_v.data[1, 0] = 1

    # Every other word variable stays at 1.
    u = algopy.ones(V, dtype=u_v)
    u[v] = u_v
    t = phi.dot(u)        # per-topic word PGFs
    s = theta.dot(t)      # mixed over topics: PGF of a single token
    h = np.power(s, N)    # N tokens -> N-th power
    return h.data[:, 0][y_v]

# Oberve 3 tokens of word type w_0
y_v, v = 3, 0
print 'PGF marginal:', pgf_marginal(v, y_v, phi, theta)
print 'True marginal:', true_marginals[0, 3]

PGF marginal: 0.0746107791518
True marginal: 0.0746107791518


Observe a document of length $N$ with word counts $y = [y_0, \ldots, y_{V-1}]$, where $\sum_{i=0}^{V-1} y_i = N$. Find the **single-word** marginal probabilities.

In [5]:
def pgf_marginals(y, phi, theta):
    """Compute [P(Y_0 = y[0]), ..., P(Y_{V-1} = y[V-1])] in one pass.

    Builds V copies of the PGF at once: copy j has only the variable of word
    j replaced by a Taylor variable, all others fixed at 1. Entry j of the
    result is the degree-``y[j]`` Taylor coefficient of copy j.

    Parameters
    ----------
    y : sequence of int, length V
        Observed count for each word type.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Document distribution over topics.

    Returns
    -------
    list
        One single-word marginal probability per word type.

    Notes
    -----
    Reads the notebook-level ``N`` (number of tokens in a document).
    """
    # Vocabulary size comes from phi itself rather than from a notebook global.
    V = phi.shape[1]

    # One truncation order serves every word: enough coefficients for the
    # largest requested count.
    D = np.max(y) + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        u_v.data[1, :] = 1

    # (V, V) grid of ones with the Taylor variable on the diagonal: column j
    # is the argument vector "only u_j free".
    u = algopy.ones((V, V), dtype=u_v)
    np.fill_diagonal(u, u_v)
    t = phi.dot(u)
    s = theta.dot(t)
    h = np.power(s, N)
    return [h_v.data[:, 0][y[i]] for i, h_v in enumerate(h)]

# Observe 2 tokens of w_0, 1 token of w_1, and 1 token of w_2
y = np.array([2, 1, 1])
print 'PGF marginals:', pgf_marginals(y, phi, theta)
print 'True marginals:', true_marginals[np.arange(V), y]

PGF marginals: [0.26305230856783735, 0.25898374829800963, 0.41435837310454937]
True marginals: [ 0.26305231  0.25898375  0.41435837]


In [6]:
# Observe 1 w_0, 0 w_1, and 3 w_2's
y = np.array([1, 0, 3])
print 'PGF marginals:', pgf_marginals(y, phi, theta)
print 'True marginals:', true_marginals[np.arange(V), y]

PGF marginals: [0.4121925534649567, 0.067115937342928714, 0.029462703050911004]
True marginals: [ 0.41219255  0.06711594  0.0294627 ]


In [7]:
# Observe 0 w_0, 4 w_1's, and 0 w_2
y = np.array([0, 4, 0])
print 'PGF marginals:', pgf_marginals(y, phi, theta)
print 'True marginals:', true_marginals[np.arange(V), y]

PGF marginals: [0.24220852982194851, 0.058126356816673816, 0.3884793095762058]
True marginals: [ 0.24220853  0.05812636  0.38847931]


### PGF joint marginals

In [8]:
from pyaudi import gdual_double as gdual
# scipy.misc.factorial was deprecated in SciPy 1.0 and is gone from later
# releases; scipy.special.factorial is the same function under its supported name.
from scipy.special import factorial

def pgf_joint_marginal(v, w, y_v, y_w, phi, theta):
    """Compute the joint marginal P(Y_v = y_v, Y_w = y_w) from the document PGF.

    The variables of words ``v`` and ``w`` are replaced by pyaudi generalized
    duals expanded at 0 and every other word variable is fixed at 1. The
    probability is the mixed derivative of order (y_v, y_w) divided by
    y_v! * y_w!.

    Parameters
    ----------
    v, w : int
        Indices of two *different* word types.
    y_v, y_w : int
        Counts of word ``v`` and word ``w``.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Document distribution over topics.

    Raises
    ------
    ValueError
        If ``v == w``: both duals would target the same slot and the second
        would silently overwrite the first.

    Notes
    -----
    Reads the notebook-level ``N`` (number of tokens in a document).
    """
    if v == w:
        raise ValueError("v and w must index two different word types")

    # Init gdual objects; a total order of y_v + y_w is enough to hold the
    # mixed derivative requested below.
    order = y_v + y_w
    u_v = gdual(0, "v", order)
    u_w = gdual(0, "w", order)

    V = phi.shape[1]
    u = [1] * V
    u[v] = u_v
    u[w] = u_w

    t = phi.dot(u)
    s = theta.dot(t)
    h = np.power(s, N)

    # Evaluate the derivative and turn it into a Taylor coefficient.
    # NOTE(review): assumes pyaudi orders the symbols "v" before "w", so that
    # [y_v, y_w] lines up with (u_v, u_w) -- confirm against the pyaudi docs.
    return h.get_derivative([y_v, y_w])/(factorial(y_v) * factorial(y_w))

y_v, v = 1, 0 # 1 count of word 0
y_w, w = 2, 2 # 2 counts of word 2
print 'PGF marginal:', pgf_joint_marginal(v, w, y_v, y_w, phi, theta)
print 'True marginal:', multinomial.pmf([y_v, N-(y_v+y_w), y_w], n=N, p=p.reshape(-1))

PGF marginal: 0.0779386740741
True marginal: 0.0779386740741


## With growth

Add growth:
- $n = $ number of tokens in the document (observed; `N` in the code)
- $m = \sum_{i=1}^n x_i$, where $x_i \sim \mathrm{Logarithmic}(\rho)$ (the logarithmic series distribution, $0 < \rho < 1$)
- $\mathbf{y} \sim \mathrm{Mult}(m, \boldsymbol{\theta}^T \mathbf{\Phi})$

In [9]:
rho = 0.3 # growth parameter
def pgf_marginal_growth(v, y_v, phi, theta, rho):
    """Compute P(Y_v = y_v) = P(count(w_v) = y_v) under the growth model.

    Same construction as the fixed-length marginal, except that the
    single-token PGF ``s`` is first passed through
    ``log(1 - rho*s) / log(1 - rho)`` before being raised to the N-th power.

    Parameters
    ----------
    v : int
        Index of the word type.
    y_v : int
        Count whose probability is requested (non-negative).
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Document distribution over topics.
    rho : float
        Growth parameter, strictly between 0 and 1.

    Raises
    ------
    ValueError
        If ``rho`` is outside (0, 1), where the normaliser ``log(1 - rho)``
        is zero or undefined.

    Notes
    -----
    Reads the notebook-level ``N`` (number of tokens in a document).
    """
    if not 0 < rho < 1:
        raise ValueError("rho must lie strictly between 0 and 1")

    # Vocabulary size comes from phi itself rather than from a notebook global.
    V = phi.shape[1]

    # Taylor variable u_v around 0: constant term 0, first-order term 1.
    D = y_v + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        u_v.data[1, 0] = 1

    # Every other word variable stays at 1.
    u = algopy.ones(V, dtype=u_v)
    u[v] = u_v
    t = phi.dot(u)
    s = theta.dot(t)
    # Growth step applied to the single-token PGF.
    r = np.log(1 - rho*s) / np.log(1 - rho)
    h = np.power(r, N)
    return h.data[:, 0][y_v]

# Oberve 3 tokens of word type w_0
y_v, v = 3, 0
print 'PGF marginal:', pgf_marginal_growth(v, y_v, phi, theta, rho)
#print 'True marginal:', true_marginals[0, 3]

PGF marginal: 0.116968349442


In [10]:
def pgf_marginals_growth(y, phi, theta, rho):
    """Compute [P(Y_0 = y[0]), ..., P(Y_{V-1} = y[V-1])] under the growth model.

    Builds V copies of the PGF at once (copy j has only the variable of word
    j free), applies ``log(1 - rho*s) / log(1 - rho)`` to the single-token
    PGF, raises it to the N-th power and reads the degree-``y[j]`` Taylor
    coefficient of copy j.

    Parameters
    ----------
    y : sequence of int, length V
        Observed count for each word type.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Document distribution over topics.
    rho : float
        Growth parameter, strictly between 0 and 1.

    Returns
    -------
    list
        One single-word marginal probability per word type.

    Raises
    ------
    ValueError
        If ``rho`` is outside (0, 1), where the normaliser ``log(1 - rho)``
        is zero or undefined.

    Notes
    -----
    Reads the notebook-level ``N`` (number of tokens in a document).
    """
    if not 0 < rho < 1:
        raise ValueError("rho must lie strictly between 0 and 1")

    # Vocabulary size comes from phi itself rather than from a notebook global.
    V = phi.shape[1]

    # One truncation order serves every word: enough coefficients for the
    # largest requested count.
    D = np.max(y) + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        u_v.data[1, :] = 1

    # (V, V) grid of ones with the Taylor variable on the diagonal: column j
    # is the argument vector "only u_j free".
    u = algopy.ones((V, V), dtype=u_v)
    np.fill_diagonal(u, u_v)
    t = phi.dot(u)
    s = theta.dot(t)
    # Growth step applied to the single-token PGF.
    r = np.log(1 - rho*s) / np.log(1 - rho)
    h = np.power(r, N)
    return [h_v.data[:, 0][y[i]] for i, h_v in enumerate(h)]

# Observe 3 tokens of w_0, 1 token of w_1, and 1 token of w_2
y = np.array([3, 1, 1])
print 'PGF marginals:', pgf_marginals_growth(y, phi, theta, rho)

PGF marginals: [0.11696834944171051, 0.19546944470146346, 0.4039004840569741]
