# PGF for LDA

In [1]:
import algopy
import numpy as np
from algopy import UTPM
from scipy.stats import binom

### Define dummy data

In [2]:
# Toy LDA parameters used throughout the notebook.
phi = np.array([
    [0.1, 0.8, 0.1],  # topic 0: distribution over words
    [0.5, 0.2, 0.3],  # topic 1: distribution over words
])
theta = np.array([0.3, 0.7])  # distribution over topics
N = 4                         # number of tokens in a document
K, V = np.shape(phi)          # K = number of topics, V = size of vocab

### True single-word marginals
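Let $Y_v$ be the count of word type $w_v$ in a document, where $v$ is the word index. With the document length $N$ fixed, each token is word $w_v$ independently with probability $p_v = (\boldsymbol{\theta}^T \mathbf{\Phi})_v$, so $Y_v \sim \mathrm{Binomial}(N, p_v)$. Compute the true $P(Y_v = y_v)$ for $y_v = 0, 1, \dots, N$.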

In [3]:
# Per-word success probability p[v] = sum_k theta[k] * phi[k, v], kept as a
# column so it broadcasts against the rows of candidate counts below.
p = np.dot(theta, phi).reshape(V, 1)
# One row of candidate counts 0..N for every word type.
x = [list(range(N + 1))] * V
# Binomial pmf evaluated row-wise: row v holds P(Y_v = 0), ..., P(Y_v = N).
true_marginals = binom.pmf(x, N, p) # shape = (V, N + 1)

### PGF single-word marginals

In [4]:
def pgf_marginal(v, y_v, phi, theta, n_tokens=None):
    """Compute P(Y_v = y_v) = P(count(w_v) = y_v) from the document PGF.

    The PGF restricted to word ``v`` is ``h(u_v) = (theta . phi . u) ** n``
    where every entry of ``u`` is 1 except ``u[v] = u_v``.  The requested
    probability is the coefficient of ``u_v ** y_v`` in ``h``; it is read off
    the truncated Taylor expansion of ``h`` around ``u_v = 0`` that algopy
    propagates through the computation.

    Parameters
    ----------
    v : int
        Index of the word type (a column of ``phi``).
    y_v : int
        Count whose probability is requested.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Distribution over topics.
    n_tokens : int, optional
        Number of tokens in the document.  Defaults to the notebook-level
        ``N`` so existing four-argument calls behave exactly as before.

    Returns
    -------
    float
        The ``y_v``-th Taylor coefficient of ``h``.
    """
    if n_tokens is None:
        n_tokens = N  # fall back to the notebook-wide document length
    # Take the vocabulary size from the argument instead of the global V so
    # the function stays correct for any phi that is passed in.
    vocab_size = phi.shape[1]

    # Coefficients of orders 0..y_v are needed, i.e. y_v + 1 of them.
    D = y_v + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        # u_v = 0 + 1*t: expand around 0 with unit first-order coefficient.
        u_v.data[1, 0] = 1

    u = algopy.ones(vocab_size, dtype=u_v)
    u[v] = u_v
    t = phi.dot(u)
    s = theta.dot(t)
    h = np.power(s, n_tokens)
    return h.data[:, 0][y_v]

# Observe 3 tokens of word type w_0
y_v, v = 3, 0
pgf_value = pgf_marginal(v, y_v, phi, theta)
# Look the reference value up with (v, y_v) so it follows the line above
# instead of a hard-coded [0, 3].
true_value = true_marginals[v, y_v]
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginal: %s' % (pgf_value,))
print('True marginal: %s' % (true_value,))
# Fail loudly on "Run All" if the PGF result drifts from the binomial pmf.
np.testing.assert_allclose(pgf_value, true_value)

PGF marginal: 0.13608256
True marginal: 0.13608256


Observe a document of length $N$ with word counts $y = [y_0, \dots, y_{V-1}]$, where $\sum_{i=0}^{V-1} y_i = N$. Find the **single-word** marginal probabilities.

In [5]:
def pgf_marginals(y, phi, theta, n_tokens=None):
    """Compute [P(Y_0 = y[0]), ..., P(Y_{V-1} = y[V-1])] in one pass.

    Column ``j`` of the ``(V, V)`` matrix ``u`` is the all-ones vector with
    its ``j``-th entry replaced by the Taylor variable ``u_v``, so entry ``j``
    of ``h`` is the PGF of ``Y_j`` alone.  Each probability is the
    ``y[j]``-th Taylor coefficient of that entry.

    Parameters
    ----------
    y : array of int, length V
        Observed count of every word type.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Distribution over topics.
    n_tokens : int, optional
        Number of tokens in the document.  Defaults to the notebook-level
        ``N`` so existing three-argument calls behave exactly as before.

    Returns
    -------
    list of float
        One single-word marginal probability per word type.
    """
    if n_tokens is None:
        n_tokens = N  # fall back to the notebook-wide document length
    # Take the vocabulary size from the argument instead of the global V.
    vocab_size = phi.shape[1]

    # A single expansion order large enough for the biggest requested count.
    D = np.max(y) + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        # u_v = 0 + 1*t: expand around 0 with unit first-order coefficient.
        u_v.data[1, :] = 1

    u = algopy.ones((vocab_size, vocab_size), dtype=u_v)
    np.fill_diagonal(u, u_v)
    t = phi.dot(u)
    s = theta.dot(t)
    h = np.power(s, n_tokens)
    return [h_v.data[:, 0][y[i]] for i, h_v in enumerate(h)]

# Observe 2 tokens of w_0, 1 token of w_1, and 1 token of w_2
y = np.array([2, 1, 1])
pgf_values = pgf_marginals(y, phi, theta)
true_values = true_marginals[np.arange(V), y]
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginals: %s' % (pgf_values,))
print('True marginals: %s' % (true_values,))
# Fail loudly on "Run All" if the PGF results drift from the binomial pmf.
np.testing.assert_allclose(pgf_values, true_values)

PGF marginals: [0.33304416000000003, 0.36225855999999979, 0.42141696000000001]
True marginals: [ 0.33304416  0.36225856  0.42141696]


In [6]:
# Observe 1 w_0, 0 w_1, and 3 w_2's
y = np.array([1, 0, 3])
pgf_values = pgf_marginals(y, phi, theta)
true_values = true_marginals[np.arange(V), y]
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginals: %s' % (pgf_values,))
print('True marginals: %s' % (true_values,))
# Fail loudly on "Run All" if the PGF results drift from the binomial pmf.
np.testing.assert_allclose(pgf_values, true_values)

PGF marginals: [0.36225856000000001, 0.1477633599999999, 0.04202496]
True marginals: [ 0.36225856  0.14776336  0.04202496]


In [7]:
# Observe 0 w_0, 4 w_1's, and 0 w_2
y = np.array([0, 4, 0])
pgf_values = pgf_marginals(y, phi, theta)
true_values = true_marginals[np.arange(V), y]
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginals: %s' % (pgf_values,))
print('True marginals: %s' % (true_values,))
# Fail loudly on "Run All" if the PGF results drift from the binomial pmf.
np.testing.assert_allclose(pgf_values, true_values)

PGF marginals: [0.14776336000000001, 0.020851360000000003, 0.33362176000000004]
True marginals: [ 0.14776336  0.02085136  0.33362176]


## With growth

Add growth:
- $n = $ number of tokens in the document (observed)
- $m = \sum_{i=1}^n x_i$, where the $x_i \sim \mathrm{Logarithmic}(\rho)$ are i.i.d. (the code below uses `N` for $n$)
- $\mathbf{y} \sim \mathrm{Mult}(m, \boldsymbol{\theta}^T \mathbf{\Phi})$

In [8]:
rho = 0.3 # growth parameter
def pgf_marginal_growth(v, y_v, phi, theta, rho, n_tokens=None):
    """Compute P(Y_v = y_v) = P(count(w_v) = y_v) under the growth model.

    Same construction as the fixed-length case, except that the word-level
    PGF ``s`` is first passed through ``log(1 - rho*s) / log(1 - rho)`` and
    the result is raised to the ``n_tokens``-th power.  The requested
    probability is the ``y_v``-th Taylor coefficient of that composite
    around ``u_v = 0``.

    Parameters
    ----------
    v : int
        Index of the word type (a column of ``phi``).
    y_v : int
        Count whose probability is requested.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Distribution over topics.
    rho : float
        Growth parameter.
    n_tokens : int, optional
        Exponent applied to the growth PGF (number of observed tokens).
        Defaults to the notebook-level ``N`` so existing five-argument calls
        behave exactly as before.

    Returns
    -------
    float
        The ``y_v``-th Taylor coefficient of the composite PGF.
    """
    if n_tokens is None:
        n_tokens = N  # fall back to the notebook-wide token count
    # Take the vocabulary size from the argument instead of the global V.
    vocab_size = phi.shape[1]

    # Coefficients of orders 0..y_v are needed, i.e. y_v + 1 of them.
    D = y_v + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        # u_v = 0 + 1*t: expand around 0 with unit first-order coefficient.
        u_v.data[1, 0] = 1

    u = algopy.ones(vocab_size, dtype=u_v)
    u[v] = u_v
    t = phi.dot(u)
    s = theta.dot(t)
    # PGF of a single growth term, composed with the word-level PGF s.
    r = np.log(1 - rho*s) / np.log(1 - rho)
    h = np.power(r, n_tokens)
    return h.data[:, 0][y_v]

# Observe 3 tokens of word type w_0
y_v, v = 3, 0
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginal: %s' % (pgf_marginal_growth(v, y_v, phi, theta, rho),))
# No reference value is printed here: true_marginals is the binomial pmf for
# a fixed token count N, so it does not describe the growth model.

PGF marginal: 0.184817410707


In [9]:
def pgf_marginals_growth(y, phi, theta, rho, n_tokens=None):
    """Compute [P(Y_0 = y[0]), ..., P(Y_{V-1} = y[V-1])] under the growth model.

    Column ``j`` of the ``(V, V)`` matrix ``u`` is the all-ones vector with
    its ``j``-th entry replaced by the Taylor variable ``u_v``, so entry ``j``
    of ``h`` is the growth-model PGF of ``Y_j`` alone.  Each probability is
    the ``y[j]``-th Taylor coefficient of that entry.

    Parameters
    ----------
    y : array of int, length V
        Observed count of every word type.
    phi : ndarray, shape (K, V)
        Per-topic distributions over words.
    theta : ndarray, shape (K,)
        Distribution over topics.
    rho : float
        Growth parameter.
    n_tokens : int, optional
        Exponent applied to the growth PGF (number of observed tokens).
        Defaults to the notebook-level ``N`` so existing four-argument calls
        behave exactly as before.

    Returns
    -------
    list of float
        One single-word marginal probability per word type.
    """
    if n_tokens is None:
        n_tokens = N  # fall back to the notebook-wide token count
    # Take the vocabulary size from the argument instead of the global V.
    vocab_size = phi.shape[1]

    # A single expansion order large enough for the biggest requested count.
    D = np.max(y) + 1
    u_v = UTPM(np.zeros((D, 1)))
    if D > 1:
        # u_v = 0 + 1*t: expand around 0 with unit first-order coefficient.
        u_v.data[1, :] = 1

    u = algopy.ones((vocab_size, vocab_size), dtype=u_v)
    np.fill_diagonal(u, u_v)
    t = phi.dot(u)
    s = theta.dot(t)
    # PGF of a single growth term, composed with the word-level PGFs in s.
    r = np.log(1 - rho*s) / np.log(1 - rho)
    h = np.power(r, n_tokens)
    return [h_v.data[:, 0][y[i]] for i, h_v in enumerate(h)]

# Observe 3 tokens of w_0, 1 token of w_1, and 1 token of w_2
y = np.array([3, 1, 1])
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('PGF marginals: %s' % (pgf_marginals_growth(y, phi, theta, rho),))

PGF marginals: [0.18481741070659657, 0.30168515264769635, 0.399420387354113]
