# LSA with EM

## Code to generate a simulated document-word matrix

In [6]:
import numpy as np

# Simulation settings shared by the data-generation cells.
num_docs = 20      # number of documents in the simulated corpus
num_words = 100    # vocabulary size (columns of the document-word matrix)
num_topics = 3     # number of latent topics

# Every document gets the same fixed length here; a randomly drawn
# length per document could be used instead.
num_words_per_doc = 20


# generate a document-word matrix given model parameters
# p: topic distribution for each document
# theta: word distribution for each topic
def generate_mat(num_docs, num_words, num_topics, num_words_per_doc, p, theta):
    """Sample a document-word count matrix from the topic model.

    For each document, the total word count is first split across topics
    with a multinomial draw using that document's topic distribution.
    Each topic's share is then split across the vocabulary with a second
    multinomial draw using that topic's word distribution.

    Parameters
    ----------
    num_docs : int
        Number of documents (rows of the result).
    num_words : int
        Vocabulary size (columns of the result).
    num_topics : int
        Number of topics.
    num_words_per_doc : int or sequence of int
        Number of word tokens per document.  A scalar is used for every
        document; a sequence must broadcast to length ``num_docs`` and
        gives one length per document.
    p : array, indexed as ``p[d, :]``
        Topic distribution for each document.
    theta : array, indexed as ``theta[t, :]``
        Word distribution for each topic.

    Returns
    -------
    mat : ndarray of shape (num_docs, num_words)
        Observed document-word counts (``delta`` summed over topics).
    delta : ndarray of shape (num_docs, num_words, num_topics)
        Unobserved document-word-topic counts.

    Raises
    ------
    ValueError
        If ``num_words_per_doc`` cannot be broadcast to ``(num_docs,)``.
    """
    # Accept either one shared length or one length per document.
    doc_lengths = np.broadcast_to(np.asarray(num_words_per_doc), (num_docs,))

    # Word-topic-document counts (unobserved).
    delta = np.zeros((num_docs, num_words, num_topics))
    for d in range(num_docs):
        # Number of word tokens assigned to each topic in document d.
        nwt = np.random.multinomial(int(doc_lengths[d]), p[d, :])
        for t, n in enumerate(nwt):
            # Number of occurrences of each word among topic t's tokens.
            delta[d, :, t] = np.random.multinomial(n, theta[t, :])

    # The observed matrix is the per-topic counts summed over topics.
    mat = delta.sum(axis=2)
    return mat, delta
        
# generate a dataset
#    - draw the model parameters (p, theta) from Dirichlet distributions
#    - generate the document-word matrix with function 'generate_mat'
def generate_data(num_docs, num_words, num_topics, num_words_per_doc):
    """Draw model parameters and simulate a document-word matrix.

    Both parameter sets are drawn from Dirichlet distributions whose
    concentration is 1 everywhere except for a few entries set to 10:

    * document ``d`` favours topic ``d % num_topics``;
    * the first ``5 * num_topics`` words (capped at ``num_words``) are
      "useful" words, with word ``w`` favoured by topic
      ``w % num_topics``.

    Parameters
    ----------
    num_docs : int
        Number of documents.
    num_words : int
        Vocabulary size.
    num_topics : int
        Number of topics.
    num_words_per_doc
        Passed through unchanged to ``generate_mat``.

    Returns
    -------
    mat, delta
        The two values returned by ``generate_mat``.
    p : ndarray of shape (num_docs, num_topics)
        Topic distribution for each document.
    theta : ndarray of shape (num_topics, num_words)
        Word distribution for each topic.
    """
    # Topic distribution for each document: one topic per document gets
    # extra weight through a non-uniform Dirichlet parameter.
    p = np.zeros((num_docs, num_topics))
    for d in range(num_docs):
        doc_alpha = np.ones(num_topics)
        doc_alpha[d % num_topics] = 10.
        p[d, :] = np.random.dirichlet(doc_alpha)

    # Word distribution for each topic: a handful of high-probability
    # words per topic.  The count is capped at the vocabulary size so a
    # small vocabulary does not index past the last column.
    word_alpha = np.ones((num_topics, num_words))
    n_useful_words = min(5 * num_topics, num_words)
    for w in range(n_useful_words):
        word_alpha[w % num_topics, w] = 10.

    theta = np.zeros((num_topics, num_words))
    for t in range(num_topics):
        theta[t, :] = np.random.dirichlet(word_alpha[t, :])

    mat, delta = generate_mat(
        num_docs, num_words, num_topics, num_words_per_doc, p, theta
    )
    return mat, delta, p, theta


In [7]:
# Simulate one dataset with the parameters set above; keep the observed
# matrix along with the hidden counts and the true model parameters.
mat, delta, p, theta = generate_data(
    num_docs,
    num_words,
    num_topics,
    num_words_per_doc,
)