In [None]:
import numpy as np

In [None]:
# Toy corpus: each document is a tuple of word tokens.
d1 = ("dog", "cat", "rat", "ate", "the", "cat")
d2 = ("dog", "chair", "table", "and", "the", "chair")
d3 = ("dog", "cat", "chair", "chased", "the", "cat", "bug")
docs = [d1, d2, d3]

# Fixed vocabulary; a word's position in this list is its integer id.
vocab = [
    "chair", "chased", "cat", "dog", "ate",
    "table", "the", "rat", "and", "bug",
]

# Encode every document as the list of vocabulary ids of its words.
doc_vectors = [[vocab.index(token) for token in document] for document in docs]


<span style="color:green;font-size:xx-large">Dirichlet parameters</span>
<li>Alpha is the parameter for the prior topic distribution within documents</li>
<li>Beta is the parameter for the prior word distribution within topics</li>
<li>num_passes is the number of times the algorithm will update the topic assignments</li>
<li>num_topics is the number of topics</li>

    

In [None]:
# LDA hyperparameters.
# alpha and beta are the Dirichlet smoothing constants: alpha is added to the
# document-topic counts and beta to the word-topic counts whenever the
# sampling probabilities are computed.
alpha, beta = 0.2, 0.001

# Number of sweeps over the whole corpus, and number of topics to fit.
num_passes, num_topics = 100, 2


<span style="color:green;font-size:xx-large">Data parameters</span>


In [None]:
# Corpus dimensions, used to size the count matrices.
num_docs = len(doc_vectors)  # how many documents are in the corpus
v = len(vocab)               # how many distinct words are in the vocabulary

<span style="color:green;font-size:xx-large">Initialize vectors</span>
<li>word_topic_counts will keep track of the number of occurrences of a word in a topic</li>
<li>doc_topics_counts will keep track of the number of occurrences of a topic in a document</li>
<li>topics_vector keeps the assignment of topics to word-document combinations</li>

In [None]:
# Allocate the bookkeeping structures, all starting at zero.

# word_topic_counts[t, w]: how many times word id w is assigned to topic t.
word_topic_counts = np.zeros(shape=(num_topics, v))

# topics_vector[d][n]: topic assigned to the n-th word of document d.
topics_vector = [np.zeros(len(word_ids)) for word_ids in doc_vectors]

# doc_topics_counts[d, t]: how many words of document d are assigned to topic t.
doc_topics_counts = np.zeros(shape=(num_docs, num_topics))
doc_topics_counts

<span style="color:green;font-size:xx-large">random topic assignment</span>


In [None]:
# Give every word position a random starting topic in [0, num_topics).
# The seed is fixed so that the starting point is reproducible.
np.random.seed(42)
for doc_idx, word_ids in enumerate(doc_vectors):
    topics_vector[doc_idx] = np.random.randint(low=0, high=num_topics, size=len(word_ids))
topics_vector

<span style="color:green;font-size:xx-large">get word_topic_counts</span>


In [None]:
# Build the topic-by-word count matrix from the current topic assignments.
# The matrix is num_topics x vocabulary size (2 x 10 for this corpus).

# Start from zero so that re-running this cell recomputes the counts instead
# of adding a second copy on top of what is already stored.
word_topic_counts.fill(0)
for word_ids, word_topics in zip(doc_vectors, topics_vector):
    for word_id, topic in zip(word_ids, word_topics):
        # int() because the assignment arrays may hold floats
        word_topic_counts[int(topic), word_id] += 1

word_topic_counts
        

<span style="color:green;font-size:xx-large">get document topic counts</span>


In [None]:
# Build the document-by-topic count matrix: entry [d, t] is the number of
# words in document d currently assigned to topic t.

# Start from zero so that re-running this cell recomputes the counts instead
# of adding a second copy on top of what is already stored.
doc_topics_counts.fill(0)
for doc_idx, word_topics in enumerate(topics_vector):
    for topic in word_topics:
        # int() keeps the index valid even if the assignment array holds
        # floats, matching how the word-topic counts are indexed
        doc_topics_counts[doc_idx, int(topic)] += 1
doc_topics_counts




<span style="color:green;font-size:xx-large">Do the LDA</span>
<li>This is a bit tricky, but the basic idea is to repeatedly resample the topic of every word in every document, using probabilities computed from the current topic counts</li>

In [None]:
# The LDA (collapsed Gibbs sampling)
# For every word position in every document: take the word out of the counts,
# compute the probability of each topic for it from the remaining counts,
# draw a new topic from those probabilities, and put the word back into the
# counts under the newly drawn topic. Repeat for num_passes sweeps.
for p in range(num_passes):
    for i in range(num_docs):
        for j in range(len(doc_vectors[i])):
            assigned_topic = int(topics_vector[i][j])
            vocab_id = doc_vectors[i][j]

            # While examining this word, remove it from the current counts so
            # that it does not influence its own probabilities.
            doc_topics_counts[i, assigned_topic] -= 1
            word_topic_counts[assigned_topic, vocab_id] -= 1

            # Term 1: smoothed share of each topic within document i.
            prob_term1 = doc_topics_counts[i] + alpha
            prob_term1 = prob_term1 / (doc_topics_counts[i].sum() + num_topics * alpha)
            # Term 2: smoothed share of this word within each topic.
            prob_term2 = word_topic_counts[:, vocab_id] + beta
            prob_term2 = prob_term2 / (word_topic_counts.sum(axis=1) + v * beta)

            # Combine and normalise so the values form a distribution over topics.
            probs = prob_term1 * prob_term2
            probs = probs / probs.sum()

            # Draw the new topic according to probs. The probabilities must be
            # passed as the keyword argument p: the third positional argument
            # of np.random.choice is `replace`, so passing them positionally
            # would silently sample uniformly.
            update_topic_assign = int(np.random.choice(num_topics, p=probs))
            topics_vector[i][j] = update_topic_assign

            # Put the word back into the counts under its NEW topic, so the
            # count matrices stay in step with topics_vector.
            doc_topics_counts[i, update_topic_assign] += 1
            word_topic_counts[update_topic_assign, vocab_id] += 1
    
    

In [None]:
# theta: the estimated document-topic probability matrix.
# Smooth the document-topic counts with alpha, then divide each row by its
# sum so that every document's topic probabilities add up to 1.
smoothed_counts = doc_topics_counts + alpha
theta_row_sum = smoothed_counts.sum(axis=1)
theta = smoothed_counts / theta_row_sum.reshape(num_docs, 1)
theta