## Latent Dirichlet Allocation
This is a demostration of LDA topic model using Gibbs sampling on a "perfect dataset"   
Thanks to the clear [tutorial](https://www.cnblogs.com/pinard/p/6831308.html) provided by Pinard Liu  
Author: kUNQI jIANG   
Date: 2019/1/22  

### Corpus generation
As Gibbs sampling in LDA essentially based on bag-of-words so the order of words does not matter, I use completely seperated wordset of different topic to generate pure topic documents as corpus. This is the extreme case where words and topics will be completely clustered after LDA as we can see in the result. While in real word, a word exist in different topics, and a document can cover multi-topics.

In [1]:
food_set = ["broccoli","banana","spinach","smoothie","breakfast","ham","cream","eat","vegetable","dinner","lunch",
            "apple","peach","pork","beef","rice","noodle","chicken","KFC","restaurant","cream","tea","pan","beacon"]
animal_set = ["dog","cat","fish","chinchilla","kitten","cute","hamster","munching","bird","elephant","monkey","zoo",
              "zoology","pig","piggy","duck","mice","micky","tiger","lion","horse","dragon","panda","bee","rabbit"]
soccer_set = ["football","pitch","play","player","cup","ballon","messi","ronald","manU","liverpool","chelase","ozil",
              "practice","hard","dream","stadium","fast","speed","strong","move","shot","attack","defense","win"]

In [2]:
import numpy as np
def generate(topic_set):
    sent = np.random.choice(topic_set,10)
    return " ".join(sent)

In [3]:
topics_set = [food_set,animal_set,soccer_set]
corpus = []
for i in range(100):
    corpus.append(generate(topics_set[0]).split())
    corpus.append(generate(topics_set[1]).split())
    corpus.append(generate(topics_set[2]).split())

In [4]:
import numpy as np

all_words = [word for document in corpus for word in document]
vocab = set(all_words)
num_docs = len(corpus)
num_words = len(vocab)
word2id = {w:i for i,w in enumerate(vocab)}
id2word = {i:w for i,w in enumerate(vocab)}

In [5]:
# model 3 latent topics 
num_topics = 3
# Dirichlet prior
alpha = np.ones([num_topics])
#ita = term_freq
ita = 0.1 * np.ones([num_words])

### Random assignment
At the start randomly assign topic to each word in each document

In [6]:
topic_assignments = []
docs_topics = np.zeros([num_docs,num_topics]) # counts of topic assignments of each word in each doc
words_topics = np.zeros([num_words,num_topics]) # counts of topic distributes of each word over all doc
topics_words = np.zeros([num_topics,num_words]) # counts of word distributes of each topic over all doc

for d,document in enumerate(corpus):
    theta = np.random.dirichlet(alpha, 1)[0]
    doc_topics = []
    for n,word in enumerate(document):
        sample = np.random.multinomial(1, theta, size=1)[0]
        topic = list(sample).index(1)
        doc_topics.append(topic)
        docs_topics[d,topic] += 1
        words_topics[word2id[word],topic] += 1
        topics_words[topic,word2id[word]] += 1
    topic_assignments.append(doc_topics)
    

### Gibbs Sampling

In [7]:
def Gibbs_sampling(d,word_id,words_topics,docs_topics,topics_words,alpha,ita):
    topic_probs = (docs_topics[d] + alpha) / np.sum(docs_topics[d] + alpha)
    word_sum = np.sum(topics_words + ita, axis = 1)
    word_probs = (words_topics[word_id] + ita[word_id]) / word_sum
    # posterior probs
    probs = topic_probs * word_probs
    # normalize
    sample_probs = probs / np.sum(probs)
    #print(sample_probs)
    # sample new topic for current word
    new_topic = list(np.random.multinomial(1, sample_probs, size=1)[0]).index(1)
    return new_topic

In [8]:
num_iterations = 15
for j in range(num_iterations):
    for d in range(len(corpus)):
        document = corpus[d]
        for n in range(len(document)):
            word = document[n]
            word_id = word2id[word]
            topic = topic_assignments[d][n]
            # exclude current word and topic
            docs_topics[d][topic] -= 1
            topics_words[topic][word_id] -=1
            words_topics[word_id,topic] -= 1
            new_topic = Gibbs_sampling(d,word_id,words_topics,docs_topics,topics_words,alpha,ita)
            # update topic and word state
            docs_topics[d][new_topic] += 1
            topics_words[new_topic][word_id] += 1
            words_topics[word_id,new_topic] += 1
            topic_assignments[d][n] = new_topic

### Evaluation

In [9]:
# all seperated
docs_topics[:10]

array([[10.,  0.,  0.],
       [ 0., 10.,  0.],
       [ 0.,  0., 10.],
       [10.,  0.,  0.],
       [ 0., 10.,  0.],
       [ 0.,  0., 10.],
       [10.,  0.,  0.],
       [ 0., 10.,  0.],
       [ 0.,  0., 10.],
       [10.,  0.,  0.]])

In [10]:
for i,state in enumerate(topics_words):
    # sorted descending word frequence within each topic
    topic_id_freq = sorted(range(len(state)), key=lambda k: state[k], reverse=True)
    topic_word_freq = [id2word[i] for i in topic_id_freq]
    print("Topic: ", i)
    print(topic_word_freq)

Topic:  0
['cream', 'chicken', 'peach', 'restaurant', 'breakfast', 'lunch', 'broccoli', 'smoothie', 'banana', 'KFC', 'noodle', 'eat', 'dinner', 'vegetable', 'beacon', 'spinach', 'pan', 'tea', 'pork', 'beef', 'apple', 'rice', 'ham', 'liverpool', 'bird', 'defense', 'ronald', 'rabbit', 'stadium', 'fish', 'ballon', 'fast', 'move', 'cat', 'speed', 'dog', 'micky', 'win', 'monkey', 'chinchilla', 'kitten', 'attack', 'piggy', 'horse', 'zoology', 'duck', 'football', 'mice', 'tiger', 'munching', 'player', 'play', 'zoo', 'ozil', 'messi', 'pitch', 'chelase', 'cup', 'hamster', 'cute', 'shot', 'dream', 'dragon', 'bee', 'lion', 'practice', 'hard', 'elephant', 'manU', 'pig', 'strong', 'panda']
Topic:  1
['hamster', 'fish', 'panda', 'micky', 'monkey', 'zoology', 'cute', 'rabbit', 'horse', 'lion', 'elephant', 'chinchilla', 'mice', 'tiger', 'dragon', 'kitten', 'bee', 'pig', 'cat', 'bird', 'piggy', 'zoo', 'dog', 'munching', 'duck', 'banana', 'liverpool', 'breakfast', 'peach', 'defense', 'vegetable', 'ronal

In [11]:
topics_words

array([[46.,  0., 48.,  0., 55.,  0., 39.,  0.,  0.,  0., 47., 29.,  0.,
         0.,  0.,  0.,  0.,  0., 48.,  0.,  0., 59.,  0.,  0.,  0.,  0.,
         0., 36.,  0., 43.,  0.,  0., 92.,  0., 32.,  0.,  0.,  0.,  0.,
         0., 35.,  0.,  0.,  0.,  0., 47., 41.,  0.,  0.,  0.,  0., 40.,
        37.,  0.,  0.,  0., 29., 29., 38.,  0.,  0.,  0., 33.,  0., 43.,
         0., 53.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 34.,  0.,  0.,  0.,  0., 42.,  0.,  0.,  0., 50.,
         0.,  0.,  0., 35.,  0.,  0., 33., 48.,  0.,  0., 46., 40., 36.,
         0.,  0., 34.,  0., 42., 46.,  0., 30.,  0.,  0., 38., 38., 32.,
         0.,  0.,  0., 34.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 53.,  0.,
         0., 46.,  0.,  0.,  0.,  0.,  0., 37., 36., 42.,  0.,  0.,  0.,
         0.,  0., 42.,  0., 36.,  0., 50.],
       [ 0., 39.,  0.,  0.,  0., 45.,  0., 37.,  0., 43.,  0.,  0.,  0.,
        36., 38., 26.,  0., 45.,  0.,  0.,  0.,  0., 46.,  0.,  0.,  0.,
        32.,  1.,  0.,  0.,  0.,  0.

In [12]:
for i in range(len(words_topics)):
    print(words_topics[i],id2word[i])

[46.  0.  0.] banana
[ 0.  0. 39.] liverpool
[48.  0.  0.] breakfast
[ 0. 34.  0.] bird
[55.  0.  0.] peach
[ 0.  0. 45.] defense
[39.  0.  0.] vegetable
[ 0.  0. 37.] ronald
[ 0. 42.  0.] rabbit
[ 0.  0. 43.] stadium
[47.  0.  0.] broccoli
[29.  0.  0.] apple
[ 0. 50.  0.] fish
[ 0.  0. 36.] ballon
[ 0.  0. 38.] fast
[ 0.  0. 26.] move
[ 0. 35.  0.] cat
[ 0.  0. 45.] speed
[48.  0.  0.] lunch
[ 0. 33.  0.] dog
[ 0. 48.  0.] micky
[59.  0.  0.] chicken
[ 0.  0. 46.] win
[ 0. 46.  0.] monkey
[ 0. 40.  0.] chinchilla
[ 0. 36.  0.] kitten
[ 0.  0. 32.] attack
[36.  0.  1.] pan
[ 0. 34.  0.] piggy
[43.  0.  0.] KFC
[ 0. 42.  0.] horse
[ 0. 46.  0.] zoology
[92.  0.  0.] cream
[ 0. 30.  0.] duck
[32.  0.  0.] beef
[ 0.  0. 47.] football
[ 0. 38.  0.] mice
[ 0. 38.  0.] tiger
[ 0. 32.  0.] munching
[ 0.  0. 40.] player
[35.  0.  0.] tea
[ 0.  0. 34.] play
[ 0. 34.  0.] zoo
[ 0.  0. 41.] ozil
[ 0.  0. 45.] messi
[47.  0.  0.] smoothie
[41.  0.  0.] eat
[ 0.  0. 45.] pitch
[ 0.  0. 56.] chelas

### Comparison
Justify my result with gensim LDA model

In [13]:
import gensim
from gensim import corpora
text_data = corpus
dictionary = corpora.Dictionary(text_data)
id_corpus = [dictionary.doc2bow(text) for text in text_data]

ldamodel = gensim.models.ldamodel.LdaModel(id_corpus, num_topics = num_topics, id2word=dictionary, passes=12)
#ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=num_words)
for topic in topics:
    print(topic)

(0, '0.052*"hamster" + 0.049*"panda" + 0.049*"fish" + 0.047*"micky" + 0.045*"zoology" + 0.045*"cute" + 0.045*"monkey" + 0.041*"lion" + 0.041*"horse" + 0.041*"rabbit" + 0.041*"elephant" + 0.039*"chinchilla" + 0.037*"mice" + 0.037*"tiger" + 0.036*"dragon" + 0.035*"pig" + 0.035*"kitten" + 0.035*"bee" + 0.034*"cat" + 0.033*"bird" + 0.033*"zoo" + 0.033*"piggy" + 0.032*"dog" + 0.032*"munching" + 0.030*"duck" + 0.001*"practice" + 0.000*"manU" + 0.000*"cream" + 0.000*"ballon" + 0.000*"attack" + 0.000*"speed" + 0.000*"shot" + 0.000*"defense" + 0.000*"dream" + 0.000*"ozil" + 0.000*"liverpool" + 0.000*"football" + 0.000*"chelase" + 0.000*"play" + 0.000*"player" + 0.000*"dinner" + 0.000*"breakfast" + 0.000*"pitch" + 0.000*"chicken" + 0.000*"KFC" + 0.000*"cup" + 0.000*"hard" + 0.000*"peach" + 0.000*"beef" + 0.000*"eat" + 0.000*"apple" + 0.000*"tea" + 0.000*"move" + 0.000*"beacon" + 0.000*"stadium" + 0.000*"ronald" + 0.000*"pan" + 0.000*"win" + 0.000*"strong" + 0.000*"smoothie" + 0.000*"restaurant" 

## Inference
As we have had trained topics-words distributes, we only need to use them 
to Gibbs sample the topic for each word in the test documents until converge. 

In [14]:
test_corpus = [["ozil","panda","pig","ballon","attack","eat","dragon","ronald","micky","dinner","bird","messi"],
               ["vegetable","liverpool","mice","chelase","speed","horse","rice","peach","noodle","beacon","bee"],
               ["defense","win","hard","rabbit","player","strong","lion","zoo","pig","cat","player","manU","shot"]]

In [15]:
# firstly random assign topic to each word in test docs
test_num_docs = len(test_corpus)
test_topic_assignments = []
test_docs_topics = np.zeros([test_num_docs,num_topics]) # counts of topic assignments of each word in each doc

for d,document in enumerate(test_corpus):
    theta = np.random.dirichlet(alpha, 1)[0]
    test_doc_topics = []
    for n,word in enumerate(document):
        sample = np.random.multinomial(1, theta, size=1)[0]
        topic = list(sample).index(1)
        test_doc_topics.append(topic)
        test_docs_topics[d,topic] += 1

    test_topic_assignments.append(test_doc_topics)
    

In [16]:
test_topic_assignments

[[2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
 [1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0],
 [1, 2, 0, 0, 2, 2, 0, 0, 1, 2, 2, 0, 0]]

In [17]:
num_iterations = 20
for j in range(num_iterations):
    for d in range(len(test_corpus)):
        document = test_corpus[d]
        for n in range(len(document)):
            word = document[n]
            word_id = word2id[word]
            topic = test_topic_assignments[d][n]
            # exclude current word and topic
            test_docs_topics[d][topic] -= 1
            new_topic = Gibbs_sampling(d,word_id,words_topics,test_docs_topics,topics_words,alpha,ita)
            # update topic and word state
            test_docs_topics[d][new_topic] += 1
            test_topic_assignments[d][n] = new_topic

In [18]:
test_topic_assignments

[[2, 1, 1, 2, 2, 0, 1, 2, 1, 0, 1, 2],
 [0, 2, 1, 2, 2, 1, 0, 0, 0, 0, 1],
 [2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2]]

In [19]:
for i in range(test_num_docs):
    s_doc = "test_doc_"+str(i)+": "
    topic_dis = test_docs_topics[i] / np.sum(test_docs_topics[i])
    s_dis = "topic_0:{0:.2f}, topic_1:{1:.2f}, topic_2:{2:.2f}".format(topic_dis[0],topic_dis[1],topic_dis[2])
    print(s_doc + s_dis)

test_doc_0: topic_0:0.17, topic_1:0.42, topic_2:0.42
test_doc_1: topic_0:0.45, topic_1:0.27, topic_2:0.27
test_doc_2: topic_0:0.00, topic_1:0.38, topic_2:0.62
