## LATENT DIRICHLET ALLOCATION

https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
    
https://www.youtube.com/watch?v=DWJYZq_fQ2A

In [1]:
import random
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

# Ground-truth topics used to generate the synthetic corpus.
# Each topic is a dict with:
#   'id'      - human-readable topic label,
#   'words'   - the topic's vocabulary,
#   'weights' - relative sampling weight of each word (parallel to 'words';
#               passed to random.choices by generate_documents).
# Note that 'space' appears in all three topics and 'debate' in two of them,
# so the topics deliberately overlap.
topic1 = {
    'id': 'sports', 
    'words': ['football', 'basketball', 'gol', 'play', 'match', 'space'],
    'weights': [4, 1, 1, 1, 1, 2]
}
topic2 = {
    'id': 'politics', 
    'words': ['president', 'interview', 'twitter', 'television', 'debate', 'space'],
    'weights': [2, 1, 4, 1, 1, 1]
}
topic3 = {
    'id': 'science', 
    'words': ['science', 'molecula', 'debate', 'space'],
    'weights': [1, 1, 1, 1]
}

topics = [topic1, topic2, topic3]


# Alternative corpus (disabled): three topics with disjoint, uniformly
# weighted vocabularies.
# topic1 = {
#     'id': 'sports', 
#     'words': ['football', 'basketball', 'match'],
#     'weights': [1,1,1]
# }
# topic2 = {
#     'id': 'politics', 
#     'words': ['president', 'interview', 'twitter'],
#     'weights': [1, 1, 1]
# }
# topic3 = {
#     'id': 'science', 
#     'words': ['science', 'molecula', 'space'],
#     'weights': [1, 1, 1]
# }

# Re-binds `topics`; with the block above commented out this is the same list
# as before. Presumably kept so that un-commenting the alternative topics
# takes effect.
topics = [topic1, topic2, topic3]


# Alternative corpus (disabled): two topics with two words each.
# topic1 = {
#     'id': 'dogs', 
#     'words': ['guau', 'guau2'],
#     'weights': [1, 1]
# }
# topic2 = {
#     'id': 'cats', 
#     'words': ['miaw', 'miaw2'],
#     'weights': [1, 1]
# }

# topics = [topic1, topic2]


In [2]:

def generate_documents(topics, num_docs=100, topics_per_doc=2, words_per_doc=1000):
    """Generate a synthetic corpus by sampling words from a mixture of topics.

    For every document, `topics_per_doc` distinct topics are drawn from
    `topics`, each is given a random mixing weight (weights sum to 1), and
    `words_per_doc` words are produced by first picking a topic according to
    the mixing weights and then picking a word according to that topic's
    word weights.

    Args:
        topics: list of dicts with keys 'id', 'words' and 'weights'
            ('weights' is parallel to 'words').
        num_docs: number of documents to generate.
        topics_per_doc: number of distinct topics mixed in each document.
        words_per_doc: number of words in each document.

    Returns:
        A tuple `(documents, docs_idx, words_idx)` where
        * `documents` is a list of dicts with keys 'id' (int), 'words'
          (list of str) and 'topics_weights' (list of `(topic_id, weight)`),
        * `docs_idx` is the list of document ids in generation order,
        * `words_idx` is the sorted list of distinct words that occur in
          the corpus.
        All three are empty when `num_docs` is 0.
    """
    documents = []
    for doc_id in range(num_docs):
        # First select several topics for this document.
        doc_topics = random.sample(topics, topics_per_doc)

        # Assign a normalized random weight to each topic of the document.
        raw_weights = [random.random() for _ in range(topics_per_doc)]
        total = sum(raw_weights)
        doc_topics_weights = [weight / total for weight in raw_weights]

        # Now sample the topics to generate the document.
        # NOTE: this is an inefficient version for teaching purposes
        doc_words = []
        for _ in range(words_per_doc):
            topic = random.choices(doc_topics, doc_topics_weights)[0]  # select one topic
            word = random.choices(topic['words'], topic['weights'], k=1)[0]  # choose one word
            doc_words.append(word)

        documents.append({
            'id': doc_id,
            'words': doc_words,
            'topics_weights': [
                (topic['id'], weight)
                for topic, weight in zip(doc_topics, doc_topics_weights)
            ],
        })

    # Build the indexes once, after the whole corpus exists. Sorting makes the
    # vocabulary order reproducible instead of depending on set iteration order.
    words_idx = sorted({word for doc in documents for word in doc['words']})
    docs_idx = [doc['id'] for doc in documents]
    return documents, docs_idx, words_idx


# Build the synthetic corpus: 100 documents of 100 words each, every document
# mixing 2 of the topics defined above.
docs, docs_idx, words_idx = generate_documents(topics, num_docs=100, topics_per_doc=2, words_per_doc=100)
# Bare expression so the notebook displays the generated corpus as cell output.
docs

[{'id': 0,
  'words': ['basketball',
   'basketball',
   'football',
   'molecula',
   'match',
   'molecula',
   'football',
   'football',
   'debate',
   'play',
   'debate',
   'basketball',
   'science',
   'play',
   'play',
   'space',
   'space',
   'match',
   'match',
   'football',
   'play',
   'football',
   'science',
   'basketball',
   'match',
   'play',
   'gol',
   'football',
   'basketball',
   'football',
   'debate',
   'debate',
   'space',
   'space',
   'football',
   'space',
   'football',
   'football',
   'football',
   'football',
   'debate',
   'football',
   'gol',
   'space',
   'debate',
   'play',
   'play',
   'football',
   'science',
   'play',
   'football',
   'space',
   'football',
   'space',
   'gol',
   'football',
   'football',
   'basketball',
   'space',
   'basketball',
   'football',
   'debate',
   'debate',
   'play',
   'space',
   'space',
   'football',
   'debate',
   'football',
   'debate',
   'gol',
   'space',
   'football'

In [3]:
def words_vs_topics_matrix(docs, words_idx, ntopics):
    """Count how many times each word is currently assigned to each topic.

    Args:
        docs: list of documents; each has an 'assignment' list of dicts with
            keys 'word' and 'topic'.
        words_idx: list of vocabulary words; a word's position in this list
            is its row in the result. Words must be hashable. If a word is
            repeated, its first position is used.
        ntopics: number of topics (columns of the result).

    Returns:
        A float array of shape `(len(words_idx), ntopics)` where entry
        `[w, t]` is the number of assignments of word `w` to topic `t`.

    Raises:
        ValueError: if an assigned word is not present in `words_idx`.
    """
    m = np.zeros((len(words_idx), ntopics))

    # Map each word to its row once, instead of scanning words_idx linearly
    # for every single word occurrence. setdefault keeps the first position,
    # matching list.index().
    row_of_word = {}
    for row, word in enumerate(words_idx):
        row_of_word.setdefault(word, row)

    for doc in docs:
        for a in doc['assignment']:
            try:
                word_j = row_of_word[a['word']]
            except KeyError:
                # Same exception type list.index() raises for unknown items.
                raise ValueError(f"{a['word']!r} is not in list") from None
            m[word_j, a['topic']] += 1
    return m


def docs_vs_topics_matrix(docs, ntopics):
    """Count how many words of each document are currently assigned to each topic.

    Args:
        docs: list of documents; each has an 'assignment' list of dicts with
            a 'topic' key. A document's position in this list is its row in
            the result.
        ntopics: number of topics (columns of the result).

    Returns:
        A float array of shape `(len(docs), ntopics)` where entry `[d, t]`
        is the number of words of document `d` assigned to topic `t`.
    """
    m = np.zeros((len(docs), ntopics))
    # enumerate() gives each document its own row. Looking the row up with
    # docs.index(doc) compares dicts by value, so two equal documents would
    # both be credited to the first one's row (and each lookup is a linear
    # scan with deep comparisons).
    for doc_i, doc in enumerate(docs):
        for a in doc['assignment']:
            m[doc_i, a['topic']] += 1
    return m

# Minimal fixture: two documents of two words each, with every word of
# document 0 assigned to topic 0 and every word of document 1 to topic 1.
test_docs = [
    {'id': 0, 'assignment': [{'word': 'A', 'topic': 0}, {'word': 'AA', 'topic': 0}]},
    {'id': 1, 'assignment': [{'word': 'B', 'topic': 1}, {'word': 'BB', 'topic': 1}]},
]

# TEST
# Element-wise comparison against the expected counts; the notebook displays
# the resulting boolean array (all True means the matrix is correct).
words_vs_topics_matrix(test_docs, words_idx=['A', 'AA', 'B', 'BB'], ntopics=2) == np.array([[1,0],[1,0],[0,1],[0,1]])


array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])

In [5]:

def prob_w_belongs_t_in_d(words_vs_topics, docs_vs_topics, d, w, t, old_topic, alpha, beta, debug=False):
    """Return the unnormalized probability that word `w` in doc `d` belongs to topic `t`.

    P(w belongs to t) = P(word w | topic t) * P(topic t | doc d)
                      = proportion(word w in topic t) * proportion(topic t in doc d)

    Both proportions are smoothed:

        (words_vs_topics[w, t] + beta)  / (sum(words_vs_topics[:, t]) + beta * nwords)
      * (docs_vs_topics[d, t]  + alpha) / (sum(docs_vs_topics[d, :])  + alpha * ntopics)

    When `t` is the topic the word is currently assigned to (`old_topic`),
    that assignment is excluded by subtracting 1 from every count above.

    Args:
        words_vs_topics: array of shape (nwords, ntopics) with word/topic counts.
        docs_vs_topics: array of shape (ndocs, ntopics) with doc/topic counts.
        d: row of the document in `docs_vs_topics`.
        w: row of the word in `words_vs_topics`.
        t: candidate topic (column in both matrices).
        old_topic: topic the word is currently assigned to.
        alpha: smoothing added to the doc/topic counts.
        beta: smoothing added to the word/topic counts.
        debug: if True, print the inputs and intermediate terms and wait for
            the user to press Enter (calls `input()`).

    Returns:
        The product of the two smoothed proportions (not normalized over topics).
    """
    nwords, ntopics = words_vs_topics.shape

    if debug:
        print("words_vs_topics=\n", words_vs_topics, words_vs_topics[w, t], (w,t))
        print("docs_vs_topics=\n", docs_vs_topics, docs_vs_topics[d, t], (d,t))
        print("t=", t)
        print("d=", d)
        print("w=", w)
        print("old_topic=", old_topic)

    if old_topic == t:
        # Note: as we are calculating the new topic assignment for w, we need
        # to remove the old assignment from the counts. That is why 1 is
        # subtracted in the calculations below.
        assert words_vs_topics[w, t] > 0
        assert docs_vs_topics[d, t] > 0
        own = 1
    else:
        own = 0

    # Both branches must normalize the word term with beta * nwords: the
    # numerator adds beta once for each of the nwords words of the topic.
    word_numer = words_vs_topics[w, t] - own + beta
    word_denom = words_vs_topics[:, t].sum() - own + beta * nwords
    topic_numer = docs_vs_topics[d, t] - own + alpha
    topic_denom = docs_vs_topics[d, :].sum() - own + alpha * ntopics

    if debug:
        if own:
            print("REMOVING OLD ASIGNATION")
        print(word_numer / word_denom)
        print(topic_numer / topic_denom)
        input()

    return word_numer / word_denom * topic_numer / topic_denom


def dirichlet_allocation(docs, docs_idx, words_idx, ntopics, niter=400, alpha=0.5, beta=0.01, debug=False):
    """Assign a topic to every word of every document by iterative resampling.

    Every word occurrence starts with a uniformly random topic. Then, for
    `niter` sweeps, the word/topic and doc/topic count matrices are rebuilt
    from the current assignments and each word occurrence is given a new
    topic drawn with probability proportional to `prob_w_belongs_t_in_d`.

    NOTE: the count matrices are rebuilt once per sweep, so reassignments
    made during a sweep do not influence the other words of that same sweep.

    Args:
        docs: list of document dicts with keys 'id' and 'words'. Modified in
            place: an 'assignment' list of `{'word', 'topic'}` dicts is
            (re)created on every document.
        docs_idx: list of document ids; a document's row is the position of
            its id in this list.
        words_idx: list of vocabulary words; a word's row is its position in
            this list.
        ntopics: number of topics to infer.
        niter: number of sweeps over the corpus.
        alpha: smoothing for the doc/topic proportions.
        beta: smoothing for the word/topic proportions.
        debug: forwarded to `prob_w_belongs_t_in_d`; also prints a trace line
            for every (doc, word, topic) evaluation.

    Returns:
        The same `docs` list, with every document's 'assignment' updated.

    Raises:
        ValueError: if a document id is missing from `docs_idx` or an
            assigned word is missing from `words_idx`.
    """
    for doc in docs:
        # Get a random initial assignment for each word in each document.
        initial_topics = np.random.randint(0, ntopics, len(doc['words']))
        doc['assignment'] = [
            {'word': word, 'topic': topic}
            for word, topic in zip(doc['words'], initial_topics)
        ]

    # Resolve positions through dicts built once, instead of calling
    # list.index() (a linear scan) inside the innermost loops. setdefault
    # keeps the first position, matching list.index().
    doc_row = {}
    for row, doc_id in enumerate(docs_idx):
        doc_row.setdefault(doc_id, row)
    word_row = {}
    for row, word in enumerate(words_idx):
        word_row.setdefault(word, row)

    topic_ids = range(ntopics)

    for _ in range(niter):
        words_vs_topics = words_vs_topics_matrix(docs, words_idx, ntopics)
        docs_vs_topics = docs_vs_topics_matrix(docs, ntopics)

        for doc in docs:
            try:
                doc_i = doc_row[doc['id']]
            except KeyError:
                # Same exception type list.index() raises for unknown items.
                raise ValueError(f"{doc['id']!r} is not in list") from None

            for assign in doc['assignment']:
                # Unknown words were already rejected while building
                # words_vs_topics above, so this lookup cannot fail here.
                word_j = word_row[assign['word']]
                old_topic = assign['topic']

                prob_by_topic = []  # unnormalized
                for topic_k in topic_ids:
                    if debug:
                        print(f"==>> doc {doc_i}, word {word_j}, calculating for topic {topic_k} (old topic was {old_topic})")
                    prob_by_topic.append(
                        prob_w_belongs_t_in_d(
                            words_vs_topics, docs_vs_topics, doc_i, word_j, topic_k,
                            old_topic=old_topic, alpha=alpha, beta=beta, debug=debug,
                        )
                    )

                # random.choices accepts unnormalized weights.
                assign['topic'] = random.choices(topic_ids, prob_by_topic, k=1)[0]

    return docs


# Infer 3 topics with 100 sweeps over the corpus. The call adds an
# 'assignment' list to every document in place and returns the same list.
docs = dirichlet_allocation(docs, docs_idx, words_idx, ntopics=3, niter=100)
# docs

In [6]:
def normalize_counter(c):
    """Scale the values of `c` in place so that they add up to 1.

    Args:
        c: a Counter (or any dict-like mapping) with numeric values.

    Returns:
        The same object `c`, with every value divided by the sum of all
        values. An empty mapping is returned unchanged.
    """
    # The 0.0 start value forces a float total even for integer counts.
    denominator = sum(c.values(), 0.0)
    # Snapshot the items so the mapping can be rewritten while looping.
    for key, count in list(c.items()):
        c[key] = count / denominator
    return c

# For every document, print the true topic mixture it was generated with,
# followed by the share of its words assigned to each inferred topic.
# Inferred topics are plain integers, so they have to be matched to the true
# topic ids by eye.
for (i, d) in enumerate(docs):
    print(i, d['topics_weights'])
    # Number of words of this document assigned to each inferred topic.
    topic_list = Counter([a['topic'] for a in d['assignment']])
    # normalize_counter works in place; it is given a copy of the counts.
    a = normalize_counter(Counter(topic_list))
    print(i, sorted(a.items()))

0 [('science', 0.24456755248379008), ('sports', 0.75543244751621)]
0 [(0, 0.01), (1, 0.77), (2, 0.22)]
1 [('sports', 0.35727722282000307), ('politics', 0.6427227771799969)]
1 [(0, 0.62), (1, 0.37), (2, 0.01)]
2 [('sports', 0.542989916612604), ('science', 0.45701008338739607)]
2 [(1, 0.56), (2, 0.44)]
3 [('science', 0.39538758109917665), ('sports', 0.6046124189008233)]
3 [(1, 0.67), (2, 0.33)]
4 [('science', 0.07236994958194534), ('sports', 0.9276300504180547)]
4 [(0, 0.01), (1, 0.92), (2, 0.07)]
5 [('sports', 0.43887033509303514), ('politics', 0.5611296649069649)]
5 [(0, 0.58), (1, 0.42)]
6 [('sports', 0.11407328212216619), ('politics', 0.8859267178778338)]
6 [(0, 0.92), (1, 0.08)]
7 [('science', 0.6406622574014711), ('sports', 0.3593377425985289)]
7 [(1, 0.35), (2, 0.65)]
8 [('science', 0.09343523903964185), ('sports', 0.9065647609603581)]
8 [(1, 0.93), (2, 0.07)]
9 [('sports', 0.4143566061847329), ('science', 0.5856433938152672)]
9 [(1, 0.42), (2, 0.58)]
10 [('science', 0.59362657133

## TODO: this works, but it needs a speed-up