In [1]:
import random
import pandas as pd
from collections import Counter

# Number of topics
K = 4

# Seed Python's RNG so that the random initial topic assignment and the
# Gibbs sampling run give the same result on every Restart & Run All.
# NOTE: saved cell outputs produced by an earlier, unseeded run will differ.
SEED = 0
random.seed(SEED)

In [2]:
# Toy corpus: each inner list is one document and each string is one word
# (multi-word phrases such as "Big Data" are treated as a single word).
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

In [3]:
# How many words of each document are currently assigned to each topic:
# a list with one Counter per document, keyed by topic index.
document_topic_counts = [Counter() for _document in documents]

# How many times each word is currently assigned to each topic:
# a list with one Counter per topic, keyed by word.
topic_word_counts = [Counter() for _topic in range(K)]

# Total number of words assigned to each topic:
# a list with one integer per topic.
topic_counts = [0] * K

# Total number of words contained in each document:
# a list with one integer per document.
document_lengths = [len(document) for document in documents]



In [4]:
# Inspect the number of words in each document.
document_lengths

[7, 5, 6, 5, 4, 6, 4, 4, 4, 4, 3, 4, 3, 5, 3]

In [5]:

# Vocabulary: the set of distinct words that occur anywhere in the corpus.
distinct_words = {word for document in documents for word in document}
# W is the vocabulary size (number of distinct words).
W = len(distinct_words)
W


36

In [6]:

# Total number of documents
D = len(documents)
D

15

In [7]:
def topic_weight(d, word, topic, alpha=0.1, beta=0.1):
    """Return the unnormalised weight of assigning `word` in document `d` to `topic`.

    The weight is p(word | topic) * p(topic | document), both estimated from
    the module-level count tables with additive smoothing.

    Args:
        d: index of the document.
        word: the word whose topic assignment is being weighed.
        topic: index of the candidate topic.
        alpha: smoothing added to each document-topic count
            (defaults to 0.1, the previously hard-coded value).
        beta: smoothing added to each topic-word count
            (defaults to 0.1, the previously hard-coded value).

    Returns:
        A float that is only meaningful relative to the weights of the other
        topics for the same (d, word); it is positive for positive alpha and
        beta and non-negative counts.
    """
    # Smoothed fraction of the words assigned to `topic` that equal `word`.
    p_word_given_topic = ((topic_word_counts[topic][word] + beta) /
                          (topic_counts[topic] + W * beta))

    # Smoothed fraction of the words in document `d` that are assigned to `topic`.
    p_topic_given_document = ((document_topic_counts[d][topic] + alpha) /
                              (document_lengths[d] + K * alpha))

    return p_word_given_topic * p_topic_given_document

In [8]:
def choose_new_topic(d, word):
    """Sample a topic index for `word` in document `d`.

    Each topic k in range(K) is drawn with probability proportional to
    topic_weight(d, word, k).

    Returns:
        An int in range(K).

    Raises:
        ValueError: if there are no topics to choose from (K == 0).
    """

    def sample_from(weights):
        """Return i with probability weights[i] / sum(weights)."""
        if not weights:
            raise ValueError("cannot sample from an empty list of weights")
        total = sum(weights)
        rnd = total * random.random()  # uniform between 0 and total
        for i, p in enumerate(weights):
            rnd -= p
            # return the smallest i such that weights[0] + ... + weights[i] >= rnd
            if rnd <= 0:
                return i
        # Floating-point rounding in the running subtraction can leave rnd
        # marginally above 0 after the last weight. Without this fallback the
        # function would fall off the end and return None, which the caller
        # would then use as a topic index.
        return len(weights) - 1

    return sample_from([topic_weight(d, word, topic) for topic in range(K)])

In [9]:
# Start from a uniformly random topic for every word of every document.
document_topics = [
    [random.randrange(K) for _word in doc_words]
    for doc_words in documents
]
document_topics

[[2, 0, 1, 3, 3, 3, 1],
 [1, 0, 2, 1, 3],
 [2, 1, 3, 1, 1, 1],
 [2, 2, 1, 0, 2],
 [1, 3, 0, 2],
 [1, 3, 3, 1, 2, 1],
 [0, 1, 1, 1],
 [1, 2, 3, 0],
 [1, 3, 1, 2],
 [0, 3, 0, 1],
 [3, 2, 0],
 [3, 0, 3, 1],
 [3, 0, 3],
 [2, 0, 1, 1, 0],
 [0, 0, 2]]

In [10]:
# Fill the count tables from the initial random assignment.
# NOTE: this only adds to the tables, so re-running this cell without first
# re-creating the (empty) tables counts every word twice.
for d in range(D):
    counts_for_document = document_topic_counts[d]
    for word, topic in zip(documents[d], document_topics[d]):
        counts_for_document[topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

In [11]:
# Inspect the per-document topic counts built from the initial random assignment.
document_topic_counts

[Counter({2: 1, 0: 1, 1: 2, 3: 3}),
 Counter({1: 2, 0: 1, 2: 1, 3: 1}),
 Counter({2: 1, 1: 4, 3: 1}),
 Counter({2: 3, 1: 1, 0: 1}),
 Counter({1: 1, 3: 1, 0: 1, 2: 1}),
 Counter({1: 3, 3: 2, 2: 1}),
 Counter({0: 1, 1: 3}),
 Counter({1: 1, 2: 1, 3: 1, 0: 1}),
 Counter({1: 2, 3: 1, 2: 1}),
 Counter({0: 2, 3: 1, 1: 1}),
 Counter({3: 1, 2: 1, 0: 1}),
 Counter({3: 2, 0: 1, 1: 1}),
 Counter({3: 2, 0: 1}),
 Counter({2: 1, 0: 2, 1: 2}),
 Counter({0: 2, 2: 1})]

In [12]:
# Collapsed Gibbs sampling: repeatedly resample the topic of every word in
# every document, keeping the count tables in sync with document_topics.
for epoch in range(1000): # number of full sweeps over the corpus
    for d in range(D): # each document
        for i, (word, topic) in enumerate(zip(documents[d],document_topics[d])):
            
            # Gibbs sampling: remove one topic assignment z and resample it from
            # its conditional probability given all the other assignments (-z).
            
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1 # per-document topic count
            topic_word_counts[topic][word] -= 1 # per-topic word count
            topic_counts[topic] -= 1 # per-topic total word count
            document_lengths[d] -= 1 # per-document word count (denominator used by topic_weight)
            
            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            
            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1 # per-document topic count
            topic_word_counts[new_topic][word] += 1 # per-topic word count
            topic_counts[new_topic] += 1 # per-topic total word count
            document_lengths[d] += 1 # per-document word count

In [13]:
# Summarise each topic by its most frequent words, formatted as "word(count)".
# Rows are ranks (Top1..TopN), columns are topics (Topic1..TopicK).
top_n = 5  # number of words shown per topic
df = pd.DataFrame(columns=['Topic' + str(k) for k in range(1, K + 1)],  # follows K instead of assuming 4 topics
                  index=['Top' + str(i) for i in range(1, top_n + 1)])

for k, word_counts in enumerate(topic_word_counts):
    # A Counter keeps keys whose count was decremented to 0 during sampling,
    # so drop them instead of reporting words that no longer belong to the topic.
    ranked = [(word, count) for word, count in word_counts.most_common() if count > 0]
    for ix, (word, count) in enumerate(ranked[:top_n]):  # top `top_n` words of this topic
        df.loc['Top' + str(ix + 1), 'Topic' + str(k + 1)] = word + '({})'.format(count)

In [14]:
# Show the top words per topic as plain text.
print(df)

                  Topic1               Topic2        Topic3  \
Top1          MongoDB(2)            Python(4)       Java(3)   
Top2            HBase(2)      scikit-learn(2)     Hadoop(2)   
Top3         Postgres(2)       statsmodels(2)   Big Data(2)   
Top4  neural networks(1)            pandas(2)  Cassandra(1)   
Top5    deep learning(1)  machine learning(2)      Spark(1)   

                          Topic4  
Top1               regression(3)  
Top2                        R(3)  
Top3               statistics(3)  
Top4              probability(3)  
Top5  artificial intelligence(2)  
