In [1]:
from __future__ import division
import math, random, re
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import requests

In [2]:
def sample_from(weights): 
    """i를 weights[i] / sum(weights)의 확률로 반환"""
    total = sum(weights)
    rnd = total * random.random()    # 0과 total 사이를 균일하게 선택 (Return the next random floating point number in the range [0.0, 1.0).)
    for i, w in enumerate(weights):
        rnd -= w                     # 밑의 주석의 식을 만족하는 가장 작은 i를 반환
        if rnd <= 0: return i        # weights[0] + ... + weights[i] >= rnd

In [3]:
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [4]:
K = 4

In [11]:
# 각 토픽이 각 문서에 할당되는 횟수
# Counter로 구성된 list
# 각각의 Counter는 각 문서를 의미함
document_topic_counts = [Counter() for _ in documents]
document_topic_counts

[Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter(),
 Counter()]

In [12]:
# 각 단어가 각 토픽에 할당되는 횟수
# Counter로 구성된 list
topic_word_counts = [Counter() for _ in range(K)]
topic_word_counts

[Counter(), Counter(), Counter(), Counter()]

In [13]:
# 각 토픽에 할당되는 총 단어 수
# 숫자로 구성된 list
# 각각의 숫자는 각 토픽을 의미함
topic_counts = [0 for _ in range(K)]
topic_counts

[0, 0, 0, 0]

In [10]:
# 각 문서에 포함되는 총 단어 수
# 숫자로 구성된 list
# 각각의 숫자는 각 문서를 의미함
document_lengths = list(map(len, documents))
document_lengths

[7, 5, 6, 5, 4, 6, 4, 4, 4, 4, 3, 4, 3, 5, 3]

In [14]:
# 단어 종류 수
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)
distinct_words

{'Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory'}

In [31]:
D = len(documents)

In [21]:
document_topic_counts[3][1]

0

In [15]:
topic_word_counts[2]["nlp"]

0

In [24]:
def p_topic_given_document(topic, d, alpha=0.1):
    """문서 d의 모든 단어 중에서 topic에 속하는
    단어의 비율 (smoothing을 더한 비율)"""
    return ((document_topic_counts[d][topic] + alpha) /
           (document_lengths[d] + K * alpha))

In [25]:
def p_word_given_topic(word, topic, beta=0.1):
    """topic에 속한 단어 중에서 word의 비율 (smoothing을 더한 비율)"""
    return ((topic_word_counts[topic][word] + beta) / 
           (topic_counts[topic] + W * beta))

In [26]:
def topic_weight(d, word, k):
    """문서와 문서의 단어가 주어지면,
    k번째 토픽의 weight를 반환"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

In [27]:
def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

In [28]:
# 문서의 각 단어가 임의의 토픽의 것이라고 가정한다.
random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]


In [29]:
# 문서의 각 단어가 임의의 토픽의 것이라고 가정한다. -> randrange로 할당
document_topics

[[3, 3, 0, 2, 3, 3, 2],
 [3, 2, 1, 1, 2],
 [1, 0, 2, 1, 2, 0],
 [0, 2, 3, 0, 2],
 [3, 2, 1, 3],
 [3, 2, 0, 0, 0, 3],
 [0, 3, 2, 1],
 [2, 0, 1, 1],
 [1, 1, 3, 0],
 [0, 2, 3, 0],
 [2, 2, 0],
 [2, 1, 2, 3],
 [0, 3, 2],
 [1, 2, 1, 1, 1],
 [0, 2, 3]]

In [32]:
# 문서내 토픽, 토픽내 단어 등장 카운트 하기

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1 # 문서의 토픽 매칭 카운트+1
        topic_word_counts[topic][word] += 1  # 토픽의 단어 매칭 카운트+1
        topic_counts[topic] += 1             # 토픽의 단어 개수+1

In [33]:
# document_topics에 나온 "문서에 매칭되는 토픽 개수"를 세어 놓음
document_topic_counts[0]

# [3, 3, 0, 2, 3, 3, 2] -> 3 4번, 0 1번 등...

Counter({3: 4, 0: 1, 2: 2})

In [34]:
topic_word_counts[1]

Counter({'Cassandra': 1,
         'HBase': 1,
         'Python': 1,
         'numpy': 1,
         'decision trees': 1,
         'theory': 1,
         'Mahout': 1,
         'neural networks': 2,
         'deep learning': 2,
         'databases': 1,
         'Postgres': 1,
         'MySQL': 1,
         'MongoDB': 1})

In [36]:
# 깁슨 샘플링. 계속 돌면 수렴한다.
for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [37]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0: print(k, word, count)

topic_names = ["Big Data and programming languages",
               "Python and statistics",
               "databases",
               "machine learning"]
i=0
for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    print(document_topic_counts[i])
    i = i+1
    for topic, count in topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)
    print("")

0 regression 2
0 libsvm 2
0 machine learning 2
0 neural networks 2
0 scikit-learn 1
0 support vector machines 1
0 probability 1
0 Mahout 1
0 mathematics 1
1 Postgres 2
1 MongoDB 2
1 Cassandra 1
1 MySQL 1
1 artificial intelligence 1
1 C++ 1
2 Java 3
2 HBase 3
2 Big Data 3
2 Hadoop 2
2 Cassandra 1
2 C++ 1
2 artificial intelligence 1
2 NoSQL 1
2 Spark 1
2 Storm 1
2 Haskell 1
2 programming languages 1
2 MapReduce 1
2 databases 1
3 Python 4
3 R 4
3 statistics 3
3 probability 2
3 deep learning 2
3 pandas 2
3 statsmodels 2
3 decision trees 1
3 theory 1
3 regression 1
3 scipy 1
3 scikit-learn 1
3 numpy 1
['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Counter({2: 7, 3: 0, 0: 0, 1: 0})
databases 7

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
Counter({1: 3, 2: 2, 3: 0, 0: 0})
Python and statistics 3
databases 2

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Counter({3: 6, 1: 0, 0: 0, 2: 0})
machine learning 6

['R', 'Python', 'statistics'

In [38]:
document_topics

[[2, 2, 2, 2, 2, 2, 2],
 [2, 1, 1, 2, 1],
 [3, 3, 3, 3, 3, 3],
 [3, 3, 3, 3, 3],
 [0, 0, 3, 0],
 [3, 3, 2, 2, 2, 2],
 [3, 0, 0, 3],
 [0, 0, 0, 0],
 [0, 3, 2, 2],
 [2, 2, 2, 2],
 [3, 3, 3],
 [1, 3, 1, 3],
 [3, 3, 3],
 [2, 2, 1, 1, 1],
 [0, 0, 0]]