# Topic Analysis

## LDA

In [1]:
import random
from collections import Counter

#### 데이터

In [2]:
# The corpus: each document is a list of interest keywords (its "words").
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

#### 변수 선언

In [4]:
# Bookkeeping tables needed to define the conditional distributions.

# Number of topics.
K = 4

# 1. How many times each topic is assigned within each document.
#    One Counter per document, keyed by topic index.
document_topic_counts = [Counter() for _ in range(len(documents))]

# 2. How many times each word is assigned to each topic.
#    One Counter per topic, keyed by word.
topic_word_counts = [Counter() for _ in range(K)]

# 3. Total number of words assigned to each topic (one int per topic).
topic_counts = [0] * K

# 4. Total number of words in each document (one int per document).
document_lengths = list(map(len, documents))

# 5. Vocabulary: the distinct words across all documents, and its size.
distinct_words = {word for document in documents for word in document}
W = len(distinct_words)

# 6. Total number of documents.
D = len(documents)

#### 새로운 topic 계산하기

def p_topic_given_document(topic, d, alpha=0.1):
    """Return the smoothed fraction of document ``d``'s words assigned to ``topic``.

    ``alpha`` is added to the topic's count and ``K * alpha`` to the document
    length, so a topic with no words in the document still gets a non-zero
    value.
    """
    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_total = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_total

def p_word_given_topic(word, topic, beta=0.1):
    """Return the smoothed fraction of ``topic``'s assigned words that are ``word``.

    ``beta`` is added to the word's count and ``W * beta`` to the topic's total,
    so a word never assigned to the topic still gets a non-zero value.
    """
    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + W * beta
    return smoothed_count / smoothed_total

def topic_weight(d, word, k):
    """Return the unnormalised weight of topic ``k`` for ``word`` in document ``d``.

    The weight is the product of the word's smoothed share of the topic and
    the topic's smoothed share of the document.
    """
    word_share = p_word_given_topic(word, k)
    topic_share = p_topic_given_document(k, d)
    return word_share * topic_share

In [5]:
def choose_new_topic(d, word):
    """Sample a topic index for ``word`` in document ``d``.

    Computes ``topic_weight`` for every topic 0..K-1 and passes the resulting
    list to ``sample_from``.
    """
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)

# Pick an index at random, with probability proportional to its weight.
def sample_from(weights):
    """Return a random index ``i`` chosen with probability ``weights[i] / sum(weights)``.

    Args:
        weights: non-negative numbers, one per candidate index. Any iterable
            is accepted; it is materialised once so it can be both summed
            and scanned.

    Returns:
        The smallest index ``i`` such that the running sum ``weights[0..i]``
        reaches a point drawn uniformly between 0 and the total weight.

    Raises:
        ValueError: if ``weights`` is empty (there is no index to return).
    """
    weights = list(weights)
    if not weights:
        raise ValueError("sample_from() requires at least one weight")

    total = sum(weights)
    rnd = total * random.random()       # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                        # walk the cumulative sum
        if rnd <= 0:
            return i

    # Repeated subtraction can round differently from sum(), leaving rnd a
    # hair above zero after the last weight. Return the last index instead
    # of falling off the end and implicitly returning None.
    return len(weights) - 1

#### Inference

In [6]:
# Fix the seed so the random initialisation below is reproducible.
random.seed(0)

# Number of topics.
K = 4

# Assign every word of every document to a random topic in 0..K-1.
document_topics = [[random.randrange(K) for _ in document]
                   for document in documents]

# From that random assignment, fill in the count tables that the
# conditional probabilities are computed from.
for d, (words, topics) in enumerate(zip(documents, document_topics)):
    for word, topic in zip(words, topics):
        topic_counts[topic] += 1
        topic_word_counts[topic][word] += 1
        document_topic_counts[d][topic] += 1

In [7]:
# Run Gibbs sampling over the (topic-word) and (document-topic) assignments
# using the conditional distributions defined above.
# The sweep counter is unused, so it is named `_` rather than `iter`, which
# would shadow the builtin for the rest of the session.
for _ in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # Remove this word / topic from the counts
            # so that it doesn't influence the weights.
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # Choose a new topic based on the weights.
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # And now add it back to the counts.
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [8]:
# For each topic, list its words from highest to lowest assignment count,
# skipping words whose count has dropped to zero.
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count <= 0:
            continue
        print(k, word, count)

0 Java 3
0 Big Data 3
0 Hadoop 2
0 HBase 1
0 C++ 1
0 Spark 1
0 Storm 1
0 programming languages 1
0 MapReduce 1
0 Cassandra 1
0 deep learning 1
1 HBase 2
1 neural networks 2
1 Postgres 2
1 MongoDB 2
1 machine learning 2
1 Cassandra 1
1 numpy 1
1 decision trees 1
1 deep learning 1
1 databases 1
1 MySQL 1
1 NoSQL 1
1 artificial intelligence 1
1 scipy 1
2 regression 3
2 Python 2
2 R 2
2 libsvm 2
2 scikit-learn 2
2 mathematics 1
2 support vector machines 1
2 Haskell 1
2 Mahout 1
3 statistics 3
3 probability 3
3 Python 2
3 R 2
3 pandas 2
3 statsmodels 2
3 C++ 1
3 artificial intelligence 1
3 theory 1


In [9]:
# Human-readable labels, indexed by topic number (0..K-1).
# NOTE(review): judging by the top-word listing printed for each topic,
# topics 1, 2 and 3 look like databases, machine learning and statistics
# respectively, which does not match the order below. Confirm against the
# current run before relying on these labels.
topic_names = ["Big data and programming languages",
               "python and statistics",
               "databases",
               "machine learning"]

# For each document, print its words followed by the topics assigned to it,
# most frequent first, skipping topics whose count is zero.
# The loop variable is named `doc_topic_counts` so it does not rebind the
# global `topic_counts` list that the sampler and p_word_given_topic use.
for document, doc_topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Big data and programming languages 7
['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
python and statistics 5
['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
python and statistics 2
databases 2
machine learning 2
['R', 'Python', 'statistics', 'regression', 'probability']
machine learning 3
databases 2
['machine learning', 'regression', 'decision trees', 'libsvm']
databases 2
python and statistics 2
['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
databases 3
Big data and programming languages 3
['statistics', 'probability', 'mathematics', 'theory']
machine learning 3
databases 1
['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
databases 2
python and statistics 2
['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
python and statistics 3
Big data and programming languages 1
['Hadoop', 'Java', 'MapReduce', 'Big Data']
Big data and p