In [1]:
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup
from scipy.special import digamma
from textblob import TextBlob
from collections import defaultdict, Counter

In [2]:
def get_wikitext_sentences(url, timeout=10):
    """
    Download a page and return the stripped text of every <p> tag found inside
    its first 'div.mw-parser-output' element.

    input
     - url : str
     - timeout : seconds to wait for the server before giving up : int or float

    output
     - stripped text of each <p> tag, in document order : list

    raises
     - requests.HTTPError if the server answers with an error status
     - IndexError if the page has no 'div.mw-parser-output' element
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    response.raise_for_status()
    parsed = BeautifulSoup(response.text, 'html.parser')
    content = parsed.select('div.mw-parser-output')[0]
    return [tag.get_text().strip() for tag in content.find_all('p')]


def postprocess(paragraphs):
    """
    Remove short bracketed fragments from each paragraph and replace every
    whitespace character with a plain space.

    input
     - paragraphs : iterable of str

    output
     - cleaned paragraphs, in input order : list
    """
    # Raw strings keep the regex escapes (\[, \], \s) away from Python's own
    # string-escape processing, which warns about them in non-raw literals.
    # Pattern: an opening (, { or [, then at most 20 non-newline characters,
    # then a closing ), } or ]. Opener and closer kinds need not match.
    wrapper = re.compile(r'[({\[].{,20}[)}\]]')
    whitespace = re.compile(r'\s')
    return [whitespace.sub(' ', wrapper.sub('', paragraph)) for paragraph in paragraphs]

# 말뭉치 생성하기

세 가지 서로 다른 주제의 Wikipedia article들의 각 문단을 하나의 document로 간주해서 가장 기본 형태의 LDA를 구현한 후, 이 모형이 세 가지 토픽의 구성 단어들을 효과적으로 찾아내는지 확인한다. 세 article의 제목은 각각 아래와 같다. 간단한 전처리를 위해 소괄호, 중괄호, 대괄호로 둘러싸인 짧은(20자 이하) 문자열을 괄호와 함께 삭제했고, `\n`과 같이 공백을 표현하는 문자는 ' '로 일괄 변경했다.

1. Korea

In [3]:
# Download the "Korea" article, clean its paragraphs, and preview the one at index 2.
korea = postprocess(get_wikitext_sentences("https://en.wikipedia.org/wiki/Korea"))
print(korea[2])

Korea is a region in East Asia; since 1945 it has been divided into what are now two distinct sovereign states: North Korea (officially the "Democratic People's Republic of Korea") and South Korea (officially the "Republic of Korea"). Korea consists of the Korean Peninsula, Jeju Island, and several minor islands near the peninsula. It is bordered by China to the northwest and Russia to the northeast. It is separated from Japan to the east by the Korea Strait and the Sea of Japan .


2. Aristotle

In [4]:
# Download the "Aristotle" article, clean its paragraphs, and preview the one at index 2.
aristotle = postprocess(get_wikitext_sentences("https://en.wikipedia.org/wiki/Aristotle"))
print(aristotle[2])

Aristotle  Greek: Ἀριστοτέλης Aristotélēs, pronounced ; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece. Taught by Plato, he was the founder of the Lyceum, the Peripatetic school of philosophy, and the Aristotelian tradition. His writings cover many subjects including physics, biology, zoology, metaphysics, logic, ethics, aesthetics, poetry, theatre, music, rhetoric, psychology, linguistics, economics, politics, and government. Aristotle provided a complex synthesis of the various philosophies existing prior to him. It was above all from his teachings that the West inherited its intellectual lexicon, as well as problems and methods of inquiry. As a result, his philosophy has exerted a unique influence on almost every form of knowledge in the West and it continues to be a subject of contemporary philosophical discussion.


3. Google

In [5]:
# Download the "Google" article, clean its paragraphs, and preview the one at index 2.
google = postprocess(get_wikitext_sentences("https://en.wikipedia.org/wiki/Google"))
print(google[2])

Google, LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware. It is considered one of the Big Four technology companies, alongside Amazon, Apple, and Microsoft.


세 부류의 문단들을 하나로 합쳐서 말뭉치를 만들고, 문단들의 순서를 랜덤하게 섞는다.

In [6]:
# Concatenate the paragraphs of the three articles into a single corpus.
documents = korea + aristotle + google
# Drawing every element exactly once (replace=False) yields a random permutation.
documents = list(np.random.choice(documents, len(documents), replace=False))

# 모형 구현하기

In [7]:
# Number of documents (paragraphs) in the shuffled corpus.
len(documents)

322

총 322개의 문서가 있고, 한 iteration마다 전체 문서에서 batchsize만큼의 문서를 샘플링해서 variational parameter들을 업데이트한다. batchsize가 총 문서의 개수와 같으면(즉, 이 예시에서는 322면) 일반적인 LDA와 같은 결과를 얻을 수 있다. 

In [10]:
class OnlineLDA:
    """
    Container for the settings of an online LDA model.

    Only the constructor is defined here; it records the corpus size, the
    number of topics, and an auto-growing word-to-index map.
    """

    def __init__(self, corpus, n_documents, n_topics):
        """
        input
         - corpus : accepted but not used by this constructor
         - n_documents : int, stored as D
         - n_topics : int, stored as K
        """
        # Looking up an unseen word assigns it the next free index, i.e. the
        # current size of the map.
        self.word2idx = defaultdict(lambda: len(self.word2idx))
        self.D = n_documents
        # Was `n_topic`, an undefined name that raised NameError on construction.
        self.K = n_topics
        

In [11]:
def extract_nouns(document):
    """
    Collect the words of a document whose TextBlob part-of-speech tag begins with 'NN'.
    (** domain specific : the tagger used here is TextBlob **)

    input
     - document : str

    output
     - words tagged 'NN*', in the order TextBlob reports them : list
    """
    nouns = []
    for word, pos in TextBlob(document).tags:
        if pos.startswith('NN'):
            nouns.append(word)
    return nouns

In [12]:
# # Sanity check that get_batch_indices draws indices roughly uniformly

# import matplotlib.pyplot as plt
# upper_limit = 10
# size = 10000

# indices = get_batch_indices(size, upper_limit) # generate indices from discrete random uniform of specified size
# result = sorted(Counter(indices).items(), key=lambda x: x[0])
# plt.bar([pair[0] for pair in result], [pair[1] for pair in result])
# plt.show()

In [13]:
def to_indexset(document, word2idx):
    """
    Map every noun extracted from a document to its index in the word-index map.

    input
     - document : str
     - word2idx : mapping from word to index (a defaultdict per the original notes)

    output
     - one index per extracted noun, in extraction order : list
    """
    indices = []
    for noun in extract_nouns(document):
        indices.append(word2idx[noun])
    return indices

In [14]:
def get_batch_indices(size, D):
    """
    Draw `size` values from np.random.random, scale them by D, and truncate each to int.

    input
     - size : int
     - D : int

    output
     - list of `size` Python ints
    """
    uniform_draws = np.random.random((size,))
    scaled = uniform_draws * D
    return [int(value) for value in scaled]

In [16]:
def batch_call(reference, D, batchsize, word2idx):
    """
    From corpus or object which can be converted into corpus, do:
     1. select mini-batch
     2. convert this into indexset
    (** domain specific : reference is exhaustive list of paragraphs in this example, but it could be just list of URLs to be scraped **)
    
    input
     - reference : list
     - D, batchsize : int
    
    output
     - coordinates : (document index, word index) of documents in the batch
    """
    coordinates = []
    