# Week 8-5 토픽 모델링 실습

In [1]:
#!pip install python-Levenshtein # Warning을 피하기 위해서 설치

In [2]:
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from gensim import corpora
import gensim
from nltk.tokenize import RegexpTokenizer

# Tokenizer that extracts runs of word characters, which drops punctuation.
# The pattern is a raw string so that `\w` reaches the regex engine as-is
# instead of being parsed as an (invalid) string escape, which Python warns about.
tokenizer = RegexpTokenizer(r'[\w]+')

# English stop-word list from NLTK.
# NOTE: requires the NLTK "stopwords" corpus to be installed locally
# (e.g. via nltk.download('stopwords')).
stop_words = stopwords.words('english')

# Porter stemmer used to reduce each token to its stem.
p_stemmer = PorterStemmer()

In [3]:
# Example documents: the first five are about everyday life (food, family,
# health) and the last five are about big data.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
    "Big data is a term used to refer to data sets that are too large or complex for traditional data-processing application software to adequately deal with.",
    "Data with many cases offer greater statistical power, while data with higher complexity may lead to a higher false discovery rate",
    "Big data was originally associated with three key concepts: volume, variety, and velocity.",
    "A 2016 definition states that 'Big data represents the information assets characterized by such a high volume, velocity and variety to require specific technology and analytical methods for its transformation into value'.",
    "Data must be processed with advanced tools to reveal meaningful information.",
]

# Keep the individual names available as well, one per document.
(doc_a, doc_b, doc_c, doc_d, doc_e,
 doc_f, doc_g, doc_h, doc_i, doc_j) = doc_set

In [4]:
# Preprocessing: lowercase -> tokenize -> remove stop words -> stem.

# Set copy of the stop-word list so each membership test is a hash lookup
# rather than a linear scan of the list for every token.
stop_word_set = set(stop_words)

texts = []  # one list of stemmed tokens per document, in doc_set order

for doc in doc_set:
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [token for token in tokens if token not in stop_word_set]
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    texts.append(stemmed_tokens)

In [5]:
# Build the vocabulary and convert every document to Bag-of-Words form.

# Vocabulary mapping each distinct stemmed token to an integer id.
dictionary = corpora.Dictionary(texts)

# One Bag-of-Words vector per document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

print(dictionary.token2id)
print(corpus[0])  # Bag-of-Words vector of the first document

{'brocolli': 0, 'brother': 1, 'eat': 2, 'good': 3, 'like': 4, 'mother': 5, 'around': 6, 'basebal': 7, 'drive': 8, 'lot': 9, 'practic': 10, 'spend': 11, 'time': 12, 'blood': 13, 'caus': 14, 'expert': 15, 'health': 16, 'increas': 17, 'may': 18, 'pressur': 19, 'suggest': 20, 'tension': 21, 'better': 22, 'feel': 23, 'never': 24, 'often': 25, 'perform': 26, 'school': 27, 'seem': 28, 'well': 29, 'profession': 30, 'say': 31, 'adequ': 32, 'applic': 33, 'big': 34, 'complex': 35, 'data': 36, 'deal': 37, 'larg': 38, 'process': 39, 'refer': 40, 'set': 41, 'softwar': 42, 'term': 43, 'tradit': 44, 'use': 45, 'case': 46, 'discoveri': 47, 'fals': 48, 'greater': 49, 'higher': 50, 'lead': 51, 'mani': 52, 'offer': 53, 'power': 54, 'rate': 55, 'statist': 56, 'associ': 57, 'concept': 58, 'key': 59, 'origin': 60, 'three': 61, 'varieti': 62, 'veloc': 63, 'volum': 64, '2016': 65, 'analyt': 66, 'asset': 67, 'character': 68, 'definit': 69, 'high': 70, 'inform': 71, 'method': 72, 'repres': 73, 'requir': 74, 'spe

In [6]:
# Train the LDA topic model.

NUM_TOPICS = 3     # number of latent topics to extract
RANDOM_SEED = 42   # LDA training is randomized; a fixed seed makes re-runs reproducible

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=RANDOM_SEED,
)

# Show the 5 highest-weighted words of each topic.
ldamodel.print_topics(num_words=5)

[(0,
  '0.071*"health" + 0.032*"may" + 0.029*"brocolli" + 0.029*"pressur" + 0.029*"drive"'),
 (1,
  '0.069*"data" + 0.034*"good" + 0.033*"eat" + 0.033*"brocolli" + 0.032*"process"'),
 (2,
  '0.044*"data" + 0.027*"mother" + 0.027*"brother" + 0.027*"drive" + 0.025*"higher"')]

In [7]:
# Topic distribution of the first document, as (topic_id, probability) pairs.
ldamodel.get_document_topics(corpus)[0]

[(0, 0.03525039), (1, 0.9291788), (2, 0.035570867)]

In [None]:
#!pip install pyLDAvis

In [8]:
# Topic visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Convert the trained model, corpus and vocabulary into pyLDAvis display data.
prepared_data = gensimvis.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
)
prepared_data

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps
