## 예제 1 로이터 뉴스 데이터 (https://pypi.org/project/lda/)

In [40]:
!pip install lda



In [41]:
import numpy as np
import lda
import lda.datasets
# Load the Reuters sample corpus bundled with the `lda` package:
#   X      - integer count matrix (one row per document, one column per vocab word)
#   vocab  - tuple of vocabulary words
#   titles - tuple of document titles
X, vocab, titles = (
    lda.datasets.load_reuters(),
    lda.datasets.load_reuters_vocab(),
    lda.datasets.load_reuters_titles(),
)
X.shape

(395, 4258)

In [42]:
X

array([[1, 0, 1, ..., 0, 0, 0],
       [7, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [43]:
vocab

('church',
 'pope',
 'years',
 'people',
 'mother',
 'last',
 'told',
 'first',
 'world',
 'year',
 'president',
 'teresa',
 'charles',
 'catholic',
 'during',
 'life',
 'u.s',
 'city',
 'public',
 'time',
 'since',
 'family',
 'king',
 'former',
 'british',
 'harriman',
 'against',
 'country',
 'vatican',
 'made',
 'three',
 'hospital',
 'minister',
 'home',
 'died',
 'tuesday',
 'government',
 "n't",
 'million',
 'prince',
 'john',
 'very',
 'war',
 'say',
 'order',
 'day',
 'political',
 'leader',
 'heart',
 'roman',
 'yeltsin',
 'later',
 'france',
 'party',
 'monday',
 'death',
 'clinton',
 'sunday',
 'elvis',
 'state',
 'diana',
 'royal',
 'wednesday',
 'churchill',
 'official',
 'service',
 'including',
 'film',
 'while',
 'week',
 'left',
 'work',
 'paris',
 'around',
 'american',
 'saying',
 'take',
 'doctors',
 'group',
 'son',
 'queen',
 'house',
 'police',
 'national',
 'under',
 'surgery',
 'next',
 'peace',
 'paul',
 'long',
 'ceremony',
 'month',
 'thursday',
 'french',


In [44]:
titles

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25",
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25",
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26',
 '10 UK: Britain tells Charles to forget Camilla. LONDON 1996-08-27',
 "11 COTE D'IVOIRE: FEATURE - Quiet homecoming for reprieved Ivory Coast maid. ABIDJAN 1996-08-28",


In [45]:
# Fit a 20-topic LDA model on the Reuters count matrix, then list the
# highest-weighted words of every topic.
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
vocab_array = np.array(vocab)  # converted once so it can be fancy-indexed below
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the first n_top_words indices.
    top_word_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_word_ids]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -

Topic 0: british churchill sale million major letters west britain
Topic 1: church government political country state people party against
Topic 2: elvis king fans presley life concert young death
Topic 3: yeltsin russian russia president kremlin moscow michael operation
Topic 4: pope vatican paul john surgery hospital pontiff rome
Topic 5: family funeral police miami versace cunanan city service
Topic 6: simpson former years court president wife south church
Topic 7: order mother successor election nuns church nirmala head
Topic 8: charles prince diana royal king queen parker bowles
Topic 9: film french france against bardot paris poster animal
Topic 10: germany german war nazi letter christian book jews
Topic 11: east peace prize award timor quebec belo leader
Topic 12: n't life show told very love television father
Topic 13: years year time last church world people say
Topic 14: mother teresa heart calcutta charity nun hospital missionaries
Topic 15: city salonika capital buddhist c

## 예제 2 뉴스그룹(LSA 동일 예제)

In [46]:
!pip install stopwords



In [47]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk

# Download the 20 newsgroups corpus with headers, footers and quoted replies
# stripped from every post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

dataset.target_names  # the category names are available in target_names

news_df = pd.DataFrame({'document': documents})
# Replace every non-letter character with a space.
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text completely uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# Drop short words: only words longer than 3 characters are kept.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lowercase everything.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [48]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')  # English stop-word list from NLTK
# Membership is tested once per token, so look words up in a set (O(1))
# instead of scanning the list each time. `stop_words` itself stays a list.
stop_word_set = set(stop_words)
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # whitespace tokenization
# Remove the stop words from every tokenized document.
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [50]:
from gensim import corpora
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(tokenized_doc)
# Convert each tokenized document to its bag-of-words representation.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_doc]

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(60633 unique tokens: ['acts', 'atrocities', 'austria', 'away', 'biased']...)
INFO:gensim.corpora.dictionary:built Dictionary(64281 unique tokens: ['acts', 'atrocities', 'austria', 'away', 'biased']...) from 11314 documents (total 955004 corpus positions)


In [51]:
import gensim
NUM_TOPICS = 20  # number of topics, k=20
# Train LDA with 15 passes over the bag-of-words corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
# Show the 4 highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.05
INFO:gensim.models.ldamodel:using symmetric eta at 0.05
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 20 topics, 15 passes over the supplied corpus of 11314 documents, updating model once every 2000 documents, evaluating perplexity every 11314 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #1 (0.050): 0.005*"people" + 0.004*"could" + 0.003*"would" + 0.003*"also" + 0.003*"time" + 0.003*"work" + 0.002*"think" + 0.002*"like" + 0.002*"well" + 0.002*"going"
INFO:gensim.models.ldamodel:topic #14 (0.050): 0.005*"like" + 0.004*"would" + 0.004*"system" + 0.004*"hard" + 0.004*"people" + 0.004*"disk" + 0.003*"also" + 0.003*"drive" + 0.0

INFO:gensim.models.ldamodel:PROGRESS: pass 1, at document #2000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #7 (0.050): 0.013*"would" + 0.011*"like" + 0.009*"thanks" + 0.009*"card" + 0.009*"know" + 0.009*"problem" + 0.009*"scsi" + 0.008*"anyone" + 0.006*"please" + 0.006*"windows"
INFO:gensim.models.ldamodel:topic #13 (0.050): 0.013*"space" + 0.006*"window" + 0.005*"also" + 0.005*"like" + 0.005*"widget" + 0.004*"would" + 0.004*"science" + 0.004*"nasa" + 0.004*"program" + 0.004*"orbit"
INFO:gensim.models.ldamodel:topic #19 (0.050): 0.009*"game" + 0.008*"team" + 0.008*"year" + 0.007*"good" + 0.006*"would" + 0.006*"like" + 0.006*"last" + 0.005*"think" + 0.005*"first" + 0.005*"play"
INFO:gensim.models.ldamodel:topic #15 (0.050): 0.017*"israel" + 0.010*"israeli" + 0.008*"would" + 0.007*"jews" + 0.007*"arab" + 0.005*"pain" + 0.005*"people" + 0.004*"like" + 0.004*"time" + 0.004*"even"
INFO:gensim.models

INFO:gensim.models.ldamodel:topic #13 (0.050): 0.019*"space" + 0.007*"nasa" + 0.005*"earth" + 0.005*"launch" + 0.005*"science" + 0.005*"moon" + 0.005*"orbit" + 0.004*"also" + 0.004*"window" + 0.004*"lunar"
INFO:gensim.models.ldamodel:topic #4 (0.050): 0.012*"people" + 0.007*"would" + 0.007*"said" + 0.006*"government" + 0.005*"armenian" + 0.005*"armenians" + 0.004*"know" + 0.004*"well" + 0.004*"turkish" + 0.004*"right"
INFO:gensim.models.ldamodel:topic #18 (0.050): 0.008*"power" + 0.007*"price" + 0.007*"back" + 0.006*"sale" + 0.005*"like" + 0.005*"good" + 0.005*"around" + 0.005*"condition" + 0.005*"shipping" + 0.005*"miles"
INFO:gensim.models.ldamodel:topic diff=0.138264, rho=0.339873
INFO:gensim.models.ldamodel:PROGRESS: pass 2, at document #4000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #15 (0.050): 0.023*"israel" + 0.015*"jews" + 0.014*"israeli" + 0.008*"arab" + 0.007*"would" + 0.006*"pain" 

INFO:gensim.models.ldamodel:topic #18 (0.050): 0.009*"power" + 0.008*"back" + 0.007*"price" + 0.007*"good" + 0.007*"sale" + 0.006*"cars" + 0.005*"like" + 0.005*"condition" + 0.005*"shipping" + 0.005*"around"
INFO:gensim.models.ldamodel:topic diff=0.090647, rho=0.321795
INFO:gensim.models.ldamodel:PROGRESS: pass 3, at document #4000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #8 (0.050): 0.022*"medical" + 0.014*"disease" + 0.014*"health" + 0.013*"cancer" + 0.013*"patients" + 0.012*"aids" + 0.010*"doctor" + 0.008*"treatment" + 0.008*"medicine" + 0.007*"drug"
INFO:gensim.models.ldamodel:topic #16 (0.050): 0.018*"file" + 0.013*"windows" + 0.012*"available" + 0.011*"files" + 0.010*"version" + 0.010*"server" + 0.009*"data" + 0.009*"software" + 0.009*"program" + 0.007*"graphics"
INFO:gensim.models.ldamodel:topic #13 (0.050): 0.026*"space" + 0.010*"nasa" + 0.006*"earth" + 0.006*"launch" + 0.006*"orbit" 

INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #7 (0.050): 0.015*"would" + 0.013*"know" + 0.012*"like" + 0.012*"thanks" + 0.011*"anyone" + 0.009*"problem" + 0.009*"card" + 0.008*"please" + 0.008*"need" + 0.007*"system"
INFO:gensim.models.ldamodel:topic #4 (0.050): 0.012*"people" + 0.008*"said" + 0.006*"government" + 0.006*"would" + 0.006*"turkish" + 0.005*"armenian" + 0.005*"armenians" + 0.004*"killed" + 0.004*"guns" + 0.004*"right"
INFO:gensim.models.ldamodel:topic #8 (0.050): 0.023*"medical" + 0.016*"health" + 0.015*"disease" + 0.014*"cancer" + 0.013*"patients" + 0.012*"aids" + 0.010*"doctor" + 0.009*"drug" + 0.009*"treatment" + 0.008*"medicine"
INFO:gensim.models.ldamodel:topic #13 (0.050): 0.029*"space" + 0.011*"nasa" + 0.007*"earth" + 0.006*"launch" + 0.006*"orbit" + 0.006*"science" + 0.006*"shuttle" + 0.005*"moon" + 0.005*"data" + 0.005*"lunar"
INFO:gensim.models.ldamodel:topic #3 (0.050): 0.020*"b

INFO:gensim.models.ldamodel:topic #13 (0.050): 0.031*"space" + 0.012*"nasa" + 0.007*"earth" + 0.007*"launch" + 0.006*"science" + 0.006*"orbit" + 0.006*"shuttle" + 0.006*"data" + 0.005*"moon" + 0.005*"lunar"
INFO:gensim.models.ldamodel:topic #1 (0.050): 0.046*"jesus" + 0.021*"christ" + 0.018*"bible" + 0.012*"church" + 0.011*"christian" + 0.011*"john" + 0.011*"faith" + 0.010*"paul" + 0.009*"matthew" + 0.009*"father"
INFO:gensim.models.ldamodel:topic #19 (0.050): 0.013*"game" + 0.011*"team" + 0.010*"year" + 0.009*"play" + 0.008*"games" + 0.007*"last" + 0.006*"first" + 0.006*"season" + 0.006*"players" + 0.005*"good"
INFO:gensim.models.ldamodel:topic #11 (0.050): 0.028*"file" + 0.019*"output" + 0.013*"widget" + 0.013*"program" + 0.012*"color" + 0.011*"line" + 0.010*"screen" + 0.009*"image" + 0.009*"input" + 0.009*"mouse"
INFO:gensim.models.ldamodel:topic diff=0.035944, rho=0.292891
INFO:gensim.models.ldamodel:PROGRESS: pass 5, at document #6000/11314
INFO:gensim.models.ldamodel:merging chan

INFO:gensim.models.ldamodel:topic #5 (0.050): 0.016*"cursor" + 0.010*"financial" + 0.009*"saves" + 0.008*"vernon" + 0.007*"bone" + 0.006*"bounced" + 0.006*"methanol" + 0.005*"opportunities" + 0.004*"powerplay" + 0.004*"babe"
INFO:gensim.models.ldamodel:topic #8 (0.050): 0.023*"medical" + 0.018*"health" + 0.015*"disease" + 0.013*"cancer" + 0.013*"patients" + 0.012*"aids" + 0.011*"doctor" + 0.010*"drug" + 0.009*"treatment" + 0.009*"drugs"
INFO:gensim.models.ldamodel:topic diff=0.030963, rho=0.281083
INFO:gensim.models.ldamodel:PROGRESS: pass 6, at document #6000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #18 (0.050): 0.009*"good" + 0.008*"sale" + 0.008*"back" + 0.008*"power" + 0.008*"price" + 0.006*"cars" + 0.006*"like" + 0.006*"engine" + 0.005*"used" + 0.005*"offer"
INFO:gensim.models.ldamodel:topic #15 (0.050): 0.030*"israel" + 0.029*"jews" + 0.019*"israeli" + 0.015*"jewish" + 0.012*"arab" + 0.

INFO:gensim.models.ldamodel:topic diff=0.027521, rho=0.270597
INFO:gensim.models.ldamodel:PROGRESS: pass 7, at document #6000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #17 (0.050): 0.034*"bike" + 0.015*"ride" + 0.012*"motorcycle" + 0.011*"riding" + 0.010*"road" + 0.010*"bikes" + 0.009*"paradox" + 0.009*"helmet" + 0.008*"rider" + 0.007*"motorcycles"
INFO:gensim.models.ldamodel:topic #15 (0.050): 0.031*"jews" + 0.031*"israel" + 0.020*"israeli" + 0.016*"jewish" + 0.012*"arab" + 0.008*"land" + 0.007*"pain" + 0.007*"arabs" + 0.006*"peace" + 0.005*"palestinian"
INFO:gensim.models.ldamodel:topic #4 (0.050): 0.012*"people" + 0.008*"said" + 0.007*"armenian" + 0.007*"turkish" + 0.007*"government" + 0.005*"armenians" + 0.005*"would" + 0.005*"police" + 0.005*"guns" + 0.004*"killed"
INFO:gensim.models.ldamodel:topic #16 (0.050): 0.013*"windows" + 0.013*"available" + 0.012*"file" + 0.011*"files" + 0.011*"wi

INFO:gensim.models.ldamodel:topic #3 (0.050): 0.020*"pitt" + 0.019*"banks" + 0.018*"gordon" + 0.015*"surrender" + 0.015*"soon" + 0.015*"skepticism" + 0.014*"intellect" + 0.014*"cadre" + 0.013*"shameful" + 0.013*"henrik"
INFO:gensim.models.ldamodel:topic #0 (0.050): 0.016*"would" + 0.011*"people" + 0.010*"think" + 0.008*"know" + 0.008*"like" + 0.006*"even" + 0.006*"time" + 0.006*"well" + 0.006*"many" + 0.005*"believe"
INFO:gensim.models.ldamodel:topic #2 (0.050): 0.012*"part" + 0.009*"attack" + 0.008*"plastic" + 0.007*"cryptosystem" + 0.006*"courses" + 0.006*"crypt" + 0.006*"japanese" + 0.005*"plaintext" + 0.005*"cryptology" + 0.005*"cryptanalysis"
INFO:gensim.models.ldamodel:topic #9 (0.050): 0.014*"encryption" + 0.012*"chip" + 0.010*"system" + 0.010*"keys" + 0.009*"clipper" + 0.009*"privacy" + 0.008*"security" + 0.008*"government" + 0.007*"data" + 0.007*"used"
INFO:gensim.models.ldamodel:topic diff=0.031885, rho=0.261203
INFO:gensim.models.ldamodel:PROGRESS: pass 8, at document #8000/

INFO:gensim.models.ldamodel:topic #19 (0.050): 0.014*"game" + 0.012*"team" + 0.011*"year" + 0.009*"games" + 0.009*"play" + 0.008*"season" + 0.007*"last" + 0.006*"first" + 0.006*"players" + 0.005*"league"
INFO:gensim.models.ldamodel:topic #8 (0.050): 0.021*"medical" + 0.017*"health" + 0.014*"disease" + 0.014*"cancer" + 0.012*"patients" + 0.012*"doctor" + 0.010*"aids" + 0.010*"food" + 0.010*"drug" + 0.009*"treatment"
INFO:gensim.models.ldamodel:topic #6 (0.050): 0.011*"list" + 0.010*"information" + 0.010*"mail" + 0.010*"send" + 0.009*"email" + 0.008*"please" + 0.007*"internet" + 0.007*"address" + 0.007*"entry" + 0.006*"name"
INFO:gensim.models.ldamodel:topic diff=0.030015, rho=0.252724
INFO:gensim.models.ldamodel:PROGRESS: pass 9, at document #8000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #5 (0.050): 0.017*"cursor" + 0.012*"financial" + 0.012*"saves" + 0.009*"opportunities" + 0.007*"bone" + 0.0

INFO:gensim.models.ldamodel:topic #3 (0.050): 0.020*"pitt" + 0.019*"banks" + 0.019*"gordon" + 0.015*"surrender" + 0.015*"soon" + 0.015*"skepticism" + 0.014*"intellect" + 0.014*"cadre" + 0.013*"shameful" + 0.013*"henrik"
INFO:gensim.models.ldamodel:topic #13 (0.050): 0.035*"space" + 0.014*"nasa" + 0.010*"launch" + 0.008*"earth" + 0.008*"science" + 0.007*"data" + 0.006*"satellite" + 0.006*"orbit" + 0.006*"shuttle" + 0.005*"moon"
INFO:gensim.models.ldamodel:topic diff=0.028612, rho=0.245020
INFO:gensim.models.ldamodel:PROGRESS: pass 10, at document #8000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #17 (0.050): 0.037*"bike" + 0.015*"ride" + 0.011*"riding" + 0.011*"motorcycle" + 0.010*"bikes" + 0.010*"road" + 0.009*"helmet" + 0.008*"paradox" + 0.007*"rider" + 0.006*"motorcycles"
INFO:gensim.models.ldamodel:topic #12 (0.050): 0.015*"objective" + 0.014*"cross" + 0.011*"allocation" + 0.009*"colormap" + 

INFO:gensim.models.ldamodel:topic #17 (0.050): 0.035*"bike" + 0.016*"ride" + 0.012*"motorcycle" + 0.012*"riding" + 0.010*"bikes" + 0.010*"road" + 0.009*"helmet" + 0.009*"rider" + 0.009*"paradox" + 0.007*"motorcycles"
INFO:gensim.models.ldamodel:topic diff=0.027371, rho=0.237981
INFO:gensim.models.ldamodel:PROGRESS: pass 11, at document #8000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #11 (0.050): 0.038*"file" + 0.025*"output" + 0.017*"program" + 0.015*"color" + 0.013*"jpeg" + 0.013*"line" + 0.011*"screen" + 0.011*"image" + 0.011*"mouse" + 0.010*"input"
INFO:gensim.models.ldamodel:topic #10 (0.050): 0.009*"president" + 0.007*"national" + 0.007*"university" + 0.006*"april" + 0.005*"states" + 0.005*"american" + 0.005*"research" + 0.004*"public" + 0.004*"government" + 0.004*"information"
INFO:gensim.models.ldamodel:topic #1 (0.050): 0.044*"jesus" + 0.024*"bible" + 0.022*"church" + 0.021*"christian"

INFO:gensim.models.ldamodel:topic diff=0.026403, rho=0.231515
INFO:gensim.models.ldamodel:PROGRESS: pass 12, at document #8000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #10 (0.050): 0.009*"president" + 0.007*"national" + 0.007*"university" + 0.006*"april" + 0.005*"states" + 0.005*"american" + 0.005*"research" + 0.004*"public" + 0.004*"government" + 0.004*"united"
INFO:gensim.models.ldamodel:topic #7 (0.050): 0.016*"would" + 0.014*"know" + 0.012*"thanks" + 0.012*"like" + 0.012*"anyone" + 0.009*"problem" + 0.008*"please" + 0.008*"need" + 0.008*"card" + 0.007*"could"
INFO:gensim.models.ldamodel:topic #18 (0.050): 0.010*"good" + 0.008*"power" + 0.008*"back" + 0.007*"price" + 0.007*"sale" + 0.006*"used" + 0.006*"cars" + 0.006*"like" + 0.006*"engine" + 0.005*"offer"
INFO:gensim.models.ldamodel:topic #0 (0.050): 0.016*"would" + 0.011*"people" + 0.010*"think" + 0.008*"like" + 0.008*"know" + 0.007*"eve

INFO:gensim.models.ldamodel:topic #13 (0.050): 0.036*"space" + 0.014*"nasa" + 0.008*"launch" + 0.008*"science" + 0.008*"earth" + 0.007*"data" + 0.006*"satellite" + 0.006*"shuttle" + 0.006*"orbit" + 0.005*"moon"
INFO:gensim.models.ldamodel:topic #16 (0.050): 0.013*"available" + 0.012*"window" + 0.012*"windows" + 0.012*"files" + 0.011*"version" + 0.011*"software" + 0.010*"server" + 0.009*"file" + 0.009*"program" + 0.008*"also"
INFO:gensim.models.ldamodel:topic #5 (0.050): 0.017*"cursor" + 0.012*"financial" + 0.011*"saves" + 0.009*"opportunities" + 0.007*"bone" + 0.006*"vernon" + 0.005*"powerplay" + 0.005*"repost" + 0.005*"bounced" + 0.005*"barrel"
INFO:gensim.models.ldamodel:topic #14 (0.050): 0.071*"drive" + 0.034*"disk" + 0.026*"drives" + 0.026*"hard" + 0.020*"controller" + 0.017*"floppy" + 0.015*"tape" + 0.011*"bios" + 0.010*"master" + 0.010*"disks"
INFO:gensim.models.ldamodel:topic diff=0.033251, rho=0.225549
INFO:gensim.models.ldamodel:PROGRESS: pass 13, at document #10000/11314
INF

INFO:gensim.models.ldamodel:topic #12 (0.050): 0.016*"objective" + 0.014*"cross" + 0.011*"allocation" + 0.010*"brain" + 0.010*"colormap" + 0.008*"atheism" + 0.008*"unit" + 0.008*"winners" + 0.007*"column" + 0.007*"morality"
INFO:gensim.models.ldamodel:topic #14 (0.050): 0.072*"drive" + 0.034*"disk" + 0.026*"drives" + 0.026*"hard" + 0.020*"controller" + 0.017*"floppy" + 0.015*"tape" + 0.011*"bios" + 0.010*"master" + 0.010*"disks"
INFO:gensim.models.ldamodel:topic diff=0.032414, rho=0.220022
INFO:gensim.models.ldamodel:PROGRESS: pass 14, at document #10000/11314
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 11314 documents
INFO:gensim.models.ldamodel:topic #14 (0.050): 0.074*"drive" + 0.036*"disk" + 0.027*"drives" + 0.026*"hard" + 0.022*"controller" + 0.019*"tape" + 0.019*"floppy" + 0.013*"disks" + 0.010*"bios" + 0.009*"master"
INFO:gensim.models.ldamodel:topic #15 (0.050): 0.031*"israel" + 0.030*"jews" + 0.021*"israeli" + 0.015*"jewish" + 0.011*"arab" +

(0, '0.017*"would" + 0.012*"people" + 0.010*"think" + 0.008*"like"')
(1, '0.042*"jesus" + 0.025*"christian" + 0.024*"bible" + 0.021*"church"')
(2, '0.012*"part" + 0.011*"plastic" + 0.009*"byte" + 0.007*"outlets"')
(3, '0.027*"henrik" + 0.021*"pitt" + 0.020*"gordon" + 0.019*"banks"')
(4, '0.013*"people" + 0.010*"said" + 0.009*"armenian" + 0.008*"armenians"')
(5, '0.019*"cursor" + 0.014*"financial" + 0.010*"vernon" + 0.008*"bone"')
(6, '0.011*"mail" + 0.010*"information" + 0.010*"list" + 0.010*"send"')
(7, '0.015*"would" + 0.014*"know" + 0.012*"thanks" + 0.012*"like"')
(8, '0.021*"health" + 0.021*"medical" + 0.015*"disease" + 0.013*"patients"')
(9, '0.013*"encryption" + 0.012*"chip" + 0.011*"keys" + 0.010*"clipper"')
(10, '0.009*"president" + 0.007*"national" + 0.006*"university" + 0.006*"april"')
(11, '0.038*"file" + 0.029*"output" + 0.021*"program" + 0.014*"line"')
(12, '0.023*"objective" + 0.012*"cross" + 0.011*"brain" + 0.010*"winners"')
(13, '0.036*"space" + 0.013*"nasa" + 0.009*"sc

#### LDA 시각화

In [52]:
!pip install pyLDAvis



In [53]:
import pyLDAvis
# pyLDAvis 3.x renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell works with either release line.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

pyLDAvis.enable_notebook()
vis = gensim_vis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

#### 문서별 토픽 확인하기

In [54]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: Topic model. It must support ``ldamodel[corpus]`` (yielding
            one result per document) and expose a ``per_word_topics`` flag.
        corpus: Corpus handed to ``ldamodel[corpus]``.

    Returns:
        A pandas DataFrame with one row per document and three positional
        columns: ``0`` the id of the highest-weighted topic, ``1`` that
        topic's weight rounded to 4 decimals, ``2`` the raw per-document
        result returned by the model.

    Note:
        A document for which the model yields no topics produces no row, so
        row positions can drift from document numbers in that case.
    """
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole table
    # on every call.
    rows = []
    for topic_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a sequence whose first
        # item holds the (topic id, weight) pairs.
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        # Only the highest-weighted topic is needed; max() keeps the first
        # pair on ties, matching a stable descending sort.
        dominant = max(doc, key=lambda pair: pair[1], default=None)
        if dominant is None:
            continue
        topic_num, prop_topic = dominant
        rows.append((int(topic_num), round(prop_topic, 4), topic_list))
    return pd.DataFrame(rows, columns=[0, 1, 2])

In [55]:
# Build the per-document topic table; reset_index() turns the row index into
# an extra column that serves as the document number.
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable.head(10)

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,0.0,0.4572,"[(0, 0.45716968), (4, 0.08345449), (15, 0.4456..."
1,1,0.0,0.7558,"[(0, 0.7558315), (1, 0.036971554), (4, 0.03056..."
2,2,0.0,0.5275,"[(0, 0.52745134), (4, 0.030790979), (15, 0.357..."
3,3,0.0,0.4212,"[(0, 0.42123944), (7, 0.09114002), (9, 0.31295..."
4,4,19.0,0.375,"[(0, 0.24494436), (6, 0.11286654), (18, 0.2375..."
5,5,0.0,0.4515,"[(0, 0.4514687), (1, 0.14265071), (8, 0.048209..."
6,6,7.0,0.4752,"[(0, 0.23168655), (2, 0.04168815), (7, 0.47519..."
7,7,0.0,0.6788,"[(0, 0.67875683), (3, 0.017004102), (7, 0.0297..."
8,8,0.0,0.3394,"[(0, 0.33938193), (7, 0.13510261), (8, 0.31704..."
9,9,18.0,0.4774,"[(0, 0.23562187), (4, 0.017261174), (7, 0.1881..."
