# 토픽 모델링

- 문서들의 주제를 찾아내는 기법

### 알고리즘 유형
- 행렬분해 기반과 확률기반 토픽 모델링이 있다.
- LSA
- LDA
- NMF


### 20 Newsgroup 토픽 모델링

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', 'comp.windows.x',
        'talk.politics.mideast', 'soc.religion.christian', 'sci.electronics', 'sci.med']
news_df = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'),
                             categories=cats, random_state=0)

count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2, stop_words='english', ngram_range=(1, 2))
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect)

CountVectorizer Shape:   (0, 93)	1
  (0, 669)	1
  (0, 390)	1
  (0, 148)	1
  (0, 251)	1
  (0, 876)	1
  (0, 70)	1
  (0, 877)	1
  (1, 390)	1
  (1, 428)	1
  (1, 391)	1
  (1, 237)	1
  (1, 607)	1
  (1, 403)	1
  (1, 955)	2
  (1, 512)	2
  (1, 678)	2
  (1, 655)	2
  (1, 881)	2
  (1, 733)	1
  (1, 688)	1
  (1, 23)	1
  (1, 894)	1
  (1, 15)	1
  (1, 12)	1
  :	:
  (7858, 61)	3
  (7858, 864)	2
  (7858, 133)	1
  (7859, 511)	1
  (7859, 528)	1
  (7859, 782)	1
  (7859, 773)	1
  (7859, 54)	1
  (7859, 666)	1
  (7859, 159)	1
  (7859, 387)	1
  (7859, 126)	1
  (7860, 876)	1
  (7860, 70)	1
  (7860, 877)	1
  (7860, 428)	1
  (7860, 678)	1
  (7860, 922)	1
  (7860, 243)	1
  (7860, 795)	1
  (7860, 911)	1
  (7860, 682)	1
  (7860, 909)	1
  (7860, 490)	1
  (7861, 973)	1


In [3]:
lda = LatentDirichletAllocation(n_components=8, random_state=0)
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

In [4]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

#### 각 토픽별 중심단어 확인

In [5]:
def display_topic_words(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('\nTopic #', topic_index)
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes = topic_word_indexes[:no_top_words]
        
        feature_concat = ' '.join([str(feature_names[i]) for i in top_indexes])
        print(feature_concat)


In [6]:
feature_names = count_vect.get_feature_names()
display_topic_words(lda, feature_names, 15)



Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good

Topic # 1
don just like know people said think time ve didn right going say ll way

Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03

Topic # 3
like know don think use does just good time book read information people used post

Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000

Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun

Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life

Topic # 7
use dos thanks windows using window does display help like problem server need know run




#### 개별 문서별 토픽 분포 확인

In [7]:
doc_topics = lda.transform(feat_vect)
print(doc_topics.shape)
print(doc_topics[:3])

(7862, 8)
[[0.01389701 0.01394362 0.01389104 0.48221844 0.01397882 0.01389205
  0.01393501 0.43424401]
 [0.27750436 0.18151826 0.0021208  0.53037189 0.00212129 0.00212102
  0.00212113 0.00212125]
 [0.00544459 0.22166575 0.00544539 0.00544528 0.00544039 0.00544168
  0.00544182 0.74567512]]


In [12]:
def get_filename_list(newsdata):
    filename_list = []
    
    for file in newsdata.filenames:
        filename_temp = file.split('/')[-2:]
        filename = '.'.join(filename_temp)
        filename_list.append(filename)
    
    return filename_list

In [13]:
filename_list = get_filename_list(news_df)
print('filename 개수:', len(filename_list), 'filename list 10개만:', filename_list[:10])

filename 개수: 7862 filename list 10개만: ['soc.religion.christian.20630', 'sci.med.59422', 'comp.graphics.38765', 'comp.graphics.38810', 'sci.med.59449', 'comp.graphics.38461', 'comp.windows.x.66959', 'rec.motorcycles.104487', 'sci.electronics.53875', 'sci.electronics.53617']


In [14]:
import pandas as pd

In [15]:
topic_names = ['Topic #' + str(i) for i in range(0, 8)]
doc_topic_df = pd.DataFrame(data=doc_topics, columns=topic_names, index=filename_list)
doc_topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7
soc.religion.christian.20630,0.013897,0.013944,0.013891,0.482218,0.013979,0.013892,0.013935,0.434244
sci.med.59422,0.277504,0.181518,0.002121,0.530372,0.002121,0.002121,0.002121,0.002121
comp.graphics.38765,0.005445,0.221666,0.005445,0.005445,0.00544,0.005442,0.005442,0.745675
comp.graphics.38810,0.005439,0.005441,0.005449,0.578959,0.00544,0.388387,0.005442,0.005442
sci.med.59449,0.006584,0.552,0.006587,0.408485,0.006585,0.006585,0.006588,0.006585
comp.graphics.38461,0.008342,0.008352,0.182622,0.767314,0.008335,0.008341,0.008343,0.008351
comp.windows.x.66959,0.372861,0.041667,0.37702,0.041668,0.041703,0.041703,0.041667,0.041711
rec.motorcycles.104487,0.225351,0.674669,0.004814,0.07592,0.004812,0.004812,0.004812,0.00481
sci.electronics.53875,0.008944,0.836686,0.008932,0.008941,0.008935,0.109691,0.008932,0.008938
sci.electronics.53617,0.041733,0.04172,0.708081,0.041742,0.041671,0.041669,0.041699,0.041686
