#### 토픽 모델링
- 머신러닝 기반의 토픽 모델링을 적용해 문서 집합에 숨어 있는 주제를 찾아냄
- 사람이 수행하는 토픽 모델링은 더 함축적인 의미로 문장을 요약하는 것에 반해 머신러닝 기반의 토픽 모델링은 숨겨진 주제를 효과적으로 표현할 수 있는 중심 단어를 함축적으로 추출
- LSA(Latent Sementic Analysis) 와 LDA(Latent Dirichlet Allocation) 기법
 - LSA는 단어-문서행렬(Word-Document Matrix), 단어-문맥행렬(window based co-occurrence matrix) 등 입력 데이터에 특이값 분해를 수행해 데이터의 차원수를 줄여 계산 효율성을 키우면서 행간에 숨어있는(latent) 의미를 이끌어내기 위한 방법론
 - LDA는 미리 알고 있는 주제별 단어수 분포를 바탕으로, 주어진 문서에서 발견된 단어수 분포를 분석, 해당 문서가 어떤 주제들을 함께 다루고 있을지를 예측

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# 모토사이클, 야구, 그래픽스, 윈도우즈, 중동, 기독교, 전자공학, 의학 8개 주제를 추출.
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
        'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
        'sci.electronics', 'sci.med'  ]
# 위에서 cats 변수로 기재된 category만 추출. featch_20newsgroups( )의 
# categories에 cats 입력
news_df = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),
                            categories = cats, random_state=0)
# LDA 는 Count기반의 Vectorizer만 적용
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2,\
                            stop_words='english', ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print(feat_vect.shape)
print(news_df.data[0])
print(feat_vect[0].toarray())



(7862, 1000)


In [19]:
feat_vect.toarray()[0]

0

In [2]:
lda = LatentDirichletAllocation(n_components=8, random_state=0)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [10]:
# argsort() 넘파이 배열의 원소를 오름차순으로 정렬하는 메소드입니다.

import numpy as np
d1 = np.arange(10,25)
print(d1)
d2 = d1.argsort()
print(d2)
topic_word_indexes = d1.argsort()[::-1]
print(topic_word_indexes)
top_indexes=topic_word_indexes[:10]
top_indexes

[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]


array([14, 13, 12, 11, 10,  9,  8,  7,  6,  5], dtype=int64)

In [12]:
def display_topics(model,feature_names,no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        # components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        # top_indexes대상인 index별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출        
feature_names = count_vect.get_feature_names()
# Topic별 가장 연관도가 높은 word를 15개만 추출
display_topics(lda, feature_names, 15)

# cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
#         'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
#         'sci.electronics', 'sci.med'  ]

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


In [5]:
# 설명
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출 
feature_names = count_vect.get_feature_names()
for topic_index, topic in enumerate(lda.components_):
    print('Topic #', topic_index)
    # components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array 
    # index를 반환. 

    topic_word_indexes = topic.argsort()[::-1]
    top_indexes=topic_word_indexes[:15]
    # top_indexes대상인 index별로 feature_names에 해당하는 15개 word feature 
    # 추출 후 공백으로 concat
    feature_concat = ' '.join([feature_names[i] for i in top_indexes])
    print(feature_concat)
   

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


In [None]:
# news20_df = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),random_state=0)
# print(news20_df.keys())
# news20_df.target_names


#### Q. fetch_20newsgroups의 나머지 12개 주제에 대하여 토픽 모델링을 수행하세요.

In [28]:
# 풀이
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cats = ['alt.atheism', 
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc']

news_df1 = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),
                            categories = cats, random_state=0)
# LDA 는 Count기반의 Vectorizer만 적용
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2,\
                            stop_words='english', ngram_range=(1,2))
feat_vect1 = count_vect.fit_transform(news_df1.data)
print(feat_vect1.shape)

(11351, 1000)


In [29]:
lda1 = LatentDirichletAllocation(n_components=12, random_state=0)
lda1.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=12, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [30]:
print(lda1.components_.shape)
lda1.components_

(12, 1000)


array([[4.61616519e+01, 1.11433015e+02, 2.75003954e+01, ...,
        1.79966625e-01, 5.09556541e+01, 4.09990335e+01],
       [8.33363981e-02, 3.89064532e+00, 8.33338221e-02, ...,
        9.87765364e+01, 8.33362623e-02, 7.67117326e+01],
       [6.81891664e+01, 8.33342728e-02, 6.28391463e+00, ...,
        8.33354345e-02, 8.33349832e-02, 8.33338653e-02],
       ...,
       [3.90538872e+00, 5.40268593e+00, 1.55205604e+01, ...,
        1.68604123e+01, 7.47217834e-01, 1.20896552e+01],
       [8.33353978e-02, 5.97407644e+01, 8.33333878e-02, ...,
        2.75009142e+01, 6.42609957e+00, 1.41118925e+01],
       [8.33358855e-02, 3.82923785e+01, 8.33365553e-02, ...,
        6.99634870e+01, 1.19031834e+01, 4.46096937e+01]])

In [31]:
def display_topics(model,feature_names,no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        # components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        # top_indexes대상인 index별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출        
feature_names = count_vect.get_feature_names()
# Topic별 가장 연관도가 높은 word를 15개만 추출
display_topics(lda1, feature_names, 15)

# cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
#         'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
#         'sci.electronics', 'sci.med'  ]

Topic # 0
14 means happened 17 24 deleted building 25 pass 16 54 anti heat unfortunately 30
Topic # 1
kind did run jobs day light people wanted technology game results safety david test logic
Topic # 2
ibm external job printer citizens extra ice form federal option drives bible unfortunately uses private
Topic # 3
people tax did light jobs kind details gas protect technology vs question plan tape batf
Topic # 4
difficult digital tor gm type took 00 network hear opinions involved york gives new prices
Topic # 5
door gets civil mail including scsi signal lines files asking com copy takes process dr
Topic # 6
apr april people tor appropriate attack rules follow generally run software white keys case country
Topic # 7
unfortunately light jobs better users unit necessary details takes did gas technology wire ve seen make
Topic # 8
ftp isn chance cd center certainly equipment believe set past batf lower man lot details
Topic # 9
ways secure unfortunately months des washington external steve 