<a href="https://colab.research.google.com/github/lala991204/ML-self-study/blob/master/8_6_Topic_Modeling_20_Newsgroup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# The eight newsgroups to model, one per expected topic.
cats = [
    'rec.motorcycles',         # motorcycles
    'rec.sport.baseball',      # baseball
    'comp.graphics',           # computer graphics
    'comp.windows.x',          # windowing (X)
    'talk.politics.mideast',   # Middle East politics
    'soc.religion.christian',  # Christianity
    'sci.electronics',         # electronics
    'sci.med',                 # medicine
]

# Load only the categories listed in cats, with headers, footers and quoted
# replies stripped from every post.
news_df = fetch_20newsgroups(
    subset='all',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
    random_state=0,
)

# LDA (Latent Dirichlet Allocation) is applied to count-based features only,
# hence CountVectorizer.
count_vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=1000,
    max_df=0.95,          # float -> proportion: ignore terms found in more than 95% of documents
    min_df=2,             # int -> count: ignore terms found in fewer than 2 documents
)

# Learn the vocabulary and build the document-term count matrix in one step.
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape: ', feat_vect.shape)

CountVectorizer Shape:  (7862, 1000)


In [4]:
# Fit an LDA model with 8 topics on the document-term count matrix.
lda = LatentDirichletAllocation(
    n_components=8,
    random_state=0,
)
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

In [5]:
# components_ holds, for each individual topic, a value per word feature that
# indicates how heavily that word was assigned to the topic. Here that is one
# row per topic (8) and one column per vocabulary term (1000).
print(lda.components_.shape)
lda.components_             

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [6]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    For each row of ``model.components_`` a ``Topic # <index>`` header is
    printed, followed by one line with the top words separated by spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic weight
            arrays that support ``argsort()``.
        feature_names: Indexable collection mapping a column index of
            ``components_`` to its word.
        no_top_words: How many of the highest-weighted words to print per topic.
    """
    for topic_index, weights in enumerate(model.components_):
        print('Topic #', topic_index)

        # argsort() ranks ascending, so reverse it to get the column indexes
        # ordered from the largest weight to the smallest.
        descending = weights.argsort()[::-1]

        # Look up the word for each of the leading indexes and print them
        # joined by single spaces.
        best_words = (feature_names[col] for col in descending[:no_top_words])
        print(' '.join(best_words))

# Extract the names of all word features held by the fitted CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert it so feature_names
    # stays a plain list as before.
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()

# Show only the 15 most strongly associated words for each topic.
display_topics(lda, feature_names, 15)

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run




In [7]:
# List the whole vocabulary learned by the fitted CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert it so the displayed
    # value stays a plain list of strings.
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
feature_names



['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '10',
 '100',
 '11',
 '12',
 '128',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1990',
 '1991',
 '1992',
 '1993',
 '20',
 '200',
 '21',
 '22',
 '23',
 '24',
 '24 bit',
 '25',
 '256',
 '26',
 '27',
 '28',
 '29',
 '30',
 '300',
 '31',
 '32',
 '35',
 '3d',
 '40',
 '44',
 '50',
 '500',
 '60',
 '80',
 '800',
 '90',
 '91',
 '92',
 '93',
 'ability',
 'able',
 'ac',
 'accept',
 'accepted',
 'access',
 'according',
 'act',
 'action',
 'actions',
 'acts',
 'actually',
 'add',
 'added',
 'addition',
 'address',
 'adl',
 'advance',
 'age',
 'ago',
 'agree',
 'aids',
 'al',
 'allow',
 'american',
 'amiga',
 'analysis',
 'anonymous',
 'anonymous ftp',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apartment',
 'apparently',
 'appear',
 'appears',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'arabs',
 'archive',
 'area',
 'areas',
 'aren',
 'argic',
 'argument',
 'arm