# **20 Newsgroup 토픽 모델링**

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# Restrict the corpus to these 8 newsgroup categories
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Load every split ('all') and strip headers, footers and quoted replies
# so that only the message body is left.
news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=0,
)

# Build the count-based document-term matrix that is later fed to LDA:
# at most 1000 features, unigrams and bigrams, English stop words removed,
# and terms appearing in more than 95% of the documents ignored.
count_vect = CountVectorizer(
    max_df=0.95,
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
)
feat_vect = count_vect.fit_transform(news_df.data)
feat_vect.shape

(7862, 1000)

# LDA 객체 생성

In [4]:
# Fit one topic per selected newsgroup category. The topic count is derived
# from `cats` so it stays in sync if the category list is edited.
lda = LatentDirichletAllocation(n_components=len(cats), random_state=0)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

# 각 토픽 모델링 주제별 단어들의 연관도 

lda.components_ : 각 토픽(행)에 각 단어(열)가 할당된 정도를 나타내는 의사 빈도(pseudo-count) 값으로, 정규화되지 않은 값임. 값이 클수록 해당 토픽에서 그 단어의 비중이 크며, 각 행을 행의 합으로 나누면 토픽별 단어 분포가 됨

In [5]:
# Print the shape of the fitted components_ matrix, then display the matrix itself
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[2.22405283e+02, 1.77372464e+02, 6.86914161e+01, ...,
        3.48628528e+01, 9.02767098e+01, 7.88163871e+01],
       [1.25116973e-01, 1.42319956e+01, 1.25016947e-01, ...,
        7.29092885e+01, 1.27731985e-01, 9.49697982e+01],
       [1.61064188e+02, 1.25052940e-01, 1.01576563e+02, ...,
        1.35947641e-01, 2.73125240e+01, 1.25018926e-01],
       ...,
       [1.14656870e+01, 4.58796342e+00, 1.21068031e+01, ...,
        2.40357886e+01, 1.25084161e-01, 1.25873676e+01],
       [1.25047046e-01, 1.61546077e-01, 1.25007292e-01, ...,
        9.21887199e+01, 2.62926370e-01, 3.95761177e+01],
       [1.25079735e-01, 3.65902971e-01, 1.25050000e-01, ...,
        2.44115791e+02, 1.25085873e-01, 5.09239627e-01]])

# 각 토픽별 중심 단어 확인

In [14]:
def display_topic_words(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a header line with the topic
    index is printed, followed by one line of ``word:weight`` pairs joined
    with ``' + '``. Weights are rounded to one decimal place.

    Args:
        model: Object exposing a ``components_`` 2-D array (one row per topic).
        feature_names: Indexable collection mapping a column index to its word.
        num_top_words: Number of leading words to print for each topic.
    """
    for topic_index, weights in enumerate(model.components_):
        print('\nTopic :', topic_index)

        # Column indexes ordered from the largest weight to the smallest,
        # truncated to the requested number of words.
        descending = weights.argsort()[::-1]
        selected = descending[:num_top_words]

        # Render each selected column as "<word>:<rounded weight>".
        pieces = []
        for col in selected:
            word = str(feature_names[col])
            score = str(round(weights[col], 1))
            pieces.append(word + ':' + score)

        print(' + '.join(pieces))

In [8]:
# Fetch the names of all word features held by the CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old versions.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

In [15]:
# Print the 15 most strongly associated words for each topic
display_topic_words(lda, feature_names, 15)


Topic : 0
year:743.0 + 10:586.4 + game:478.6 + medical:411.7 + health:377.9 + team:348.7 + 12:346.3 + years:336.4 + 20:336.0 + disease:332.2 + games:322.0 + cancer:319.8 + patients:303.1 + 92:291.8 + 1993:283.6

Topic : 1
said:946.0 + people:678.3 + know:669.3 + didn:604.2 + just:526.7 + don:515.6 + went:470.3 + did:427.2 + time:418.3 + like:412.4 + told:406.8 + came:385.8 + say:371.3 + going:364.2 + saw:351.5

Topic : 2
image:1018.9 + file:990.4 + jpeg:799.1 + program:606.2 + color:555.8 + output:476.4 + gif:466.8 + format:432.0 + images:425.6 + files:414.8 + bit:412.0 + entry:385.1 + use:374.1 + 03:259.9 + display:258.8

Topic : 3
edu:1286.9 + graphics:831.5 + information:738.6 + mail:689.0 + data:580.5 + available:523.5 + software:490.3 + send:458.7 + ftp:441.7 + computer:436.3 + list:425.4 + com:407.3 + thanks:369.5 + pub:365.1 + info:312.9

Topic : 4
armenian:854.6 + israel:813.7 + turkish:686.1 + jews:681.3 + armenians:582.8 + people:547.6 + israeli:476.1 + jewish:455.0 + govern

# **개별 문서별 토픽 분포**

In [16]:
# Calling transform on the fitted lda object returns the topic distribution of each document
doc_topics = lda.transform(feat_vect)
doc_topics.shape

(7862, 8)

In [17]:
# Preview the topic distribution rows of the first three documents
doc_topics[:3]

array([[0.01390226, 0.15703927, 0.01389278, 0.75951861, 0.0139043 ,
        0.01389988, 0.01391705, 0.01392584],
       [0.33838215, 0.12597903, 0.00212104, 0.14779464, 0.00212181,
        0.00212043, 0.00212179, 0.3793591 ],
       [0.0054428 , 0.00544057, 0.19485596, 0.00543966, 0.00543947,
        0.00544808, 0.00543927, 0.77249419]])

news_df.filenames 속성은 모든 문서의 파일 경로를 담고 있음. 경로의 마지막 두 요소(뉴스그룹명/문서 번호)만 추출하여 문서 식별자로 사용함

In [41]:
# Build a short, readable label for every document from its file path
def get_filename_list(newsdata):
    """Return a ``'<parent dir>: <file name>'`` label for each document path.

    Only the last two path components of every entry in
    ``newsdata.filenames`` are kept and joined with ``': '``. A path with a
    single component is returned unchanged.

    Args:
        newsdata: Object exposing a ``filenames`` iterable of path strings.

    Returns:
        list[str]: One label per entry, in the same order as ``filenames``.
    """
    filename_list = []

    for file in newsdata.filenames:
        # Normalise Windows back-slashes first so that the split works
        # regardless of the platform the paths were produced on.
        parts = str(file).replace('\\', '/').split('/')
        filename_list.append(': '.join(parts[-2:]))

    return filename_list

In [42]:
# Build the document labels, then print how many there are and the first 10
filename_list = get_filename_list(news_df)
print("filename 개수:",len(filename_list))
print("filename list 10개:",filename_list[:10])

filename 개수: 7862
filename list 10개: ['soc.religion.christian: 20630', 'sci.med: 59422', 'comp.graphics: 38765', 'comp.graphics: 38810', 'sci.med: 59449', 'comp.graphics: 38461', 'comp.windows.x: 66959', 'rec.motorcycles: 104487', 'sci.electronics: 53875', 'sci.electronics: 53617']


In [43]:
import pandas as pd

# One column label per topic. The count is taken from the width of
# doc_topics rather than hard-coded, so the labels always match the matrix.
topic_names = ['Topic #' + str(i) for i in range(doc_topics.shape[1])]

# Rows are documents (labelled by filename_list), columns are topics.
doc_topic_df = pd.DataFrame(data=doc_topics, columns=topic_names, index=filename_list)
doc_topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7
soc.religion.christian: 20630,0.013902,0.157039,0.013893,0.759519,0.013904,0.0139,0.013917,0.013926
sci.med: 59422,0.338382,0.125979,0.002121,0.147795,0.002122,0.00212,0.002122,0.379359
comp.graphics: 38765,0.005443,0.005441,0.194856,0.00544,0.005439,0.005448,0.005439,0.772494
comp.graphics: 38810,0.005439,0.005439,0.108926,0.387692,0.00544,0.005443,0.005444,0.476177
sci.med: 59449,0.006583,0.15968,0.006609,0.006604,0.006585,0.006585,0.00659,0.800764
comp.graphics: 38461,0.00834,0.008342,0.173868,0.00835,0.008335,0.008342,0.008344,0.77608
comp.windows.x: 66959,0.36812,0.041669,0.381762,0.041704,0.041708,0.041704,0.041667,0.041667
rec.motorcycles: 104487,0.236499,0.004811,0.004812,0.004813,0.004812,0.004809,0.004812,0.734632
sci.electronics: 53875,0.008944,0.420458,0.008933,0.155856,0.008935,0.008942,0.008932,0.379
sci.electronics: 53617,0.041734,0.041678,0.7081,0.041667,0.041671,0.041676,0.041708,0.041766
