#### 토픽 모델링
- 머신러닝 기반의 토픽 모델링을 적용해 문서 집합에 숨어 있는 주제를 찾아냄
- 사람이 수행하는 토픽 모델링은 더 함축적인 의미로 문장을 요약하는 것에 반해 머신러닝 기반의 토픽 모델링은 숨겨진 주제를 효과적으로 표현할 수 있는 중심 단어를 함축적으로 추출
- LSA(Latent Sementic Analysis) 와 LDA(Latent Dirichlet Allocation) 기법
 - LSA는 단어-문서행렬(Word-Document Matrix), 단어-문맥행렬(window based co-occurrence matrix) 등 입력 데이터에 특이값 분해를 수행해 데이터의 차원수를 줄여 계산 효율성을 키우면서 행간에 숨어있는(latent) 의미를 이끌어내기 위한 방법론
 - LDA는 미리 알고 있는 주제별 단어수 분포를 바탕으로, 주어진 문서에서 발견된 단어수 분포를 분석, 해당 문서가 어떤 주제들을 함께 다루고 있을지를 예측

In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# 모토사이클, 야구, 그래픽스, 윈도우즈, 중동, 기독교, 전자공학, 의학 8개 주제를 추출.
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
        'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
        'sci.electronics', 'sci.med'  ]
# 위에서 cats 변수로 기재된 category만 추출. featch_20newsgroups( )의 
# categories에 cats 입력

news_df = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),
                            categories = cats, random_state=0)
print(news_df.keys())
# LDA 는 Count기반의 Vectorizer만 적용
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2,\
                            stop_words='english', ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print(feat_vect.shape)


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
(7862, 1000)


In [12]:
print(news_df.data[0])

I appreciate if anyone can point out some good books about the dead sea
scrolls of Qumran. Thanks in advance.


In [13]:
# feat_vect.toarray()[0]
print(feat_vect[0].toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [14]:
lda = LatentDirichletAllocation(n_components=8, random_state=0)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [15]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [35]:
# argsort() 넘파이 배열의 원소를 오름차순으로 정렬하는 메소드입니다.

import numpy as np
d1 = np.arange(10,25)
print(d1)
d2 = topic.argsort()
print(d2)
topic_word_indexes = topic.argsort()[::-1]
print(topic_word_indexes)
top_indexes=topic_word_indexes[:15]
top_indexes

[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]


array([14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0],
      dtype=int64)

In [22]:
# 설명
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출 
feature_names = count_vect.get_feature_names()
for topic_index, topic in enumerate(lda.components_):
    print('Topic #', topic_index)
    # argsort()[::-1] : components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
#     topic_word_indexes = topic.argsort()
    topic_word_indexes = topic.argsort()[::-1]
    top_indexes=topic_word_indexes[:15]
    # top_indexes대상인 index별로 feature_names에 해당하는 15개 word feature 추출 후 공백으로 concat
    feature_concat = ' '.join([feature_names[i] for i in top_indexes])
    print(feature_concat)
   

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


In [16]:
# 토픽별로 연관도가 높은 순으로 word 나열
def display_topics(model,feature_names,no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        # components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        # top_indexes대상인 index별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출        
feature_names = count_vect.get_feature_names()
# Topic별 가장 연관도가 높은 word를 15개만 추출
display_topics(lda, feature_names, 15)

# cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
#         'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
#         'sci.electronics', 'sci.med'  ]

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


In [None]:
# news20_df = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),random_state=0)
# print(news20_df.keys())
# news20_df.target_names


#### Q. fetch_20newsgroups으로 다음 작업을 수행하세요.
- TfidfVectorizer 방식으로 벡터 처리하고 lr 알고리즘으로 precision을 포함하여 평가하세요. 
- precision 기준으로 평가지수가 높은순으로 5개 그룹을 선정하여 토픽 모델링을 수행하세요.
- 텍스트 분류 예측 정밀도와 그룹별 토픽 모델링 성능간의 상관관계를 기술하세요.

In [17]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset='all', random_state=156)

In [18]:
import pandas as pd
print(news_data.target_names)
# 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'talk.politics.mideast'
# print(news_data.target[10])
# print(news_data.data[10])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [19]:
train_news = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'),
                  random_state=156)
X_train = train_news.data
y_train = train_news.target
test_news = fetch_20newsgroups(subset='test',remove=('header','footers','quotes'),
                              random_state=156)
X_test = test_news.data
y_test = test_news.target

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect =TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
lr_pred = lr_clf.predict(X_test_tfidf_vect)
print(accuracy_score(y_test, lr_pred))
rp = metrics.classification_report(y_test,lr_pred)
print(rp)

0.710169941582581
              precision    recall  f1-score   support

           0       0.63      0.47      0.54       319
           1       0.57      0.76      0.65       389
           2       0.67      0.71      0.69       394
           3       0.73      0.61      0.66       392
           4       0.84      0.64      0.73       385
           5       0.73      0.70      0.71       395
           6       0.58      0.87      0.70       390
           7       0.90      0.66      0.76       396
           8       0.77      0.81      0.79       398
           9       0.87      0.79      0.83       397
          10       0.89      0.92      0.90       399
          11       0.86      0.79      0.82       396
          12       0.44      0.70      0.54       393
          13       0.81      0.72      0.76       396
          14       0.68      0.86      0.76       394
          15       0.71      0.74      0.72       398
          16       0.62      0.77      0.69       364
         

In [66]:
news_df = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),
                            random_state=0)
news_df.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [28]:
# 풀이
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# cats = ['comp.sys.mac.hardware',
#         'comp.windows.x',
#         'rec.sport.baseball',
#         'rec.sport.hockey',
#         'misc.forsale']
cats = [ 'rec.autos','rec.sport.baseball','rec.sport.hockey','sci.crypt','comp.sys.mac.hardware']
news_df1 = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'),
                            categories = cats, random_state=0)
# LDA 는 Count기반의 Vectorizer만 적용
count_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2,\
                            stop_words='english', ngram_range=(1,2))
feat_vect1 = count_vect.fit_transform(news_df1.data)
print(feat_vect1.shape)

(4937, 1000)


In [29]:
lda1 = LatentDirichletAllocation(n_components=5, random_state=0)
lda1.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [30]:
print(lda1.components_.shape)
lda1.components_

(5, 1000)


array([[3.00733268e+02, 1.78078850e+02, 1.59211399e+02, ...,
        7.02379821e+01, 1.36644481e+02, 6.87745839e+01],
       [2.04575909e-01, 1.52846824e+00, 2.00809893e-01, ...,
        2.27914060e+02, 2.01776472e-01, 1.14964570e+02],
       [1.67786133e+02, 1.31913391e+01, 2.31865363e+01, ...,
        4.66539424e+01, 1.18866643e+01, 1.29867260e+01],
       [2.01405825e-01, 2.01590471e-01, 2.00301376e-01, ...,
        1.36484532e+02, 6.67487338e-01, 2.54274203e+01],
       [6.07461685e+00, 3.27999753e+02, 2.00953890e-01, ...,
        2.07094842e+01, 1.46599590e+02, 2.98467003e+01]])

In [31]:
def display_topics(model,feature_names,no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        # components_ array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array index를 반환. 
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        # top_indexes대상인 index별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
# CountVectorizer객체내의 전체 word들의 명칭을 get_features_names( )를 통해 추출        
feature_names = count_vect.get_feature_names()
# Topic별 가장 연관도가 높은 word를 15개만 추출
display_topics(lda1, feature_names, 15)

# cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', \
#         'comp.windows.x', 'talk.politics.mideast', 'soc.religion.christian',\
#         'sci.electronics', 'sci.med'  ]

Topic # 0
years 06 trust form feel needs talk manual years ago try great night 1992 1993 happened
Topic # 1
defense jays jose lemieux performance systems usa talk round result run court video ford course
Topic # 2
division exactly trust hockey press ftp agency cards week server faster issues use scorer example
Topic # 3
following performance days systems internet business answer product btw run jose buf defense pitchers jays
Topic # 4
_____ instead ability islanders performance time involved insurance friend various design throw 87 round __


In [None]:
cats = [ 'rec.autos','rec.sport.baseball','rec.sport.hockey','sci.crypt','comp.sys.mac.hardware']