In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import os, sys  
# Make the project-local helper package importable: it is expected in a
# "Common_Module" directory under the parent of the current working directory.
module_path = os.path.abspath(os.path.join('..'))
# Build the path with os.path.join so the separator is correct on every OS
# (a hard-coded "\\" only yields a valid path on Windows).
common_module_path = os.path.join(module_path, "Common_Module")
# Test for the exact entry being appended; testing module_path instead would
# append a duplicate entry every time this cell is re-run.
if common_module_path not in sys.path:
    sys.path.append(common_module_path)

import CMNLP as CMNLP

# Restrict the corpus to eight newsgroups: motorcycles, baseball, graphics,
# X Window System, Middle East politics, Christianity, electronics and medicine.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Fetch only the categories listed in cats (passed through the `categories`
# argument of fetch_20newsgroups), with headers, footers and quoted replies removed.
# NOTE: despite its name, news_df is the object returned by fetch_20newsgroups,
# not a pandas DataFrame; the raw documents are in news_df.data.
news_df = fetch_20newsgroups(
    categories=cats,
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    random_state=0,
)

# LDA is applied to count-based vectors only, so use CountVectorizer here.
# The vocabulary is capped at 1000 unigrams/bigrams with English stop words removed.
count_vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
    max_df=0.95,
    min_df=2,
)
fect_vect = count_vect.fit_transform(news_df.data)

# Show the sparse document-term matrix followed by its (n_documents, n_terms) shape.
print(fect_vect, fect_vect.shape, sep='\n')

  (0, 93)	1
  (0, 670)	1
  (0, 391)	1
  (0, 148)	1
  (0, 252)	1
  (0, 876)	1
  (0, 70)	1
  (0, 877)	1
  (1, 391)	1
  (1, 429)	1
  (1, 392)	1
  (1, 238)	1
  (1, 608)	1
  (1, 404)	1
  (1, 955)	2
  (1, 513)	2
  (1, 679)	2
  (1, 656)	2
  (1, 881)	2
  (1, 734)	1
  (1, 689)	1
  (1, 23)	1
  (1, 894)	1
  (1, 15)	1
  (1, 12)	1
  :	:
  (7858, 61)	3
  (7858, 864)	2
  (7858, 133)	1
  (7859, 512)	1
  (7859, 529)	1
  (7859, 782)	1
  (7859, 773)	1
  (7859, 54)	1
  (7859, 667)	1
  (7859, 159)	1
  (7859, 388)	1
  (7859, 126)	1
  (7860, 876)	1
  (7860, 70)	1
  (7860, 877)	1
  (7860, 429)	1
  (7860, 679)	1
  (7860, 922)	1
  (7860, 244)	1
  (7860, 795)	1
  (7860, 911)	1
  (7860, 683)	1
  (7860, 909)	1
  (7860, 491)	1
  (7861, 973)	1
(7862, 1000)


In [4]:
# Banner line marking the start of this cell's output.
print('lda\n')

# Fit an 8-topic LDA model on the count matrix (eight newsgroups were selected).
# fit() returns the estimator itself, so construction and fitting can be chained.
lda = LatentDirichletAllocation(n_components=8, random_state=0).fit(fect_vect)

# components_ has one row per topic and one column per vocabulary term.
topic_term = lda.components_
print(topic_term, topic_term.shape, sep='\n')

lda

[[2.46251560e+02 1.18842248e+02 1.51715288e+02 ... 1.00147234e+02
  7.63673375e+01 1.17028758e+02]
 [1.25033020e-01 1.25052288e-01 1.25003012e-01 ... 1.10644583e+02
  1.51405141e-01 5.09788954e+01]
 [1.25103419e-01 1.25075224e-01 1.25082214e-01 ... 6.72008817e+01
  1.25138615e-01 2.48516614e+00]
 ...
 [1.05055615e+02 4.94858011e-01 2.52075927e+01 ... 1.80695744e+01
  1.25115936e-01 8.33321314e+00]
 [1.25147502e-01 2.27058083e+02 5.45176328e+00 ... 1.41751120e+00
  7.67217701e+01 4.49861794e+01]
 [1.25096012e-01 4.05666840e+00 1.25049904e-01 ... 1.63821915e+02
  1.25049991e-01 1.49550227e-01]]
(8, 1000)


In [5]:
# Extract the names of every term in the CountVectorizer vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list of Python strings to keep the printed output and
# the value handed to the helper the same as before.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    # Fallback for scikit-learn < 1.0, where only the old accessor exists.
    feature_names = count_vect.get_feature_names()
print(feature_names)

# Show the 15 most relevant words for each topic.
# NOTE(review): display_topics comes from the project-local CMNLP module, which is
# not visible here; presumably it ranks terms by lda.components_ -- confirm.
CMNLP.display_topics(lda, feature_names, 15)

['00', '000', '01', '02', '03', '04', '05', '10', '100', '11', '12', '128', '13', '14', '15', '16', '17', '18', '19', '1990', '1991', '1992', '1993', '20', '200', '21', '22', '23', '24', '24 bit', '25', '256', '26', '27', '28', '29', '30', '300', '31', '32', '35', '3d', '40', '44', '50', '500', '60', '80', '800', '90', '91', '92', '93', 'ability', 'able', 'ac', 'accept', 'accepted', 'access', 'according', 'act', 'action', 'actions', 'acts', 'actually', 'add', 'added', 'addition', 'address', 'adl', 'advance', 'age', 'ago', 'agree', 'aids', 'al', 'allow', 'american', 'amiga', 'analysis', 'anonymous', 'anonymous ftp', 'answer', 'answers', 'anti', 'anybody', 'apartment', 'apparently', 'appear', 'appears', 'application', 'applications', 'apply', 'appreciate', 'appreciated', 'approach', 'appropriate', 'april', 'arab', 'arabs', 'archive', 'area', 'areas', 'aren', 'argic', 'argument', 'armenia', 'armenian', 'armenians', 'army', 'art', 'article', 'articles', 'ask', 'asked', 'assume', 'attack', 