# step 1 : 텍스트 데이터 다운로드

In [25]:
from sklearn.datasets import fetch_20newsgroups

# 컴퓨터, 야구, 의학 분야 뉴스 텍스트 데이터
newsgroups = fetch_20newsgroups(
    categories=['comp.graphics', 'rec.sport.baseball', 'sci.med'])

# step 2 : 명사 추출

In [26]:
%%time

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = [pos_tag(word_tokenize(doc)) for doc in newsgroups.data]
noun_list = [[t[0] for t in doc if t[1].startswith('N')] for doc in tagged_list]

Wall time: 21.9 s


# step 3 : 표제어 추출

복수를 단수로 바꿈

In [28]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

noun_list = [[lm.lemmatize(w, pos='n') for w in doc] for doc in noun_list]

# step 4 : 불용어 제거

In [34]:
import re

token_list = [[text.lower() for text in doc] for doc in noun_list]
token_list = [[re.sub("^[A-Za-z]", '', word) for word in doc]
             for doc in token_list]

In [36]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words += ["", "subject", "article", "line", "year", "month", "address",
              "keyword", "msg"]

token_list = [[word for word in doc
              if (word not in stop_words) and (2 < len(word) < 10)]
             for doc in token_list]

# step 5 : 토픽 모델링

In [37]:
from gensim import corpora

dictionary = corpora.Dictionary(token_list)
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in token_list]
doc_term_matrix

[[(0, 1),
  (1, 3),
  (2, 1),
  (3, 3),
  (4, 1),
  (5, 1),
  (6, 3),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 4),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 4),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 2),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(14, 1),
  (33, 1),
  (35, 1),
  (38, 1),
  (39, 1),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1)],
 [(14, 1),
  (35, 1),
  (43, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 2),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 3),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 2),
  (79, 2),
  (80, 1),
  (81, 1),
  (82, 1)],
 [(14, 1),
  (35, 1),
  (36,

In [38]:
%%time
from gensim.models.ldamodel import LdaModel

model = LdaModel(corpus = doc_term_matrix,
                id2word=dictionary,
                num_topics = 3)

Wall time: 2.54 s


In [39]:
model.print_topics()

[(0,
  '0.014*"ines" + 0.013*"ubject" + 0.007*"niversity" + 0.007*"mage" + 0.006*"ear" + 0.006*"rticle" + 0.005*"ile" + 0.005*"rogram" + 0.004*"ime" + 0.004*"nyone"'),
 (1,
  '0.012*"ubject" + 0.011*"ines" + 0.009*"rticle" + 0.008*"ear" + 0.006*"ime" + 0.006*"niversity" + 0.005*"cience" + 0.005*"ame" + 0.005*"mage" + 0.004*"omputer"'),
 (2,
  '0.014*"ubject" + 0.011*"ines" + 0.009*"rticle" + 0.009*"ame" + 0.008*"niversity" + 0.006*"ear" + 0.005*"mage" + 0.005*"eople" + 0.004*"ime" + 0.004*"eam"')]

# step 6 : 토픽 시각화

In [40]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, doc_term_matrix, dictionary)
vis