In [1]:
from tqdm import tqdm_notebook 
from konlpy.tag import * 
import MeCab
import string 
import warnings


from gensim import corpora
from gensim import models

import numpy as np
import re
import pickle
import matplotlib.pyplot as plt

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)

mecab = MeCab.Tagger()



In [2]:
def mecab_nouns(text):
    nouns = []
    
    pattern = re.compile('.*\t[A-Z]+')
    
    temp =[tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]  
    for token in temp:
        if token[1] == 'NNG' or token[1] == 'NNP' or token[1] == 'NNB' or token[1] == 'NNBC' or token[1] == 'NP' or token[1] == 'NR':
            nouns.append(token[0])

    return nouns

def mecab_morphs(text):
    morphs = []
    
    pattern = re.compile('.*\t[A-Z]+')

    temp =[tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]  
        
    for token in temp:
        morphs.append(token[0])

    return morphs

def mecab_pos(text):
    pos = []
    pattern = re.compile('.*\t[A-Z]+')
    pos = [tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]
        
    return pos

In [3]:
def read_documents(input_file_name):
    
    corpus = []
    
    with open(input_file_name, 'rb') as f: 
        temp_corpus = pickle.load(f)
    
    for page in temp_corpus: 
        corpus += page 
    
    return corpus

def text_cleaning(docs):
    cleaned_docs = []
    for doc in docs:
        temp_doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc)
        cleaned_docs.append(temp_doc)

    return cleaned_docs

def define_stopwords(path):
    
    SW = set()
    for i in string.punctuation:
        SW.add(i)
    
    with open(path, 'r', encoding='utf-8') as f:
        for word in f:
            SW.add(word)

    return SW


def text_tokenizing(corpus, tokenizer):
    token_corpus = []
    if tokenizer == 'noun':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = mecab_nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)

    elif tokenizer == 'morph':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = mecab_morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
    
    elif tokenizer == 'word':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)    

    return token_corpus

In [4]:
input_file_name = "./naver_news_content.pk"
documents = read_documents(input_file_name)
SW = define_stopwords("f:/data/stopwords-ko.txt")
cleaned_text = text_cleaning(documents)
tokenized_text = text_tokenizing(cleaned_text, tokenizer="morph")

Preprocessing:   0%|          | 0/522 [00:00<?, ?it/s]

In [5]:
print(tokenized_text[3])

['본문', '내용', '플레이어', '플레이어', '오류', '우회', '위한', '함수', '추가', '부터', '까지', '이틀', '화상', '정상', '회의', '진행', '예정', '역내', '백신', '보급', '지원', '강화', '대응', '기금', '올해', '추가', '기여', '아세안', '정상', '대통령', '남방', '정책', '아세안', '실질', '협력', '추진', '높이', '평가', '대통령', '한반도', '완전', '비핵화', '항구', '평화', '정착', '위한', '아세안', '지지', '요청', '문재', '대통령', '청와대', '충무', '에서', '열린', '아세안', '화상', '정상', '회의', '에서', '기념', '촬영', '사진', '연합뉴스', '문재인', '대통령', '오후', '화상', '으로', '개최', '아세안', '정상', '회의', '참석', '이틀', '예정', '아세안', '관련', '정상', '회의', '일정', '시작', '면서', '한국', '아세안', '친구', '로서', '코로나', '함께', '극복', '포용', '지속', '가능', '미래', '함께', '만들', '나갈', '강조', '이날', '회의', '에서', '대통령', '아세안', '정상', '지난', '남방', '정책', '협력', '성과', '종합', '점검', '코로나', '위기', '보다', '미래', '함께', '만들', '나가', '위한', '아세안', '협력', '방향', '대해', '논의', '대통령', '모두', '발언', '에서', '한국', '아세안', '신뢰', '파트너', '로서', '아세안', '함께', '위기', '극복', '포용', '지속', '가능', '미래', '만들', '다는', '의지', '강조', '특히', '대통령', '우리', '나라', '글로벌', '백신', '허브', '으로서', '역내', '백신', '보급', '지원', '강화', '시켜', '나갈', '예정', 

In [6]:
dictionary = corpora.Dictionary(tokenized_text)
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

In [7]:
print(dictionary)

Dictionary(8550 unique tokens: ['가능', '가장', '각국', '감사', '강조']...)


In [8]:
corpus[2]

corpus[2][:5]

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2)]

In [9]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[2][:5]

[(0, 0.018916727108076122),
 (1, 0.020354589699820753),
 (2, 0.031224685165735507),
 (3, 0.029297937295090025),
 (4, 0.03827406104350658)]

In [10]:
# corpus -> corpus_tfidf
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=3, id2word=dictionary)

In [11]:
model.show_topic(0, 10)

[('분기', 0.0027425492),
 ('생산', 0.0023713612),
 ('모더', 0.0015388113),
 ('국내', 0.0014743245),
 ('판매', 0.0014681031),
 ('수출', 0.0014547467),
 ('성장', 0.0014490802),
 ('시설', 0.0013182973),
 ('공급', 0.0012509981),
 ('패스', 0.0012350699)]

In [12]:
topics = model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.003*"분기" + 0.002*"생산" + 0.002*"모더"')
(1, '0.002*"습니다" + 0.002*"확진" + 0.002*"생산"')
(2, '0.002*"생산" + 0.002*"모더" + 0.001*"바이오"')


In [13]:
# 토픽 개수, 키워드 개수를 정해주는 변수를 추가.
NUM_TOPICS = 3

NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    # 문서-단어 행렬 만들어주는 함수.
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    return corpus, dictionary


def print_topic_words(model): # model = LDA된 결과 

    # 토픽 모델링 결과를 출력해 주는 함수.
    print("\nPrinting topic words.\n")

    for topic_id in range(model.num_topics): 
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print('Topic ID: {}'.format(topic_id))

        for topic_word, prob in topic_word_probs:
            print('\t{}\t{}'.format(topic_word, prob))

        print('\n')

# document-term matrix를 만들고,
corpus, dictionary = build_doc_term_mat(tokenized_text)
# LDA를 실행.
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=NUM_TOPICS, id2word=dictionary, alpha='auto', eta='auto')
# 결과를 출력.
print_topic_words(model)

Building document-term matrix.

Printing topic words.

Topic ID: 0
	생산	0.002357980003580451
	모더	0.002135324524715543
	국내	0.0017767532262951136
	바이오	0.0017229878576472402
	삼성	0.0015069014625623822
	센터	0.0015039705904200673
	로직스	0.0014280029572546482
	접종	0.0014229068765416741
	시설	0.0012261108495295048
	도입	0.0012235685717314482
	회분	0.0012074230471625924
	패스	0.001187679823487997
	서울	0.0011673232074826956
	확진	0.0011582880979403853
	공급	0.0010989075526595116
	음성	0.0010847816010937095
	예방	0.0010415466967970133
	검사	0.0009919404983520508
	습니다	0.0009625070379115641
	체육	0.0009584961226209998
	완료	0.0009470569202676415
	화이자	0.0009043641039170325
	위탁	0.0008863279945217073
	구민	0.0008626210619695485
	일상	0.0008500915137119591
	인구	0.0008473736816085875
	시민	0.0008436162024736404
	마포	0.000839454703964293
	으로	0.0008309524855576456
	물량	0.0008304264047183096


Topic ID: 1
	생산	0.0028560312930494547
	습니다	0.002089386573061347
	모더	0.002066203858703375
	바이오	0.0015934871044009924
	로직스	0.0013308562338352203
	삼성	0.00

In [14]:
# pyLDAvis 불러오기
import pyLDAvis
import pyLDAvis.gensim
# pyLDAvis를 jupyter notebook에서 실행할 수 있게 활성화.
pyLDAvis.enable_notebook()

# pyLDAvis 실행.
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data # print X 그냥 실행