In [1]:
from tqdm import tqdm_notebook 
from konlpy.tag import * 
import MeCab
import string 
import warnings


from gensim import corpora
from gensim import models

import numpy as np
import re
import pickle
import matplotlib.pyplot as plt

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)

mecab = MeCab.Tagger()



In [2]:
def mecab_nouns(text):
    nouns = []
    
    pattern = re.compile('.*\t[A-Z]+')
    
    temp =[tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]  
    for token in temp:
        if token[1] == 'NNG' or token[1] == 'NNP' or token[1] == 'NNB' or token[1] == 'NNBC' or token[1] == 'NP' or token[1] == 'NR':
            nouns.append(token[0])

    return nouns

def mecab_morphs(text):
    morphs = []
    
    pattern = re.compile('.*\t[A-Z]+')

    temp =[tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]  
        
    for token in temp:
        morphs.append(token[0])

    return morphs

def mecab_pos(text):
    pos = []
    pattern = re.compile('.*\t[A-Z]+')
    pos = [tuple(pattern.match(token).group(0).split('\t')) for token in mecab.parse(text).splitlines()[:-1]]
        
    return pos

In [3]:
def read_documents(input_file_name):
    
    corpus = []
    
    with open(input_file_name, 'rb') as f: 
        temp_corpus = pickle.load(f)
    
    for page in temp_corpus: 
        corpus += page 
    
    return corpus

def text_cleaning(docs):
    cleaned_docs = []
    for doc in docs:
        temp_doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc)
        cleaned_docs.append(temp_doc)

    return cleaned_docs

def define_stopwords(path):
    
    SW = set()
    for i in string.punctuation:
        SW.add(i)
    
    with open(path, 'r', encoding='utf-8') as f:
        for word in f:
            SW.add(word)

    return SW


def text_tokenizing(corpus, tokenizer):
    token_corpus = []
    if tokenizer == 'noun':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = mecab_nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)

    elif tokenizer == 'morph':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = mecab_morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
    
    elif tokenizer == 'word':
        for n in tqdm_notebook(range(len(corpus)), desc='Preprocessing'):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)    

    return token_corpus

In [8]:
# 수정
import pickle

with open('naver_news_title.pk', 'rb') as f:
    data = pickle.load(f)

# print(data)
type(data)

list

In [None]:
# 수정
with open('naver_blog_content.pk', 'wb') as f:
    pickle.dump(naver_news_title, f)

In [9]:
input_file_name = "./naver_blog_content.txt"
documents = read_documents(input_file_name)
SW = define_stopwords("f:/data/stopwords-ko.txt")
cleaned_text = text_cleaning(documents)
tokenized_text = text_tokenizing(cleaned_text, tokenizer="morph")

UnpicklingError: invalid load key, '\xec'.

In [None]:
print(tokenized_text[3])

['본문', '내용', '플레이어', '플레이어', '오류', '우회', '위한', '함수', '추가', '부터', '까지', '이틀', '화상', '정상', '회의', '진행', '예정', '역내', '백신', '보급', '지원', '강화', '대응', '기금', '올해', '추가', '기여', '아세안', '정상', '대통령', '남방', '정책', '아세안', '실질', '협력', '추진', '높이', '평가', '대통령', '한반도', '완전', '비핵화', '항구', '평화', '정착', '위한', '아세안', '지지', '요청', '문재', '대통령', '청와대', '충무', '에서', '열린', '아세안', '화상', '정상', '회의', '에서', '기념', '촬영', '사진', '연합뉴스', '문재인', '대통령', '오후', '화상', '으로', '개최', '아세안', '정상', '회의', '참석', '이틀', '예정', '아세안', '관련', '정상', '회의', '일정', '시작', '면서', '한국', '아세안', '친구', '로서', '코로나', '함께', '극복', '포용', '지속', '가능', '미래', '함께', '만들', '나갈', '강조', '이날', '회의', '에서', '대통령', '아세안', '정상', '지난', '남방', '정책', '협력', '성과', '종합', '점검', '코로나', '위기', '보다', '미래', '함께', '만들', '나가', '위한', '아세안', '협력', '방향', '대해', '논의', '대통령', '모두', '발언', '에서', '한국', '아세안', '신뢰', '파트너', '로서', '아세안', '함께', '위기', '극복', '포용', '지속', '가능', '미래', '만들', '다는', '의지', '강조', '특히', '대통령', '우리', '나라', '글로벌', '백신', '허브', '으로서', '역내', '백신', '보급', '지원', '강화', '시켜', '나갈', '예정', 

In [None]:
dictionary = corpora.Dictionary(tokenized_text)
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

In [None]:
print(dictionary)

Dictionary(8550 unique tokens: ['가능', '가장', '각국', '감사', '강조']...)


In [None]:
corpus[2]

corpus[2][:5]

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2)]

In [None]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[2][:5]

[(0, 0.018916727108076122),
 (1, 0.020354589699820753),
 (2, 0.031224685165735507),
 (3, 0.029297937295090025),
 (4, 0.03827406104350658)]

In [None]:
model = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary)

In [None]:
model.show_topic(0, 10)

[('접종', 0.020051062),
 ('으로', 0.019967519),
 ('백신', 0.019095084),
 ('에서', 0.015767338),
 ('코로나', 0.012068896),
 ('분기', 0.009438207),
 ('내용', 0.008693252),
 ('플레이어', 0.0074932147),
 ('본문', 0.0071215406),
 ('추가', 0.0057802196)]

In [None]:
# 토픽 개수, 키워드 개수를 정해주는 변수를 추가.
NUM_TOPICS = 4

NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    # 문서-단어 행렬 만들어주는 함수.
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    return corpus, dictionary


def print_topic_words(model): # model = LDA된 결과 

    # 토픽 모델링 결과를 출력해 주는 함수.
    print("\nPrinting topic words.\n")

    for topic_id in range(model.num_topics): 
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print('Topic ID: {}'.format(topic_id))

        for topic_word, prob in topic_word_probs:
            print('\t{}\t{}'.format(topic_word, prob))

        print('\n')

# document-term matrix를 만들고,
corpus, dictionary = build_doc_term_mat(tokenized_text)
# LDA를 실행.
model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha='auto', eta='auto')
# 결과를 출력.
print_topic_words(model)

Building document-term matrix.

Printing topic words.

Topic ID: 0
	접종	0.02667245641350746
	백신	0.02227579988539219
	에서	0.015130601823329926
	으로	0.014787590131163597
	코로나	0.013493729755282402
	확진	0.008282027207314968
	본문	0.0081480098888278
	내용	0.007253016345202923
	추가	0.0065884944051504135
	위한	0.006515545770525932
	플레이어	0.006273551378399134
	국내	0.0058905137702822685
	지난	0.004606188740581274
	다고	0.004552157130092382
	함수	0.004400155507028103
	모더	0.004336241632699966
	생산	0.004326247610151768
	완료	0.0042564296163618565
	부터	0.004146397113800049
	까지	0.0041446140967309475
	습니다	0.003918240778148174
	시설	0.0039043817669153214
	오류	0.003830363042652607
	정부	0.0037486127112060785
	일상	0.003717771265655756
	도입	0.003627997590228915
	감염	0.0032500941306352615
	우회	0.003192411968484521
	면서	0.0031096315942704678
	분기	0.003056736895814538


Topic ID: 1
	접종	0.01758766546845436
	코로나	0.016824405640363693
	으로	0.013582788407802582
	에서	0.01186371874064207
	백신	0.01169577706605196
	내용	0.007411159574985504
	다고	0.0068349

In [None]:
# pyLDAvis 불러오기
import pyLDAvis
import pyLDAvis.gensim
# pyLDAvis를 jupyter notebook에서 실행할 수 있게 활성화.
pyLDAvis.enable_notebook()

# pyLDAvis 실행.
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data # print X 그냥 실행