## 라이브러리 불러오기

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
import pickle
from kiwipiepy import Kiwi
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models
import openpyxl
import matplotlib.pyplot as plt
# Apply both matplotlib text settings in a single call: the font family used
# for figure text, and whether a Unicode minus glyph is used on axes.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

## 데이터 불러오기

In [None]:
# Location of the Excel workbook to analyse (presumably crawled news
# articles, judging by the file name). Later cells read its 'Content' column.
data_path = './data/뉴스_크롤링.xlsx'
# Load the workbook into a DataFrame.
df = pd.read_excel(data_path, index_col=False)

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
df

## 사용할 토크나이저 생성
- 한국어 형태소분석기인 kiwi 사용

In [None]:
class MyTokenizer:
    """Callable that extracts multi-character noun-like tokens from text.

    Wraps a morphological analyzer object exposing ``tokenize(text)``.
    Each token it yields is read positionally: index 0 is the surface form,
    index 1 the part-of-speech tag and index 3 the token length (this matches
    kiwipiepy's ``Token`` field order form/tag/start/len -- confirm against
    the installed version).
    """

    # POS tags that are kept; every other tag is discarded.
    NOUN_TAGS = frozenset({"NNG", "NNP", "NNB", "NR", "NP"})

    def __init__(self, kiwi):
        """Store the analyzer instance used for tokenization."""
        self.kiwi = kiwi

    def __call__(self, text):
        """Return the surface forms of kept tokens found in ``text``.

        A token is kept when its tag is in ``NOUN_TAGS`` and its length is
        greater than 1. Non-string input (e.g. ``None`` or the NaN that
        pandas produces for empty Excel cells) and empty strings yield an
        empty list instead of being passed to the analyzer.
        """
        if not isinstance(text, str) or not text:
            return []
        return [
            token[0]
            for token in self.kiwi.tokenize(text)
            if token[1] in self.NOUN_TAGS and int(token[3]) > 1
        ]
    
# Tokenizer instance reused by later cells, backed by a Kiwi analyzer
# constructed with no arguments.
mytokenizer = MyTokenizer(Kiwi())

명사 추출

In [None]:
# Tokenize every article body; each row of 'content2nouns' becomes the list
# returned by the tokenizer for that row's 'Content' value.
df['content2nouns'] = df['Content'].apply(mytokenizer)

In [None]:
# Show the DataFrame again, now including the 'content2nouns' column.
df

## 딕셔너리 생성

In [None]:
# Collapse each token list into one string: tokens are joined with ', ' and
# then every comma is stripped, leaving the tokens separated by spaces.
df['content2nouns'] = [
    ', '.join(nouns).replace(',', '')
    for nouns in df['content2nouns']
]

In [None]:
# Split every space-delimited string back into a list of tokens, giving one
# token list per document.
text_data = [document.split() for document in df['content2nouns']]

In [None]:
# Build the gensim dictionary for topic modeling from the tokenized documents.
id2word = corpora.Dictionary(text_data)

# The tokenized documents are used as the texts for topic modeling.
texts = text_data

# Term-document frequencies: one bag-of-words entry per document.
corpus = list(map(id2word.doc2bow, texts))

## topic coherence score 확인

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    is exclusive.

    Args:
        dictionary: gensim dictionary used both as the model's ``id2word``
            mapping and by the coherence model.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents used by the coherence model.
        limit: exclusive upper bound on the number of topics.
        start: first number of topics to try.
        step: increment between successive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple of two lists of equal
        length: the trained models and their coherence scores, in the order
        the topic counts were tried.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the dictionary passed by the caller rather than a notebook
        # global, so the function works with any dictionary/corpus pair.
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        model_list.append(model)
        # Likewise score against the `texts` argument, not a global.
        coherence_model = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherence_model.get_coherence())
    return model_list, coherence_values

In [None]:
# Smallest number of topics to evaluate
start = 1
# Upper bound on the number of topics (exclusive, as it feeds range())
limit = 15
# Increment between successive topic counts
step = 1

model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=text_data, start=start, limit=limit, step=step)

# Plot the coherence score for each number of topics
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The label must be wrapped in a list: a bare string is iterated character
# by character, which would label the line with just "c".
plt.legend(["coherence_values"], loc='best')
plt.show()
# Print the coherence scores; model_number is the index into model_list
for i, (m, cv) in enumerate(zip(x, coherence_values)):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4), "model_number : {}".format(i))

average topic coherence = 전체 topic의 topic coherences를 더한 값을 topic 수로 나눈 값

In [None]:
# Report the average topic coherence of the first model in model_list only
# (the loop exits after one iteration).
for m in model_list:
    top_topics = m.top_topics(corpus)
    # Average over the topics this model actually has; a fixed divisor is
    # wrong for every model whose topic count differs from it.
    avg_topic_coherence = sum(t[1] for t in top_topics) / len(top_topics)
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    print(top_topics)
    break

## lda config

In [None]:
# Number of topics for the final LDA model.
num_topics = 12
# Number of top words taken per topic when printing topics and when
# collecting keywords.
topic_word_num = 10
# Passed to LdaModel as random_state.
seed = 42
# Passed to LdaModel as update_every.
update_every = 1
# Passed to LdaModel as chunksize.
chunksize = 100
# Passed to LdaModel as passes.
passes = 10

In [None]:
# Define and train the final model using the configuration values above
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     random_state=seed,
                     update_every=update_every,
                     chunksize=chunksize,
                     passes=passes,
                     alpha='auto',
                     per_word_topics=True)

# Print the topics
print(lda_model.print_topics(num_words=topic_word_num))
doc_lda = lda_model[corpus]

# Save the model; create the output directory first so that saving does not
# fail when it does not exist yet
os.makedirs('./lda_results', exist_ok=True)
lda_model.save('./lda_results/news_lda_topic_modeling.lda')

In [None]:
# Gather the first element (the word) of each of the top `topic_word_num`
# entries of every topic, in topic order; repeated words are kept.
list_word = [
    entry[0]
    for topic_index in range(num_topics)
    for entry in lda_model.show_topic(topic_index, topic_word_num)
]

In [None]:
# De-duplicate the collected words while keeping their first-seen order.
# Going through set() would give an ordering that can differ between runs,
# making the exported keyword list non-reproducible.
list_word_set = list(dict.fromkeys(list_word))
series_word = pd.Series(list_word_set, name='keywords')

In [None]:
# Write the keyword series to an Excel file, omitting the index column.
series_word.to_excel('./lda_results/news_data_topic_word.xlsx', index=False)

In [None]:
# Prepare the pyLDAvis data for the trained model. sort_topics=False is
# passed; per the pyLDAvis docs this keeps the model's original topic order
# (TODO confirm against the installed version). n_jobs=1 is passed as well.
lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False, n_jobs=1)
# Save the visualisation as an HTML file in the current working directory.
pyLDAvis.save_html(lda_visualization, 'lda_result_vis.html')