## 라이브러리 불러오기

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
import pickle
from kiwipiepy import Kiwi
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models
import openpyxl
import matplotlib.pyplot as plt
# Use the Malgun Gothic font for plot text (presumably so Korean labels render; confirm the font is installed on the machine).
plt.rcParams['font.family'] ='Malgun Gothic'
# Disable Matplotlib's unicode-minus setting for axis labels.
plt.rcParams['axes.unicode_minus'] =False

## 데이터 불러오기

In [None]:
# Location of the Excel workbook that is loaded below.
data_path = './data/뉴스_크롤링.xlsx'
# Read the workbook into a DataFrame; its 'Content' column is tokenized later in this file.
df = pd.read_excel(data_path, index_col=False)

In [None]:
# Show the loaded DataFrame as the cell output.
df

## 사용할 토크나이저 생성
- 한국어 형태소분석기인 kiwi 사용

In [None]:
class MyTokenizer:
    """Callable that keeps only noun-like tokens produced by a Kiwi analyzer.

    Calling an instance runs ``kiwi.tokenize`` on the text and returns
    element 0 of every token whose element 1 (the tag) is one of
    NNG, NNP, NNB, NR or NP and whose element 3 is greater than 1.
    In kiwipiepy's ``Token`` those positions are presumably form, tag and
    length respectively, so one-character tokens are dropped -- verify
    against the installed kiwipiepy version.
    """

    def __init__(self, kiwi):
        """Remember the analyzer object whose ``tokenize`` method is used."""
        self.kiwi = kiwi

    def __call__(self, text):
        """Return the list of kept token forms for ``text``, in token order."""
        kept_tags = ("NNG", "NNP", "NNB", "NR", "NP")
        # The tag test comes first so element 3 is only converted for tokens
        # that already passed the tag filter.
        return [
            tok[0]
            for tok in self.kiwi.tokenize(text)
            if tok[1] in kept_tags and int(tok[3]) > 1
        ]
    
# Build the tokenizer instance used below, wrapping a default-constructed Kiwi analyzer.
mytokenizer = MyTokenizer(Kiwi())

명사 추출

In [None]:
# Tokenize every article body; mytokenizer is itself callable, so it is passed to apply directly.
df['content2nouns'] = df['Content'].apply(mytokenizer)

In [None]:
# Show the DataFrame again, now including the 'content2nouns' column.
df

## 딕셔너리 생성

In [None]:
# Collapse each token list into one space-separated string.
# Joining with a space directly replaces the former "join with ', ' then strip
# every comma" detour, which also deleted commas that were part of a token.
df['content2nouns'] = df['content2nouns'].apply(' '.join)

In [None]:
# Split each space-joined document back into a list of tokens (one list per row).
text_data = [document.split() for document in df['content2nouns']]

In [None]:
# Show the tokenized documents as the cell output.
text_data

In [None]:
# Build the gensim Dictionary from the tokenized documents; it is passed to LdaModel as id2word.
dictionary = corpora.Dictionary(text_data)

# Build the corpus by converting every document with doc2bow.
corpus = list(map(dictionary.doc2bow, text_data))

## topic perplexity 확인
- 혼란도
- 모델이 얼마나 정확하게 예측하는지를 확인함
- 낮을수록 정확하게 예측함

In [None]:
def compute_perplexity(dictionary, corpus, start, limit, step):
    """Train one LDA model per topic count and return each model's perplexity.

    Args:
        dictionary: gensim Dictionary passed to ``LdaModel`` as ``id2word``.
        corpus: Bag-of-words corpus used both for training and evaluation.
        start: First number of topics to try.
        limit: Exclusive upper bound on the number of topics.
        step: Increment between consecutive topic counts.

    Returns:
        list: One perplexity value per topic count, in the order of
        ``range(start, limit, step)``. Lower values are better. The list is
        empty when the range is empty.
    """
    perplexity_values = []

    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=42)
        # log_perplexity() returns the per-word likelihood bound, not the
        # perplexity itself; gensim defines perplexity as 2 ** (-bound).
        # Converting here makes the "lower is better" reading valid.
        bound = model.log_perplexity(corpus)
        perplexity_values.append(2 ** (-bound))

    return perplexity_values

In [None]:
# Smallest number of topics to try
start=1
# Exclusive upper bound on the number of topics
limit=15
# Increment between consecutive topic counts
step=1

perplexity_values = compute_perplexity(dictionary=dictionary, corpus=corpus, start=start, limit=limit, step=step)

# Plot the score against the number of topics
x = range(start, limit, step)
plt.plot(x, perplexity_values)
plt.xlabel("Number Of Topics")
plt.ylabel("Perplexity score")
plt.show()

# Print the scores. Each value is paired with the topic count it was computed
# for (taken from x); a plain enumerate() would label them 0, 1, 2, ... and be
# off from the real topic counts whenever start != 0 or step != 1.
for n, pv in zip(x, perplexity_values):
    print("Number Of Topics =", n, " has perplexity Value of", round(pv, 4))

## topic coherence score 확인
- 토픽이 얼마나 일관성 있는지를 판단
- 높을수록 의미론적 일관성이 있음 

In [None]:
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """Train one LDA model per topic count and return each model's c_v coherence.

    Args:
        dictionary: gensim Dictionary passed to ``LdaModel`` and ``CoherenceModel``.
        corpus: Bag-of-words corpus used to train each model.
        texts: Tokenized documents passed to ``CoherenceModel``.
        start: First number of topics to try.
        limit: Exclusive upper bound on the number of topics.
        step: Increment between consecutive topic counts.

    Returns:
        list: One coherence score per topic count, in the order of
        ``range(start, limit, step)``.
    """
    # NOTE: a variant of the LdaModel call with update_every=1, chunksize=100,
    # passes=10, alpha='auto', per_word_topics=True was left disabled here.

    def _score(topic_count):
        # Fit a model with a fixed seed, then score it with the 'c_v' measure.
        lda = LdaModel(corpus=corpus, num_topics=topic_count, id2word=dictionary, random_state=42)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        return scorer.get_coherence()

    return [_score(topic_count) for topic_count in range(start, limit, step)]

In [None]:
# Smallest number of topics to try
start=1
# Exclusive upper bound on the number of topics
limit=15
# Increment between consecutive topic counts
step=1

coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=text_data, start=start, limit=limit, step=step)

# Plot the coherence score against the number of topics
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Number Of Topics")
plt.ylabel("Coherence score")
plt.show()

# Print the scores. Each value is paired with the topic count it was computed
# for (taken from x); a plain enumerate() would label them 0, 1, 2, ... and be
# off from the real topic counts whenever start != 0 or step != 1.
for m, cv in zip(x, coherence_values):
    print("Number Of Topics =", m, " has Coherence Value of", round(cv, 4))

## lda config

In [None]:
# Number of topics for the final LDA model.
num_topics = 14
# Number of top words reported per topic (used by print_topics / show_topic below).
topic_word_num = 10
# Passed to LdaModel as random_state.
seed = 42
# NOTE(review): update_every, chunksize and passes are defined here but are not
# passed to the LdaModel call visible in this file -- confirm whether that is intended.
update_every = 1
chunksize = 100
passes = 10

In [None]:
# Define and train the final model
lda_model = LdaModel(corpus=corpus, 
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=seed,
                     )

# Print the topics with their top words
print(lda_model.print_topics(num_words=topic_word_num))

# Save the model. The output directory is created first so the save does not
# fail on a fresh checkout where ./lda_results does not exist yet.
os.makedirs('./lda_results', exist_ok=True)
lda_model.save('./lda_results/news_lda_topic_modeling.lda')

In [None]:
# Gather the top words of every topic into one flat list (topic by topic).
# show_topic yields entries whose first element is the word.
list_word = [
    entry[0]
    for topic_index in range(num_topics)
    for entry in lda_model.show_topic(topic_index, topic_word_num)
]

In [None]:
# Remove duplicate keywords while keeping first-seen order.
# dict.fromkeys is used instead of set() because the iteration order of a set
# of strings can change between interpreter runs, which made the exported
# keyword order non-reproducible.
list_word_set = list(dict.fromkeys(list_word))
series_word = pd.Series(list_word_set, name = 'keywords')

In [None]:
# Write the keyword Series to an Excel file without the index column.
series_word.to_excel('./lda_results/news_data_topic_word.xlsx', index=False)

In [None]:
# Prepare the pyLDAvis data for the trained model, keeping the model's own topic order (sort_topics=False).
lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, sort_topics=False, n_jobs=1)
# Save the visualization as an HTML file (relative path, so it lands in the current working directory).
pyLDAvis.save_html(lda_visualization, 'lda_result_vis.html')