In [3]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from konlpy.tag import Okt

# Load the Excel file and read the 'Abstract' column
df = pd.read_excel('data.xlsx')
texts = df['Abstract'].dropna().tolist()

# Tokenize the texts using the Okt tokenizer
tokenizer = Okt()
texts = [tokenizer.morphs(text) for text in texts]

# Create a dictionary from the tokenized texts
dictionary = corpora.Dictionary(texts)

# Convert the tokenized texts into a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Find the optimal number of topics
start_topic = 2
end_topic = 10
step = 1
coherence_scores = []
for num_topics in range(start_topic, end_topic+1, step):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)
    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence_score}")

# Select the optimal number of topics
optimal_num_topics = coherence_scores.index(max(coherence_scores)) + start_topic
print(f"Optimal Number of Topics: {optimal_num_topics}")

# Create the LDA model with the optimal number of topics
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=optimal_num_topics)


Number of Topics: 2, Coherence Score: 0.25394007732480595
Number of Topics: 3, Coherence Score: 0.24595476577780115
Number of Topics: 4, Coherence Score: 0.2435987095817201
Number of Topics: 5, Coherence Score: 0.24984490948461816
Number of Topics: 6, Coherence Score: 0.2624446239649353
Number of Topics: 7, Coherence Score: 0.2509098567758015
Number of Topics: 8, Coherence Score: 0.25006804443464536
Number of Topics: 9, Coherence Score: 0.251699726245702
Number of Topics: 10, Coherence Score: 0.25406617352396016
Optimal Number of Topics: 6


일관성 점수가 높은 모델은 주제 간의 일관성이 높아 의미 있는 토픽을 잘 나타내는 경향이 있습니다. 따라서 이를 기준으로 최적의 토픽 수를 선택할 수 있습니다.