# 토픽모델링 - BERTopic

## 1. 데이터 로드

In [1]:
# 데이터 로드
from datasets import load_dataset
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

In [2]:
abstracts = dataset["Abstracts"]
titles = dataset["Titles"]

## 2. 모델 로드

### 2.1 임베딩 모델

In [3]:
from sentence_transformers import SentenceTransformer

#초록에 대한 임베딩 생성
embedding_model = SentenceTransformer("thenlper/gte-small")
embeddings = embedding_model.encode(abstracts, show_progress_bar = True)
embeddings.shape

Batches:   0%|          | 0/1405 [00:00<?, ?it/s]

(44949, 384)

### 2.2 차원축소 모델(UMAP)

In [5]:
from umap import UMAP

# 입력 임베딩 차원축소 (384-> 5)
umap_model = UMAP(
                    n_components = 5, 
                    min_dist = 0.0, 
                    metric= 'cosine', 
                    random_state=42
                )
reduced_embeddings = umap_model.fit_transform(embeddings)
reduced_embeddings.shape

(44949, 5)

### 2.4 클러스터링 모델(DBSCAN)

In [6]:
from hdbscan import HDBSCAN

# 모델 훈련 > 클러스터 추출
hdbscan_model = HDBSCAN(min_cluster_size=50).fit(reduced_embeddings)
clusters = hdbscan_model.labels_

#클러스터 개수 확인
len(set(clusters))

147

## 3. BERTopic 모델 훈련

In [7]:
from bertopic import BERTopic

# 임베딩모델로 BERTopic 훈련
topic_model = BERTopic(
                        embedding_model=embedding_model,
                        umap_model=umap_model,
                        hdbscan_model=hdbscan_model,
                        verbose=True
                        ).fit(abstracts, embeddings)

2025-08-25 22:08:16,406 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-25 22:08:57,969 - BERTopic - Dimensionality - Completed ✓
2025-08-25 22:08:57,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-25 22:08:59,206 - BERTopic - Cluster - Completed ✓
2025-08-25 22:08:59,217 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-25 22:09:03,296 - BERTopic - Representation - Completed ✓


## 4. 결과확인

### 4.1 토픽 결과 확인

In [8]:
# 토픽에 대한 정보 제공
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14145,-1_of_the_and_to,"[of, the, and, to, in, we, language, that, for...",[ Existing NLP datasets contain various biase...
1,0,2218,0_question_answer_questions_qa,"[question, answer, questions, qa, answering, a...","[ In recent years, there have been amazing ad..."
2,1,2077,1_speech_asr_recognition_end,"[speech, asr, recognition, end, acoustic, audi...",[ Recent works showed that end-to-end neural ...
3,2,1304,2_medical_clinical_biomedical_patient,"[medical, clinical, biomedical, patient, notes...",[ Medical entity retrieval is an integral com...
4,3,1012,3_translation_nmt_machine_bleu,"[translation, nmt, machine, bleu, neural, engl...","[ Many language pairs are low resource, meani..."
...,...,...,...,...,...
142,141,56,141_emoji_emojis_emoticons_sentiment,"[emoji, emojis, emoticons, sentiment, social, ...",[ The frequent use of Emojis on social media ...
143,142,54,142_typing_entity_type_types,"[typing, entity, type, types, grained, mention...",[ For the task of fine-grained entity typing ...
144,143,52,143_gans_gan_adversarial_generation,"[gans, gan, adversarial, generation, generativ...",[ Text generation is of particular interest i...
145,144,52,144_coherence_discourse_paragraph_text,"[coherence, discourse, paragraph, text, cohesi...",[ While there has been significant progress t...


### 4.2 특정 토픽 키워드 확인

In [9]:
# 토픽 키워드 확인
topic_model.get_topic(0)

[('question', 0.021790546294502645),
 ('answer', 0.016171550461421377),
 ('questions', 0.01609062266570959),
 ('qa', 0.016000456573199352),
 ('answering', 0.015081137402727372),
 ('answers', 0.009963533060617888),
 ('retrieval', 0.009703280888035352),
 ('comprehension', 0.0079384680212119),
 ('reading', 0.007328196366777005),
 ('the', 0.006546302696243955)]

In [10]:
# 토픽 키워드 확인
topic_model.get_topic(1)

[('speech', 0.029683938280848986),
 ('asr', 0.019766529959028785),
 ('recognition', 0.014106982078333514),
 ('end', 0.010764116888865593),
 ('acoustic', 0.009866149234395665),
 ('audio', 0.007097298049809765),
 ('speaker', 0.00694324796948636),
 ('wer', 0.006708450666420824),
 ('error', 0.006708123730004558),
 ('the', 0.006685266944856871)]

### 4.3 키워드로 토픽검색

In [11]:
topic_model.find_topics("topic modeling")

([27, -1, 45, 2, 115], [0.9546319, 0.9115192, 0.9070977, 0.9059248, 0.9050997])

In [12]:
topic_model.get_topic(27)

[('topic', 0.06873965291097482),
 ('topics', 0.036079472354503796),
 ('lda', 0.01667547419622479),
 ('latent', 0.013637437528202095),
 ('document', 0.012536231748955642),
 ('modeling', 0.012510482995139896),
 ('documents', 0.011908075346549758),
 ('dirichlet', 0.010059717369534853),
 ('word', 0.008870163999198342),
 ('allocation', 0.007838205821662697)]

In [13]:
# Bertopic 초록 할당되었는지 확인해보기
topic_model.topics_[titles.index("BERTopic: Neural topic modeling with a class-based TF-IDF procedure")]

27

## 5. 토픽문서 시각화

### 5.1 기본 시각화

In [23]:
fig = topic_model.visualize_documents(
                                    titles,
                                    reduced_embeddings=reduced_embeddings,
                                    width=1200,
                                    hide_annotations=True
                                    )

fig.update_layout(font=dict(size=16))

TypeError: Wrong key type: '9790' of type '<class 'numpy.int64'>'. Expected one of int, slice, range, str or Iterable.

### 5.2 키워드순 막대그래프

In [24]:
topic_model.visualize_barchart()

### 5.3 토픽 사이 관계 시각화

In [25]:
topic_model.visualize_heatmap(n_clusters=30)

### 5.4 토픽 사이 잠재적 계층구조 시각화

In [26]:
topic_model.visualize_hierarchy()

## 6. Reranker
- representaion model이라고도 하는 reranker


### 6.1 KeyBERT
- KeyBERT는 코사인유사도를 이용해 단어 임베딩과 문서 임베딩을 비교하여 텍스트에서 키워드 추출 후 순위 재조정

In [27]:
## 비교 할 수 있도록 세팅
from copy import deepcopy
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
import pandas as pd
def topic_differences(model, original_topics, nr_topics=5):
    "두 모델의 토픽 표현 차이 보여줌"
    df = pd.DataFrame(columns=["토픽", "원본", "업데이트"])
    for topic in range(nr_topics):

        # 모델과 토픽마다 상위5개 단어 추출
        og_words = " : ".join(list(zip(*original_topics[topic]))[0][:10])
        new_words = " : ".join(list(zip(*model.get_topic(topic)))[0][:10])
        df.loc[len(df)] = [topic, og_words, new_words]
    return df

In [None]:
from bertopic.representation import KeyBERTInspired
# KeyBERTINspired를 사용해 토픽 표현 업데이트
representation_model = KeyBERTInspired()
topic_model.update_topics(abstracts, representation_model=representation_model)

# 토픽 차이 보여줌
df_keybert = topic_differences(topic_model, original_topics)
df_keybert

Unnamed: 0,토픽,원본,업데이트
0,0,question : answer : questions : qa : answering,answering : questions : comprehension : retrie...
1,1,speech : asr : recognition : end : acoustic,phonetic : speech : transcription : language :...
2,2,medical : clinical : biomedical : patient : notes,nlp : clinical : language : text : medicine
3,3,translation : nmt : machine : bleu : neural,translation : translate : translations : multi...
4,4,summarization : summaries : summary : abstract...,summarization : summarizers : summaries : summ...


In [39]:
df_keybert.iloc[0]

토픽                                                      0
원본         question : answer : questions : qa : answering
업데이트    answering : questions : comprehension : retrie...
Name: 0, dtype: object

### 6.2 MMR
- MMR(maximal marginal relevance)
- TF-IDF, KeyBERT에 존재하는 중복된 키워드 최대한 제거 가능
- (예: summaries, summary, summarize -> summary)

In [None]:
from bertopic.representation import MaximalMarginalRelevance

# MMR 사용하여 토픽 표현 업데이트
representation_model = MaximalMarginalRelevance(diversity=0.2)
topic_model.update_topics(abstracts, representation_model=representation_model)

# 토픽 차이 시각화
topic_differences(topic_model, original_topics)

Unnamed: 0,토픽,원본,업데이트
0,0,question : answer : questions : qa : answering,qa : answering : retrieval : comprehension : k...
1,1,speech : asr : recognition : end : acoustic,speech : asr : audio : wer : model
2,2,medical : clinical : biomedical : patient : notes,medical : clinical : biomedical : patient : do...
3,3,translation : nmt : machine : bleu : neural,translation : nmt : bleu : neural : parallel
4,4,summarization : summaries : summary : abstract...,summarization : summaries : abstractive : docu...


## 7. 생성모델 활용한 교정

In [None]:
import openai
from bertopic.representation import OpenAI

prompt = """
I have a topic that contains the following documents:
[DOCUMENT]

The topic is described by the following keywords : [KEYWORDS]

Based on the information above, extract a short topic label in the following format :
topic : <short topic label>
"""

# GPT 사용하여 토픽표현 업데이트
client = openai.OpenAI(api_key = my_api_key)
representation_model = OpenAI(
    client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt
    )
topic_model.update_topics(abstracts, representation_model=representation_model) 

# 토픽차이 보여주기
topic_differences(topic_model, original_topics)