## 라이브러리 불러오기

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from kiwipiepy import Kiwi
from bertopic import BERTopic
from umap import UMAP
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Use the Malgun Gothic font for all plot text (presumably so Korean labels
# render correctly -- the word-cloud cell points at the same Windows font).
plt.rcParams['font.family'] ='Malgun Gothic'
# Disable the Unicode minus sign on axes; matplotlib then draws an ASCII hyphen.
plt.rcParams['axes.unicode_minus'] =False

## 데이터 불러오기

In [None]:
# Path of the Excel workbook to analyse, relative to the working directory.
data_path = './data/뉴스_크롤링.xlsx'
# Load the workbook into a DataFrame; later cells read its 'Content' and
# 'Date' columns.
df = pd.read_excel(data_path, index_col=False)

In [None]:
# Display the loaded DataFrame for a quick visual check.
df

## 사용할 토크나이저 생성
- 한국어 형태소분석기인 kiwi 사용

In [None]:
class MyTokenizer:
    """Callable tokenizer that keeps multi-character noun-like morphemes.

    The wrapped analyzer must expose ``tokenize(text)`` returning indexable
    tokens. Position 0 is used as the surface form, position 1 as the
    part-of-speech tag and position 3 as the token length (presumably the
    ``(form, tag, start, len)`` layout of kiwipiepy tokens -- confirm).
    """

    def __init__(self, kiwi):
        # Morphological analyzer used for every call.
        self.kiwi = kiwi

    def __call__(self, text):
        """Return the forms of the kept tokens of *text*, in analyzer order.

        A token is kept when its tag is one of NNG, NNP, NNB, NR or NP and
        its length, converted to ``int``, is greater than 1.
        """
        kept_tags = ("NNG", "NNP", "NNB", "NR", "NP")
        return [
            morpheme[0]
            for morpheme in self.kiwi.tokenize(text)
            if morpheme[1] in kept_tags and int(morpheme[3]) > 1
        ]
    
# Shared tokenizer instance backed by a default-constructed Kiwi analyzer.
mytokenizer = MyTokenizer(Kiwi())

## 단어 행렬 생성
- 불용어 불러오기

In [None]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open('./stopwords.txt', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]

# Count-based term matrix built with the custom tokenizer, capped at 300
# features, with the stop words above excluded.
vectorizer = CountVectorizer(
    tokenizer=mytokenizer,
    max_features=300,
    stop_words=stopwords,
)

## BERTopic parameters
- https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic

In [None]:
# UMAP configuration handed to BERTopic: 15 neighbours, 5 output components,
# cosine metric; random_state is pinned so repeated runs use the same seed.
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)
# Topic model combining a multilingual sentence-embedding model (given by
# name), the UMAP instance and the custom CountVectorizer.
# NOTE(review): per the BERTopic API docs, nr_topics is the number of topics
# to reduce to after fitting, top_n_words the number of words kept per topic,
# and calculate_probabilities requests per-topic probabilities for each
# document -- confirm against the installed version.
model = BERTopic(embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
                 umap_model=umap_model,
                 vectorizer_model=vectorizer,
                 nr_topics=11,
                 top_n_words=10,
                 calculate_probabilities=True)

## BERTopic 실행
- fit: BERT, UMAP, HDBSCAN을 주어진 document에 맞추기
- fit_transform: 모델 맞추기, 토픽 생성, 문서별 확률 및 토픽 반환을 한 번에 진행

In [None]:
# Fit the model on the 'Content' column and keep the returned topic
# assignments and probabilities.
# NOTE(review): a pandas Series is passed here, whereas the topics-over-time
# cell converts the same column with .to_list() -- confirm the Series form is
# accepted by the installed BERTopic version.
topics, probs = model.fit_transform(df['Content'])

## 토픽별 단어 확인

In [None]:
# Inspect the representation of topic 0.
model.get_topic(0)

## 전체 토픽 확인

In [None]:
# Show the summary table of all topics.
model.get_topic_info()

## 토픽 병합

In [None]:
# Topic ids to merge.
topics_to_merge = [3, 4]
# Merge them on the fitted model; the original documents are passed again
# (presumably so the merged topic's representation can be recomputed -- see
# the BERTopic docs).
model.merge_topics(df['Content'], topics_to_merge)

In [None]:
# Re-check the topic summary table after the merge.
model.get_topic_info()

## 토픽 클러스터링 시각화

In [None]:
# Visualize topics 0 through 8 (nine topic ids, hard-coded).
model.visualize_topics(topics=list(range(9)))

In [None]:
# Print the representation of each of the topics 0 through 8.
for i in range(9):
    print(i,'번째 토픽 :', model.get_topic(i))

In [None]:
# Heatmap visualization restricted to topics 0 through 8.
model.visualize_heatmap(topics=list(range(9)))

# 다이나믹 토픽 모델링 시계열 변환 (데이터 프레임)

In [None]:
# Parallel lists taken from the DataFrame: one timestamp and one text per row.
timestamps = df['Date'].to_list()
content_text = df['Content'].to_list()

# Per-topic table over time computed from the documents and their timestamps;
# the plotting cells read its 'Timestamp', 'Frequency' and 'Topic' columns.
topics_over_time = model.topics_over_time(content_text, timestamps)
# Display the resulting table.
topics_over_time

# 직접 시각화 with plotly

In [None]:
import plotly.express as px

# Line chart of frequency against timestamp, one colour per topic.
fig = px.line(topics_over_time, x="Timestamp", y="Frequency", title='Topic Frequency Over Time', color="Topic")
fig.show()

In [None]:
import plotly.express as px


# Same line chart, restricted to the rows whose 'Topic' equals 0.
fig = px.line(topics_over_time[topics_over_time['Topic']==0], x="Timestamp", y="Frequency", title='Topic Frequency Over Time')
fig.show()

# BERTopic 라이브러리를 활용한 시각화

In [None]:
# BERTopic's built-in over-time plot for topics 0 through 8.
model.visualize_topics_over_time(topics_over_time, topics=list(range(9)))

In [None]:
# The same built-in plot, limited to topic 0.
model.visualize_topics_over_time(topics_over_time, topics=[0])

## 워드클라우드 생성

In [None]:
def create_wordcloud(model, topic, font_path="C:/Windows/Fonts/malgun.ttf"):
    """Draw a word cloud of the words that represent one topic.

    Args:
        model: Topic model exposing ``get_topic(topic)``, which is expected to
            return an iterable of ``(word, weight)`` pairs.
        topic: Identifier of the topic to draw.
        font_path: Font file passed to ``WordCloud``. Defaults to the Windows
            Malgun Gothic path this notebook originally hard-coded; pass a
            different Korean-capable font file on other platforms.

    Raises:
        ValueError: If the model yields no usable words for ``topic``.
    """
    word_weights = model.get_topic(topic)
    # A falsy result (BERTopic is documented to return False for an unknown
    # topic -- confirm for the installed version) cannot be unpacked into
    # pairs, so fail with a clear message instead of a TypeError.
    if not word_weights:
        raise ValueError(f"topic {topic!r} has no words to draw")

    # Skip blank words so they are not handed to the renderer.
    frequencies = {word: weight for word, weight in word_weights if word}
    if not frequencies:
        raise ValueError(f"topic {topic!r} has no words to draw")

    wc = WordCloud(background_color="white", max_words=1000, font_path=font_path)
    wc.generate_from_frequencies(frequencies)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Draw the word cloud for topic 5 of the fitted model.
create_wordcloud(model, topic=5)

## 문서 예측

In [None]:
# Read the unseen documents: every line of the file becomes one stripped string.
with open('./data/new_docs.txt', encoding='utf-8') as f:
    test_doc = [line.strip() for line in f]

# Echo what was loaded.
print(test_doc)

In [None]:
# Predict a topic for each loaded line. `test_doc` is already a list of
# strings, so it is passed as-is: wrapping it in another list would hand the
# model a list containing a list instead of a list of documents.
# NOTE(review): if the file is meant to hold ONE document spread over several
# lines, join the lines into a single string first -- confirm the file format.
topics, probs = model.transform(test_doc)
print('예측한 토픽 번호 :', topics)

## 모델 저장

In [None]:
# Persist the fitted model under 'bertopic_results' using the safetensors
# serialization.
# NOTE(review): check the BERTopic docs for whether save_ctfidf /
# save_embedding_model are needed for the reloaded model to be fully usable.
model.save('bertopic_results', serialization='safetensors')

## 불러오기

In [None]:
# Reload the saved model from the same path into a separate variable.
BerTopic_model = BERTopic.load("bertopic_results")