In [1]:
import os
import pickle
import re
from datetime import datetime
from pathlib import Path
from textwrap import dedent

import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from bertopic import BERTopic
# from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from konlpy.tag import Mecab
# from mecab import MeCab
from mpl_toolkits.mplot3d import Axes3D
from nltk import tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
# import umap.plot
from umap import UMAP

# from youtube_helper import YouTubeHelper

In [2]:
# Notebook-wide runtime settings (`os` and `pd` come from the import cell).
# Allow the Hugging Face tokenizers library to use its parallel implementation.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Show up to 100 characters per cell so long comment texts stay readable.
pd.set_option('display.max_colwidth', 100)


In [3]:
# Input data and saved models live next to the notebook's working directory.
DATA_DIR = Path.cwd().joinpath('content')
MODEL_DIR = Path.cwd().joinpath('models')

# Suffix identifying which exported dataset variant to load.
VERSION = "v2-morphs"

In [5]:
# Load the CSV export for the selected VERSION and parse the publication
# timestamps into pandas datetimes in one chained expression.
model_df = (
    pd.read_csv(DATA_DIR / f'model_df_{VERSION}.csv')
    .assign(published_at=lambda frame: pd.to_datetime(frame['published_at']))
)

In [6]:
# Peek at a single row to check the available columns.
model_df.head(n=1)

Unnamed: 0,published_at,updated_at,video_id,comment_id,parent_id,is_top_level_comment,comment_text,author_display_name,author_channel_id,like_count,comment_clean,Document,Topic,Name,CustomName,Top_n_words,Probability,Representative_document
0,2022-03-11 15:16:58+00:00,2022-03-11 15:16:58+00:00,aUrdB_Awn3w,UgwoEwbD29y57EbuTut4AaABAg,,True,독자개발아니면핵고유 핵우산은 소 용헚 없음,주나라,UCctlWJYESr_1n6uN5iKOvKw,0,독자개발아니면핵고유 핵우산은 소 용헚 없음,독자개발아니면핵고유 핵우산은 소 용헚 없음,2,2_핵무기_무장_개발_만들,"핵무기, 무장, 개발, 만들, 미사일, 해야, 잠수함, 무기, 필요, 기술, 가능, 나라, 원자력, 핵폭탄, 원전, 핵우산, 보다, 사용, 핵실험, 생각",핵무기 - 무장 - 개발 - 만들 - 미사일 - 해야 - 잠수함 - 무기 - 필요 - 기술 - 가능 - 나라 - 원자력 - 핵폭탄 - 원전 - 핵우산 - 보다 - 사용 - ...,0.535131,False


In [11]:
# Slice out the rows previously assigned to topics 1 and 2.
topic1 = model_df.loc[model_df['Topic'] == 1]
topic2 = model_df.loc[model_df['Topic'] == 2]

# Report how many comments fall into each subset.
print(f"Topic 1: {topic1.shape[0]}")
print(f"Topic 2: {topic2.shape[0]}")

Topic 1: 49139
Topic 2: 32366


In [8]:
# Texts of the topic-2 comments, as a plain Python list for the encoder.
docs = topic2.loc[:, 'Document'].to_list()

In [9]:
# Name of the sentence-transformers checkpoint used to embed the comments.
sentence_transformer_model = 'paraphrase-multilingual-mpnet-base-v2'

# Choose the accelerator at runtime instead of hard-coding Apple's 'mps'
# backend, which fails on machines that do not provide it. 'mps' is still
# preferred when present, so behaviour on Apple Silicon is unchanged.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

sentence_model = SentenceTransformer(sentence_transformer_model, device=device)

In [10]:
# Encode every document once up front; the resulting vectors are passed to
# both the topic-model fit and the document visualisation.
embeddings = sentence_model.encode(
    docs,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1012 [00:00<?, ?it/s]

In [12]:
class KoreanTokenizer:
    """Callable tokenizer that splits text into morphemes and filters them.

    An instance is called with one string and returns a list of tokens, so it
    can be handed to ``CountVectorizer(tokenizer=...)``.

    Mecab:
    https://github.com/jonghwanhyeon/python-mecab-ko
    https://www.lesbonscomptes.com/recoll/pages/recoll-korean.html
    """

    # Upper bound on the number of characters passed to the tagger per call.
    MAX_CHARS = 1000000

    def __init__(self, tagger, stop_words):
        """Store the tagger and the stop-word collection.

        Args:
            tagger: Object exposing ``morphs(text)`` that returns a sequence
                of string tokens (e.g. a konlpy ``Mecab`` instance).
            stop_words: Iterable of tokens to discard, or ``None`` for no
                stop-word filtering.
        """
        self.tagger = tagger
        # A frozenset makes the per-token membership test constant-time
        # instead of a linear scan over a list, and tolerates None.
        self.stop_words = frozenset(stop_words) if stop_words is not None else frozenset()

    def __call__(self, sent):
        """Tokenize ``sent`` and drop short tokens and stop words.

        Args:
            sent: Text to tokenize; only the first ``MAX_CHARS`` characters
                are processed.

        Returns:
            List of tokens longer than one character that are not stop words,
            in the order produced by the tagger.
        """
        sent = sent[:self.MAX_CHARS]
        # The tagger's nouns() was the alternative considered to morphs().
        word_tokens = self.tagger.morphs(sent)
        stop_words = self.stop_words
        return [word for word in word_tokens if len(word) > 1 and word not in stop_words]

In [None]:
"""
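topic 1: nn (n_neighbors)=, nc (n_components)=, md (min_dist)=, mcs (min_cluster_size)=, ms (min_samples)=, csm (cluster_selection_method)='eom'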
"""

In [17]:
# Dimensionality reduction applied to the sentence embeddings before
# clustering; random_state is pinned for reproducibility.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    min_samples=15,
    metric='euclidean',
    cluster_selection_method='eom',  # 'leaf' or 'eom'
    prediction_data=True,
)

# Stop words, one per line.
# https://github.com/stopwords-iso/stopwords-iso
# Decode explicitly as UTF-8 so the non-ASCII entries do not depend on the
# platform's default encoding (assumes the file is saved as UTF-8).
with open(DATA_DIR / 'kr_stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read().splitlines()

# Morpheme-based tokenizer that also removes the stop words loaded above.
custom_tokenizer = KoreanTokenizer(
    Mecab(),
    stop_words=stop_words,
)

# token_pattern is unused once a custom tokenizer is supplied; setting it to
# None silences scikit-learn's warning about the ignored parameter.
vectorizer_model = CountVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
)

# Re-rank each topic's candidate words for diversity.
mmr_model = MaximalMarginalRelevance(
    diversity=0.5,
    top_n_words=20,
)

ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
)

# Assemble the pipeline from the components configured above.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=mmr_model,
    ctfidf_model=ctfidf_model,
    top_n_words=20,
    calculate_probabilities=True,
    verbose=True,
)

In [18]:
# Fit on the topic-2 documents, reusing the precomputed embeddings so the
# texts are not encoded a second time.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

2023-05-05 15:09:20,054 - BERTopic - Reduced dimensionality
2023-05-05 15:09:22,086 - BERTopic - Clustered reduced embeddings


In [19]:
# Build a comma-separated label from each topic's first 10 words (without
# the numeric topic prefix) and register the labels on the model.
topic_labels = topic_model.generate_topic_labels(
    nr_words=10,
    topic_prefix=False,
    separator=', ',
)
topic_model.set_topic_labels(topic_labels)

# One row per topic; print the row count, then display the first 20 rows.
topics_df = topic_model.get_topic_info()
print(topics_df.shape[0])
topics_df.head(20)

12


Unnamed: 0,Topic,Count,Name,CustomName
0,-1,1995,-1_sm_미사일_지진_비밀리,"sm, 미사일, 지진, 비밀리, km, 개발, 화성, 가능, 무기, 기술"
1,0,26020,0_핵무기_보유_무장_해야,"핵무기, 보유, 무장, 해야, 미국, 만들, 전쟁, 국가, 개발, 생각"
2,1,1248,1_미사일_사거리_무기_발사,"미사일, 사거리, 무기, 발사, 방어, 현무, 제한, 극초음속, km, 순항"
3,2,1083,2_잠수함_추진_디젤_항모,"잠수함, 추진, 디젤, 항모, 원자력, 해군, 건조, 바다, 항공모함, 원자로"
4,3,743,3_드론_헬기_35_비행,"드론, 헬기, 35, 비행, 전투기, 엔진, kf, 비행기, 항공모함, 프로펠러"
5,4,263,4_위성_외계인_로켓_발사체,"위성, 외계인, 로켓, 발사체, 궤도, st, 성공, site, 우주선, 34"
6,5,228,5_찬성_지지_적극_무장,"찬성, 지지, 적극, 무장, 응원, 강력, 독자, 오세훈, 70, 국민"
7,6,187,6_icbm_핵탄두_탄두_기술,"icbm, 핵탄두, 탄두, 기술, 발사, 발사체, 진입, 운반체, 미사일, 완성"
8,7,176,7_트럼프_미군_철수_대통령,"트럼프, 미군, 철수, 대통령, 재선, 달러, 주둔, 요구, trump, 분담금"
9,8,174,8_비핵화_선언_종전_멈춰,"비핵화, 선언, 종전, 멈춰, 현실, 타령, 소리, 속아, 비무장, 포기"


In [20]:
# Plot the documents, reusing the precomputed embeddings.
# A `topics=range(50)` argument was previously tried here to limit the plot.
docs_fig = topic_model.visualize_documents(
    docs,
    embeddings=embeddings,
)
docs_fig.show()