In [None]:
"""BERTopic with Korean text
https://wikidocs.net/162079

"""

In [71]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from nltk import tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from textwrap import dedent
import pickle

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from mecab import MeCab
from sklearn.feature_extraction.text import CountVectorizer
# from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance

from youtube_helper import YouTubeHelper

In [2]:
DATA_DIR = Path.cwd() / 'content'
COMMENTS_DIR = DATA_DIR / 'video_comments'
SEARCH_DIR = DATA_DIR / 'video_search'

In [208]:
videos = pd.read_csv(DATA_DIR / 'video_details_20230501-0122.csv')

In [227]:
videos.sort_values(by='view_count', ascending=False).iloc[0]

published_at                                      2020-10-17T03:00:01Z
video_id                                                   KDKjmzSbryM
video_title                                 (ENG)북한이 한국에게 핵을 쏘면 생기는 일?
video_description    #미사일 #북한핵 #북한\n\n오늘은 미사일 전문가 멋진창창님과 함께\n\n북한의 ...
channel_id                                    UCoCvTlU0KpNYwnMIgs7MPrA
channel_title                                                  보다 BODA
tags                 ['씨랩', 'CLAB', '소셜', '리액션', 'Youtube reaction'...
category_id                                                         24
duration                                                      PT15M36S
view_count                                                     5766047
like_count                                                         NaN
comment_count                                                      NaN
regions_blocked                                                    NaN
regions_allowed                                                    NaN
conten

In [257]:
nuclear_keywords = ['핵무장', '핵개발', '보유']
nuclear_videos = videos[videos.video_title.str.contains('|'.join(nuclear_keywords)) | videos.video_description.str.contains('|'.join(nuclear_keywords))]
print(len(nuclear_videos))
nuclear_videos.head()

810


Unnamed: 0,published_at,video_id,video_title,video_description,channel_id,channel_title,tags,category_id,duration,view_count,like_count,comment_count,regions_blocked,regions_allowed,content_rating,etag,window_start,window_end,query,collection_date
0,2023-02-02T05:46:30Z,fYX_Mru5tNw,"[자막뉴스] ""한국 핵무장, 전 세계 소프트파워 파괴"" 강력한 경고 / YTN",미국의 지그프리드 해커 박사는 한국의 핵무장을 주제로 한 세미나에서 한국의 과학기술...,UChlgI3UHCOnwUGzWzbJ3H5w,YTN,"['YTN실시간', 'YTN']",25,PT2M32S,1911736,11577.0,10295.0,,,,-Cy8CIs76Ylr0SvilBHAxD4hBsk,2023-01-01,2023-03-31 23:59:59,한국 핵무장,2023-05-01 01:06:25
1,2023-02-13T05:10:50Z,9fS-DF4xn5k,과거 프랑스 드골의 핵무장 논리 지금 한국에 맞는가...,서울의소리 유튜브 계정은 지난 6월부터 모든 수익 창출이 정지되었습니다. 서울의소리...,UCUxTPRSns--l5BX2537u7Rw,서울의소리 Voice of Seoul,"['서울의소리', '초심', '대한민국', '서울', '백은종', '유튜브', '유...",25,PT59S,7173,461.0,15.0,,,,29pMZ-qbK8A4x17znzWlGomYL2s,2023-01-01,2023-03-31 23:59:59,한국 핵무장,2023-05-01 01:06:25
2,2023-01-28T03:08:32Z,DjX42uSww9I,'한국 핵무장론' 미국의 심층 보도...보고서 내용 공개 / YTN,"CNN, 한국 핵무장론 증가 배경 보도 \n""핵무장 발언은 국내 지지층 의식한 포퓰...",UChlgI3UHCOnwUGzWzbJ3H5w,YTN,"['source:영상', 'type:방송', 'genre:정치', 'format:기타']",25,PT3M27S,78644,720.0,680.0,,,,j_1MmGLMYSEJlMmKvDdJDlSB65A,2023-01-01,2023-03-31 23:59:59,한국 핵무장,2023-05-01 01:06:25
3,2023-03-25T12:00:07Z,WjUEDHlvlKQ,"[워싱턴 톡] 한국 핵무장, 미국 전략적 이익에 부합하나? 예외 인정될까?",북한이 각종 무기 실험으로 한국의 방어망을 무력화하려고 하지만 실전에서 미군 증원과...,UC8d0ZgFEl4AUdyAKGE8O3yg,VOA 한국어,"['VOA', '미국의소리', '뉴스']",25,PT25M1S,58259,2205.0,656.0,,,,lZRSvc19rxk5pId1X5wSdZheo2w,2023-01-01,2023-03-31 23:59:59,한국 핵무장,2023-05-01 01:06:25
4,2023-01-04T06:49:15Z,pbst6XB7GVY,"[자막뉴스] ""한국은 핵 보유국이..."" 미국의 강력한 한마디 / YTN",백악관은 조 바이든 대통령이 한미 핵 공동 연습 계획이 없다고 말한 건 한국이 핵 ...,UChlgI3UHCOnwUGzWzbJ3H5w,YTN,"['source:영상', 'type:디지털', 'genre:국제', 'format:...",25,PT1M59S,1596609,11012.0,4858.0,,,,hw10zs-q2HRfRWnoznTTZiYHHOo,2023-01-01,2023-03-31 23:59:59,한국 핵무장,2023-05-01 01:06:25


In [192]:
yt = YouTubeHelper()
comments_df = yt.load_comments(comments_dir=COMMENTS_DIR)
len(comments_df)

# Clean comments
comments_df['comment_clean'] = comments_df['comment_text'].str.replace(r"[ㅋㅎㅠㅜㅡ~;\-\n\s]+", ' ', regex=True)
comments_df['comment_clean'] = comments_df['comment_clean'].str.replace(r"[\.]+", '.', regex=True)
comments_df['comment_clean'] = comments_df['comment_clean'].str.replace(r"[!]+", '!', regex=True)
comments_df['comment_clean'] = comments_df['comment_clean'].str.replace(r"[?]+", '?', regex=True)

# Remove short comments (less than 5 characters)
comments_df = comments_df[comments_df['comment_clean'].str.len() > 5]


In [207]:
comments_df.head(1)

Unnamed: 0,published_at,updated_at,video_id,comment_id,parent_id,is_top_level_comment,comment_text,author_display_name,author_channel_id,author_profile_image_url,author_channel_url,like_count,etag,comment_clean
0,2022-01-16T20:01:41Z,2022-01-16T20:01:41Z,aqtEB3rhK1Y,UgwctwS5s8vaEL2yrBh4AaABAg,,True,기생충은 영화그 자체로 수상하죠. 아젠다를 전달했으니..우리나라문화에 ‘그들이’개입...,Jihye Kim,UC5ZPN7UxrY93fsncafN4oqQ,https://yt3.ggpht.com/ytc/AGIKgqMQlvUMZ9FkLcEh...,http://www.youtube.com/channel/UC5ZPN7UxrY93fs...,2,3i_VKQb9rwjxFEQpKENN_Up6EDI,기생충은 영화그 자체로 수상하죠. 아젠다를 전달했으니.우리나라문화에 ‘그들이’개입했...


In [194]:
docs = comments_df['comment_clean'].tolist()

In [195]:
"""
https://www.sbert.net/docs/pretrained_models.html
"""

# sentence_transformer_model = 'paraphrase-multilingual-mpnet-base-v2'
sentence_transformer_model = 'paraphrase-multilingual-MiniLM-L12-v2'
sentence_model = SentenceTransformer(sentence_transformer_model, device='mps')

In [26]:
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/9422 [00:00<?, ?it/s]

In [27]:
with open(DATA_DIR / 'embeddings_v1.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [197]:
sample = docs[:10000]

In [198]:
sample_embeddings = sentence_model.encode(sample, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [34]:
class KoreanTokenizer:
    """
    Utilize KoNLPy Mecab for tokenization:
    https://konlpy.org/en/latest/
    
    Alternative package with only Mecab:
    https://github.com/jonghwanhyeon/python-mecab-ko
    https://www.lesbonscomptes.com/recoll/pages/recoll-korean.html
    """
    def __init__(self, tagger):
        self.tagger = tagger
    def __call__(self, sent):
        sent = sent[:1000000]
        word_tokens = self.tagger.morphs(sent)
        results = [word for word in word_tokens if len(word) > 1]
        return results

In [199]:
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=25,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=False,
)

# vectorizer_model = CountVectorizer(
#     stop_words='english',
#     ngram_range=(1, 2),
# )

custom_tokenizer = KoreanTokenizer(MeCab())

vectorizer_model = CountVectorizer(
    tokenizer=custom_tokenizer,
)

mmr_model = MaximalMarginalRelevance(
    diversity=0.5,
)

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # representation_model=mmr_model,
    calculate_probabilities=False,
    verbose=True,
)

In [166]:
# topics, probs = topic_model.fit_transform(docs, embeddings)

In [200]:
topics, probs = topic_model.fit_transform(sample, sample_embeddings)

2023-05-01 17:28:19,082 - BERTopic - Reduced dimensionality
2023-05-01 17:28:19,207 - BERTopic - Clustered reduced embeddings


In [201]:
topics_df = topic_model.get_topic_info()
print(len(topics_df))
topics_df.head(20)

57


Unnamed: 0,Topic,Count,Name
0,-1,3353,-1_중국_한국_나라_일본
1,0,1257,0_롯데_진짜_불매_해라
2,1,696,1_롯데_사람_불매_게임
3,2,619,2_핵무기_미국_개발_무장
4,3,318,3_북한_미국_무장_핵무기
5,4,280,4_중국_한국_중국인_한국인
6,5,182,5_중국_이해_나라_중국인
7,6,182,6_중국_중국인_추방_보내
8,7,124,7_기업_일본_한국_롯데
9,8,118,8_한국_북한_대한민국_아니


In [202]:
topic_model.visualize_documents(sample, embeddings=sample_embeddings)