In [1]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from nltk import tokenize
import matplotlib as mlp
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from datetime import datetime
from textwrap import dedent
import pickle

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
# import umap.plot
from umap import UMAP
from hdbscan import HDBSCAN
# from mecab import MeCab
from konlpy.tag import Mecab
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
# from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance

# from youtube_helper import YouTubeHelper

In [12]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
pd.set_option('display.max_colwidth', 100)


In [3]:
DATA_DIR = Path.cwd() / 'content'
MODEL_DIR = Path.cwd() / 'models'

In [237]:
def clean_text(text):
    clean_text = str(text).lower()
    
    
    # remove urls
    clean_text = re.sub(r"http\S+", ' ', clean_text)
    clean_text = re.sub(r"www.\S+", ' ', clean_text)
    
    news_words = [
        'voa', 'ytn', 'yes', 'top', 'news', 'tv', 'yonhapnews', 'yonhap', 'sbs', 'kbs', 'mbc', 'mbn', 'ytn', 'tkc', 'shorts',
        '연합뉴스', '국방뉴스', '연합', '방송', '뉴스쇼', '총합뉴스', '사이언스', '굿모닝', '뉴스', '모닝',
    ]
    
    for word in news_words:
        clean_text = re.sub(word, '', clean_text)
        
    # remove numbers
    clean_text = re.sub(r"\d+", ' ', clean_text)
    
    # remove/replace recurring special characters
    clean_text = re.sub(r"[ㅋㅎㅠㅜㅡ~;^@•·|,\-\n\t\s]+", ' ', clean_text)
    clean_text = re.sub(r"[-=+,#/\?:“”^$*\"※~&%ㆍ☞!』\\‘|\(\)\[\]\<\>`\'…》]", ' ', clean_text)
    clean_text = re.sub(r"[\.]+", ' ', clean_text)
    clean_text = re.sub(r"[!]+", ' ', clean_text)
    clean_text = re.sub(r"[?]+", ' ', clean_text)
    
    
    # remove multiple spaces
    clean_text = re.sub(r"\s+", ' ', clean_text)
    
    clean_text = clean_text.replace(" 년", "")
    clean_text = clean_text.replace(" 월", "")
    clean_text = clean_text.replace(" 일", "")
    
    return clean_text.strip()

In [238]:
videos = pd.read_csv(DATA_DIR / 'nuclear_videos.csv')
videos.published_at = pd.to_datetime(videos.published_at)
videos.window_start = pd.to_datetime(videos.window_start)
videos.window_end = pd.to_datetime(videos.window_end)

videos['video_title_clean'] = videos.video_title.apply(clean_text)
videos['video_description_clean'] = videos.video_description.apply(clean_text)

In [239]:
docs = videos.query("video_description_clean.str.len() > 4").video_description_clean.tolist()

In [79]:
sentence_transformer_model = 'paraphrase-multilingual-mpnet-base-v2'
sentence_model = SentenceTransformer(sentence_transformer_model, device='mps')

In [240]:
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/48 [00:00<?, ?it/s]

In [19]:
class KoreanTokenizer:
    """
    
    Mecab:
    https://github.com/jonghwanhyeon/python-mecab-ko
    https://www.lesbonscomptes.com/recoll/pages/recoll-korean.html
    """
    def __init__(self, tagger, stop_words):
        self.tagger = tagger
        self.stop_words = stop_words
    def __call__(self, sent):
        sent = sent[:1000000]
        word_tokens = self.tagger.morphs(sent)
        # word_tokens = self.tagger.nouns(sent)
        results = [word for word in word_tokens if len(word) > 1 and word not in self.stop_words]
        return results

In [None]:
"""
video titles, nn=5, nc=3, mcs=20, ms=3
"""

In [265]:
umap_model = UMAP(
    n_neighbors=20,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom', # 'leaf' or 'eom'
    prediction_data=True,
)

# vectorizer_model = CountVectorizer(
#     stop_words='english',
#     ngram_range=(1, 2),
# )

# https://github.com/stopwords-iso/stopwords-iso
with open(DATA_DIR / 'kr_stopwords.txt', 'r') as f:
    stop_words = f.read().splitlines()

custom_tokenizer = KoreanTokenizer(
    Mecab(),
    stop_words=stop_words,
)


vectorizer_model = CountVectorizer(
    tokenizer=custom_tokenizer,
)

mmr_model = MaximalMarginalRelevance(
    diversity=0.5,
    top_n_words=20,
)

ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
)

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=mmr_model,
    ctfidf_model=ctfidf_model,
    top_n_words=20,
    calculate_probabilities=True,
    verbose=True,
)

In [266]:
topics, probs = topic_model.fit_transform(docs, embeddings) 

2023-05-04 16:08:23,250 - BERTopic - Reduced dimensionality
2023-05-04 16:08:23,317 - BERTopic - Clustered reduced embeddings


In [267]:
topic_labels = topic_model.generate_topic_labels(nr_words=10, topic_prefix=False, separator=', ')
topic_model.set_topic_labels(topic_labels)

topics_df = topic_model.get_topic_info()
print(len(topics_df))
topics_df.head(20)

28


Unnamed: 0,Topic,Count,Name,CustomName
0,-1,507,-1_the_nuclear_정부_핵실험,"the, nuclear, 정부, 핵실험, 미사일, 핵무기, korea, 보도, 정보, north"
1,0,93,0_국당_jtbc_재배치_국회,"국당, jtbc, 재배치, 국회, 자유, 민주당, 무장, 정치, 남아공, 주장"
2,1,84,1_유엔_대사_제재_대북,"유엔, 대사, 제재, 대북, 안보리, 규탄, 스티븐, 특별, 대화, 회담"
3,2,84,2_김정은_공사_태영_신년사,"김정은, 공사, 태영, 신년사, 완성, 십니까, 정도, 질문, 주재, 남북"
4,3,58,3_트럼프_대선_공화_후보,"트럼프, 대선, 공화, 후보, 도널드, 클린턴, 미군, 힐러리, 발언, 분담금"
5,4,50,4_slbm_잠수함_사업_체계,"slbm, 잠수함, 사업, 체계, 해군, 전투기, 훈련, kf, 기술, 발사"
6,5,49,5_중국_사드_롯데_베이징,"중국, 사드, 롯데, 베이징, 그룹, 거래, the, 제재, 삼성, 혐의"
7,6,46,6_재배치_조현진_한반도_무장,"재배치, 조현진, 한반도, 무장, 확장, 한국인, 백악관, 부정, 김준형, 안규백"
8,7,43,7_플루토늄_제조_능력_최대,"플루토늄, 제조, 능력, 최대, 우라늄, 분석, 핵무기, 국장, 백서, 확보"
9,8,40,8_평화_arirang_to_korean,"평화, arirang, to, korean, president, he, 박근혜, 참전, moon, 번영"


In [268]:
docs_fig = topic_model.visualize_documents(docs, embeddings=embeddings) # , topics=range(50)
docs_fig.show()