In [None]:
import pandas as pd
from tqdm import tqdm
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import warnings
from bertopic import BERTopic
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import tomotopy as tp
from konlpy.tag import Okt
import re
import sys
import time
import os
import torch
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pickle
warnings.filterwarnings('ignore')

2024-11-18 10:55:01.953066: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class CustomTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger
    def __call__(self, sent):
        #sent = sent[:1000000]
        hangul = re.compile('[^ 0-9가-힣+]')
        sent = hangul.sub(' ', sent)
        sent = " ".join(sent.split())
        word_tokens = self.tagger.pos(sent, stem=True)
        temp = [word[0] for word in word_tokens if (word[1] =='Adjective' or  word[1] =='Noun')]
        result = [word for word in temp if (len(word) > 1  and ( not word in stop_words))]
        return result

In [4]:
stop_words_file = '../Datasets/stopwords_kor.txt'
stop_words =  [line.strip() for line in open(stop_words_file, encoding="utf-8").readlines()]

def remove_stopwords(words, stop_words):
    return [word for word in words if word not in stop_words]

In [5]:
dataset_name = 'covid'
data = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')
texts =[remove_stopwords(text, stop_words) for text in data.okt]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
docs = data.corrected_twit.to_list()

In [None]:
# train BERTopic
def train_bertopic_model(docs):
    umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=34)
    custom_tokenizer = CustomTokenizer(Okt())
    vectorizer = CountVectorizer(tokenizer = custom_tokenizer)
    model = BERTopic(language = 'Korean',
         top_n_words=20,
         umap_model = umap_model,
         vectorizer_model = vectorizer,
         ) 
    topic, prob = model.fit_transform(docs)
    model.save(f'../models/{dataset_name}/bertopic')
    return model


In [None]:
# this is optional
def save_model(model, save_path, tokenizer):
    # Save the model without the tokenizer
    model.vectorizer_model.tokenizer = None
    with open(save_path, 'wb') as f:
        pickle.dump((model, tokenizer), f)

def load_model(save_path):
    with open(save_path, 'rb') as f:
        model, tokenizer = pickle.load(f)
        # Restore the tokenizer
        model.vectorizer_model.tokenizer = tokenizer
    return model

In [None]:
# train hLDA
def train_hlda_model(texts):
    cps = tp.utils.Corpus()
    for words in texts:
        doc_rm = []
        for word in words:
            doc_rm.append(word)
        cps.add_doc(doc_rm)
        
    mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=3, corpus=cps)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    
    for _ in range(0, 1000, 10):
        mdl.train(7)
        mdl.train(3, freeze_topics=True)
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    for _ in range(0, 100, 10):
        mdl.train(10, freeze_topics=True)
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    mdl.summary(topic_word_top_n=20)
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(f'../model/{dataset_name}/hlda.bin', True)

In [18]:
start_time = time.time()
bertopic_model = train_bertopic_model(docs)
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))

PicklingError: Can't pickle <java class 'kr.lucypark.okt.OktInterface'>: it's not found as kr.lucypark.okt.kr.lucypark.okt.OktInterface

In [None]:
start_time = time.time()
hlda_model=train_hlda_model(texts)
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))