In [1]:
import os
import re
import sys
import time
import warnings

import pandas as pd
from tqdm import tqdm
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import tomotopy as tp
from konlpy.tag import Okt
warnings.filterwarnings('ignore')


In [2]:
class CustomTokenizer:
    """Callable tokenizer that keeps multi-character nouns and adjectives.

    Each call cleans the sentence, runs the wrapped part-of-speech tagger on
    it and returns the surviving tokens as a list of strings.

    Args:
        tagger: Object exposing ``pos(text, stem=True)`` that returns
            ``(token, tag)`` pairs (the notebook passes ``Okt()``).
        stopwords: Optional iterable of tokens to drop. When omitted, the
            module-level ``stop_words`` collection is looked up at call time,
            which is what the tokenizer did before this argument existed.
    """

    # Every character that is not a space, a digit, a Hangul syllable or '+'
    # is replaced by a space. Compiled once instead of on every call.
    _DISALLOWED_CHARS = re.compile('[^ 0-9가-힣+]')
    # Part-of-speech tags whose tokens are kept.
    _KEPT_TAGS = frozenset(('Adjective', 'Noun'))

    def __init__(self, tagger, stopwords=None):
        self.tagger = tagger
        self.stopwords = None if stopwords is None else frozenset(stopwords)

    def __call__(self, sent):
        """Tokenize ``sent`` and return the filtered list of tokens."""
        # Blank out disallowed characters, then collapse whitespace runs.
        cleaned = " ".join(self._DISALLOWED_CHARS.sub(' ', sent).split())

        if self.stopwords is not None:
            excluded = self.stopwords
        else:
            # Fall back to the notebook-level list; a set makes each
            # membership test constant-time instead of a linear scan.
            excluded = frozenset(stop_words)

        return [
            token
            for token, tag in self.tagger.pos(cleaned, stem=True)
            if tag in self._KEPT_TAGS
            and len(token) > 1          # drop single-character tokens
            and token not in excluded   # drop stop words
        ]

In [3]:
stop_words_file = '../datasets/RawDatasets/stopwords.txt'
# One stop word per line, stripped of surrounding whitespace. The context
# manager guarantees the file handle is closed once the list is built.
with open(stop_words_file, encoding="utf-8") as stop_words_handle:
    stop_words = [line.strip() for line in stop_words_handle]

In [5]:
# Dataset identifier: selects the feather file read here and the
# ./models/<dataset_name>/ folder the training functions save into.
dataset_name = 'enter_dataset_name'
data = pd.read_feather(f'./datasets/{dataset_name}.ftr')

# `okt` column: presumably one token sequence per document (it is fed to
# gensim's Dictionary and to the hLDA corpus) -- TODO confirm.
texts = data['okt']
# `corrected_twit` column as a plain Python list, used as BERTopic input.
docs = data['corrected_twit'].tolist()

# Vocabulary and bag-of-words corpus built from the token sequences.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [6]:
def train_bertopic_model(docs):
    """Fit a BERTopic model on ``docs`` and save it to disk.

    The model combines a seeded UMAP reducer with a ``CountVectorizer`` whose
    tokenizer is ``CustomTokenizer(Okt())``.

    Args:
        docs: Documents handed to ``BERTopic.fit_transform``.

    Returns:
        The fitted ``BERTopic`` instance.

    Note:
        Reads the module-level ``dataset_name`` to build the output path
        ``./models/{dataset_name}/bertopic``.
    """
    save_path = f'./models/{dataset_name}/bertopic'
    # Create the output folder before fitting so that a missing directory
    # cannot make the final save fail after the expensive fit has finished.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Fixed random_state so the dimensionality reduction is repeatable.
    umap_model = UMAP(n_neighbors=15, n_components=5,
                      min_dist=0.0, metric='cosine', random_state=33)
    vectorizer = CountVectorizer(tokenizer=CustomTokenizer(Okt()))
    model = BERTopic(language='Korean',
                     top_n_words=20,
                     umap_model=umap_model,
                     vectorizer_model=vectorizer)

    # The per-document topics/probabilities are not needed here; only the
    # fitted model is kept.
    model.fit_transform(docs)
    model.save(save_path)
    return model

In [7]:
def train_hlda_model(texts):
    """Train a depth-3 hierarchical LDA model with tomotopy and save it.

    Args:
        texts: Iterable of documents, each an iterable of tokens.

    Returns:
        The trained ``tp.HLDAModel``, so the caller can keep using it without
        reloading the saved file.

    Note:
        Reads the module-level ``dataset_name`` to build the output path
        ``./models/{dataset_name}/hlda.bin``. Progress is printed to stdout;
        the 'Training...' and 'Saving...' markers go to stderr.
    """
    save_path = f'./models/{dataset_name}/hlda.bin'
    # Create the output folder before the long training loop so a missing
    # directory cannot make the final save fail after training.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    def report_progress():
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    cps = tp.utils.Corpus()
    for words in texts:
        cps.add_doc(list(words))

    mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=3, corpus=cps)
    # Zero-iteration call made before the corpus statistics below are read.
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # NOTE(review): the run recorded in this notebook printed
    # `ll per word: -inf` at every visible step; the cause cannot be
    # determined from this code -- investigate before trusting the model.

    # 100 rounds of 10 iterations each: 7 regular iterations followed by 3
    # with freeze_topics=True.
    for _ in range(100):
        mdl.train(7)
        mdl.train(3, freeze_topics=True)
        report_progress()

    # 10 final rounds of 10 iterations, all with freeze_topics=True.
    for _ in range(10):
        mdl.train(10, freeze_topics=True)
        report_progress()

    mdl.summary(topic_word_top_n=20)
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)
    return mdl

In [8]:
# Train BERTopic on `docs` and report the elapsed wall-clock time in seconds
# (difference of two time.time() readings taken around the call).
start_time = time.time()
bertopic_model = train_bertopic_model(docs)
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [13]:
# Display the topic -> [(word, score), ...] mapping of the fitted model.
# NOTE(review): the saved output of this cell lists English stems such as
# 'file' and 'peopl', which a tokenizer that strips every non-Hangul letter
# could not produce -- the output looks like it came from a different run;
# re-run on a fresh kernel to confirm.
bertopic_model.get_topics()

{-1: [('file', 0.004394510804181723),
  ('god', 0.0035612874206750987),
  ('program', 0.0034123604032555033),
  ('work', 0.003253150070779765),
  ('dont', 0.0032311506105375354),
  ('time', 0.0031307539940077936),
  ('peopl', 0.0030160646986339976),
  ('christian', 0.0029672223090651366),
  ('problem', 0.0029455843552837783),
  ('system', 0.0029441459883005204),
  ('im', 0.0029339787530833165),
  ('window', 0.0028847317190180757),
  ('card', 0.002774878391757264),
  ('version', 0.002759756305176075),
  ('support', 0.0027173661528780047),
  ('read', 0.0027023239462804906),
  ('run', 0.0026955167907587235),
  ('question', 0.002649384527063812),
  ('includ', 0.0026463714857358654),
  ('find', 0.0025814167438281983),
  ('entri', 0.0025678251430762016),
  ('jesu', 0.002555621319335126),
  ('set', 0.002541870903566089),
  ('bit', 0.0025346107524206016),
  ('post', 0.002531195387244785),
  ('ive', 0.002515865662020003),
  ('drive', 0.0025064709034484283),
  ('email', 0.002479088981291505),
  

In [14]:
# Train the hLDA model on `texts` and report the elapsed wall-clock time in
# seconds. `hlda_model` holds whatever train_hlda_model returns.
start_time = time.time()
hlda_model=train_hlda_model(texts)
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))


Training...


Num docs: 17999 , Vocab size: 8691 , Num words: 1201009
Removed top words: []
Iteration: 00010	ll per word: -inf	Num. of topics: 952
Iteration: 00020	ll per word: -inf	Num. of topics: 1048
Iteration: 00030	ll per word: -inf	Num. of topics: 1084
Iteration: 00040	ll per word: -inf	Num. of topics: 1153
Iteration: 00050	ll per word: -inf	Num. of topics: 1171
Iteration: 00060	ll per word: -inf	Num. of topics: 1192
Iteration: 00070	ll per word: -inf	Num. of topics: 1207
Iteration: 00080	ll per word: -inf	Num. of topics: 1225
Iteration: 00090	ll per word: -inf	Num. of topics: 1228
Iteration: 00100	ll per word: -inf	Num. of topics: 1234
Iteration: 00110	ll per word: -inf	Num. of topics: 1252
Iteration: 00120	ll per word: -inf	Num. of topics: 1258
Iteration: 00130	ll per word: -inf	Num. of topics: 1273
Iteration: 00140	ll per word: -inf	Num. of topics: 1285
Iteration: 00150	ll per word: -inf	Num. of topics: 1274
Iteration: 00160	ll per word: -inf	Num. of topics: 1283
Iteration: 00170	ll per wor

Saving...


Execution Time: 26597.3605 seconds
