In [None]:
import sys
import pandas as pd
from tqdm import tqdm
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import warnings
from bertopic import BERTopic
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
from umap import UMAP
import tomotopy as tp
import re
import time
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')

2024-11-19 14:49:15.591738: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class CustomTokenizer:
    """Callable tokenizer: lower-case, strip non-letters, drop stopwords, stem.

    Instances are passed as the ``tokenizer`` of a ``CountVectorizer``.

    Args:
        tagger: object exposing ``stem(token) -> str`` (e.g. a ``PorterStemmer``).
        stop_words: optional container of tokens to discard. When omitted, the
            module-level ``stop_words`` is looked up at call time, which keeps
            the original behaviour for existing ``CustomTokenizer(tagger)`` callers.
    """

    def __init__(self, tagger, stop_words=None):
        self.tagger = tagger
        self.stop_words = stop_words

    def __call__(self, text):
        """Return the list of stemmed tokens extracted from ``text``."""
        # Resolve the stopword container lazily so the notebook-global fallback
        # only has to exist by the time the tokenizer is actually used.
        active_stop_words = self.stop_words if self.stop_words is not None else stop_words

        # Characters other than ASCII letters/whitespace are deleted (not
        # replaced by a space), so e.g. "don't" becomes "dont".
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        # Tokenize on whitespace; keep tokens longer than 2 chars that are not
        # stopwords (checked before stemming), then stem what remains.
        return [
            self.tagger.stem(token)
            for token in text.split()
            if token not in active_stop_words and len(token) > 2
        ]

In [None]:
# load the stopword list (one entry per line, surrounding whitespace stripped)
stop_words_file = '../Datasets/stopwords_en.txt'
# Use a context manager so the file handle is closed as soon as it is read.
with open(stop_words_file, encoding="utf-8") as stop_words_handle:
    stop_words = {line.strip() for line in stop_words_handle}

def remove_stopwords(words, stop_words):
    """Return a new list with the items of ``words`` not found in ``stop_words``.

    Order and duplicates of the surviving items are preserved.
    """
    kept = []
    for candidate in words:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
# enter the dataset name
dataset_name = 'arxiv'
# Read the feather file for the chosen dataset.
data = pd.read_feather(f'../Datasets/{dataset_name}/{dataset_name}.ftr')
# Documents column as a plain list (input to the BERTopic training cell).
docs = data.document.to_list()
# Per-document word sequences with stopwords filtered out (input to the hLDA training cell).
texts = [remove_stopwords(tokens, stop_words) for tokens in data.words]
# gensim vocabulary and bag-of-words representation built from the filtered words.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# train BERTopic
def train_bertopic_model(docs):
    """Fit a BERTopic model on ``docs`` and save it under ``../models/<dataset_name>/``.

    Reads the module-level ``dataset_name`` to build the output path.

    Args:
        docs: documents handed unchanged to ``BERTopic.fit_transform``.

    Returns:
        The fitted ``BERTopic`` model.
    """
    save_path = f'../models/{dataset_name}/bertopic'
    # Create the output directory before the (expensive) fit so a missing
    # folder cannot make the final save fail and throw the training away.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Fixed random_state so the UMAP reduction is repeatable between runs.
    umap_model = UMAP(n_neighbors=15, n_components=5,
                      min_dist=0.0, metric='cosine', random_state=33)
    # Topic words are extracted with the stemming/stopword-filtering tokenizer.
    custom_tokenizer = CustomTokenizer(PorterStemmer())
    vectorizer = CountVectorizer(tokenizer=custom_tokenizer)
    model = BERTopic(
        top_n_words=30,
        umap_model=umap_model,
        vectorizer_model=vectorizer,
    )
    # The per-document topics/probabilities returned here are not needed.
    model.fit_transform(docs)
    model.save(save_path)
    return model

In [None]:
# train hLDA
def train_hlda_model(texts):
    """Train a 3-level tomotopy hLDA model on ``texts`` and save it to disk.

    Reads the module-level ``dataset_name`` to build the output path
    ``../models/<dataset_name>/hlda.bin``.

    Args:
        texts: iterable of documents, each an iterable of word tokens.

    Returns:
        The trained ``tp.HLDAModel`` (previously nothing was returned, so the
        caller's ``hlda_model`` variable ended up as ``None``).
    """
    save_path = f'../models/{dataset_name}/hlda.bin'
    # Create the output directory before training so a missing folder cannot
    # make the final save fail after the long run.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    cps = tp.utils.Corpus()
    for words in texts:
        # Hand tomotopy a plain list copy of each document's tokens.
        cps.add_doc(list(words))

    mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=3, corpus=cps)
    # Zero-iteration call made before the statistics below are printed
    # (presumably finalises the vocabulary/min_df filtering - confirm in tomotopy docs).
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # Main phase: 100 rounds of 10 iterations each, 7 regular followed by
    # 3 with freeze_topics=True.
    for _ in range(0, 1000, 10):
        mdl.train(7)
        mdl.train(3, freeze_topics=True)
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    # Final phase: 10 rounds of 10 iterations, all with freeze_topics=True.
    for _ in range(0, 100, 10):
        mdl.train(10, freeze_topics=True)
        print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))

    mdl.summary(topic_word_top_n=20)
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)
    return mdl

In [None]:
# Time the BERTopic training run. perf_counter() is monotonic, so the measured
# duration is not distorted by system clock adjustments during a long run.
start_time = time.perf_counter()
bertopic_model = train_bertopic_model(docs)
end_time = time.perf_counter()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))

In [None]:
# Time the hLDA training run. perf_counter() is monotonic, so the measured
# duration is not distorted by system clock adjustments during a long run.
start_time = time.perf_counter()
hlda_model = train_hlda_model(texts)
end_time = time.perf_counter()
execution_time = end_time - start_time
print("Execution Time: {:.4f} seconds".format(execution_time))
