In [1]:
#!pip install contextualized-topic-models

In [2]:
import os
# Set in its own cell so the variable is already in the environment when the
# modelling libraries are imported in the next cell.
# NOTE(review): presumably this silences the HuggingFace tokenizers
# fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import pickle
import random
import string
import time
import warnings
from collections import Counter
from itertools import combinations

import contextualized_topic_models
import IPython.display as ipd
import numpy as np
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore")

2025-05-11 22:31:44.068693: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-11 22:31:44.322397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746981104.436421   24187 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746981104.472607   24187 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-11 22:31:44.716549: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [4]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by a trusted step of this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs are loaded in the same order as before: stopwords, tokenized corpus,
# raw (string) corpus.
bangla_stopwords = _read_pickle('./pickled/bn_stopwords.pickle')
tokenized_docs = _read_pickle('./pickled/dhk_tribune_tokenized.pickle')
raw_docs = _read_pickle('./pickled/dhk_tribune_string.pickle')

# Number of topics requested from every model fitted below.
num_topics = 9

In [5]:
def preprocess_documents(tokenized_docs, raw_docs, stopwords_list=None, vocabulary_size=10000, max_df=1.0, min_words=1):
    """
    Preprocess a tokenized dataset without breaking Bangla words.

    Tokens are used exactly as given in ``tokenized_docs``; the function never
    re-tokenizes text, it only filters tokens and joins the survivors with
    single spaces.

    :param tokenized_docs: List of tokenized documents (list of lists of str)
    :param raw_docs: List of unprocessed documents (list of strings); must be
        aligned index-for-index with ``tokenized_docs``
    :param stopwords_list: Iterable of stopwords to remove (None or empty
        removes nothing)
    :param vocabulary_size: Number of most frequent words to keep
    :param max_df: Ignore terms appearing in more than this fraction of the
        documents; the default 1.0 disables the filter
    :param min_words: Minimum number of in-vocabulary words a document needs
        in order to be kept
    :return: (preprocessed_docs, retained_unprocessed_docs, vocabulary, retained_indices)
        where ``preprocessed_docs`` are space-joined strings,
        ``retained_unprocessed_docs`` are the matching entries of ``raw_docs``,
        ``vocabulary`` is ordered from most to least frequent word, and
        ``retained_indices`` are positions in the original input lists.
    """

    stopwords = set(stopwords_list) if stopwords_list else set()

    # Drop empty tokens and stopwords; document positions stay unchanged so the
    # indices below still line up with raw_docs.
    filtered_docs = [
        [token for token in doc if token and token not in stopwords]
        for doc in tokenized_docs
    ]

    word_freq = Counter(word for doc in filtered_docs for word in doc)

    # Document-frequency ceiling: remove words that occur in too large a share
    # of the documents before the vocabulary is selected.
    if max_df < 1.0 and filtered_docs:
        doc_freq = Counter(word for doc in filtered_docs for word in set(doc))
        max_doc_count = max_df * len(filtered_docs)
        for word, count in doc_freq.items():
            if count > max_doc_count:
                del word_freq[word]

    # most_common() yields a stable, frequency-ordered list, so the returned
    # vocabulary is reproducible across runs (a plain set is not).
    vocabulary_list = [word for word, _ in word_freq.most_common(vocabulary_size)]
    vocabulary = set(vocabulary_list)

    preprocessed_docs, retained_unprocessed_docs, retained_indices = [], [], []
    for i, doc in enumerate(filtered_docs):
        joined = " ".join(word for word in doc if word in vocabulary)
        # Documents left with too few words are dropped from every output list.
        if len(joined.split()) >= min_words:
            preprocessed_docs.append(joined)
            retained_unprocessed_docs.append(raw_docs[i])
            retained_indices.append(i)

    return preprocessed_docs, retained_unprocessed_docs, vocabulary_list, retained_indices

In [6]:
def evaluate_topic_words(topic_words, texts, topk=10):
    """
    Score a set of topics with gensim coherence and OCTIS diversity metrics.

    :param topic_words: List of topics, each a list of words
    :param texts: Tokenized reference corpus (list of lists of tokens) used to
        build the gensim dictionary and to estimate coherence
    :param topk: Number of top words per topic passed to the OCTIS diversity
        metrics (defaults to 10, the value previously hard-coded)
    :return: dict with ``'topic_diversity'`` and ``'IRBO'``, plus
        ``'coherence_c_v'`` and ``'coherence_npmi'`` when ``texts`` and the
        dictionary built from it are non-empty
    """
    dictionary = Dictionary(texts)

    results = {}

    # Coherence is only computed when there is a corpus to estimate it from;
    # otherwise the two coherence keys are simply absent from the result.
    if texts and dictionary:
        for key, measure in (('coherence_c_v', 'c_v'), ('coherence_npmi', 'c_npmi')):
            results[key] = CoherenceModel(
                topics=topic_words,
                texts=texts,
                dictionary=dictionary,
                coherence=measure,
            ).get_coherence()

    # The OCTIS metrics take the model output as a dict with a 'topics' entry.
    octis = {
        'topics': topic_words
    }

    td = TopicDiversity(topk=topk)
    irbo = InvertedRBO(topk=topk)

    results['topic_diversity'] = td.score(octis)
    results['IRBO'] = irbo.score(octis)

    return results

In [7]:
# Clean the corpus with the Bangla stopword list; every other option keeps its
# default. The four results stay aligned with each other by position.
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = preprocess_documents(
    tokenized_docs,
    raw_docs,
    stopwords_list=bangla_stopwords,
)

In [8]:
# Build the model input: the raw documents feed the contextual side and the
# cleaned documents feed the bag-of-words side.
tp = TopicModelDataPreparation("shihab17/bangla-sentence-transformer")
# The timer starts before dataset preparation and is only stopped after
# ctm.fit in a later cell, so the CombinedTM runtime includes this step (and any
# idle time between running the cells).
start_time = time.time() 
# NOTE(review): if tp.fit re-tokenizes text_for_bow with a default word pattern,
# Bangla words may be split at combining vowel signs — inspect tp.vocab to confirm.
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/388 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
#tp.vocab[:20]

In [None]:
# CombinedTM over the prepared dataset.
# NOTE(review): contextual_size=768 presumably matches the embedding width of
# the sentence-transformer used for data preparation — confirm.
ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=num_topics,
)
ctm.fit(training_dataset)

# start_time was taken in the data preparation cell, so this runtime spans
# embedding plus model fitting.
end_time = time.time()
runtime = end_time - start_time

In [None]:
# Quick look at the CombinedTM topics, 5 words requested per topic.
ctm.get_topic_lists(5)

In [None]:
# Score the CombinedTM topics (10 words each) against the full tokenized corpus
# and attach the runtime measured in the fitting cell.
ctm_words = ctm.get_topic_lists(10)
results = evaluate_topic_words(ctm_words, tokenized_docs)
results.update({'runtime': runtime})
results

In [None]:
# ZeroShotTM is imported with the other dependencies at the top of the notebook.
# It is fitted on the same prepared dataset as CombinedTM.
ztm = ZeroShotTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=num_topics,
)

# Only the fit itself is timed here. start_time / runtime are overwritten, so
# re-running the CombinedTM evaluation cell after this one would report this
# model's runtime.
start_time = time.time()
ztm.fit(training_dataset)
end_time = time.time()

runtime = end_time - start_time

In [None]:
# Quick look at the ZeroShotTM topics, 5 words requested per topic.
ztm.get_topic_lists(5)

In [None]:
# Score the ZeroShotTM topics (10 words each) against the full tokenized corpus
# and attach the runtime measured in the fitting cell.
ztm_words = ztm.get_topic_lists(10)
results = evaluate_topic_words(ztm_words, tokenized_docs)
results.update({'runtime': runtime})
results

In [None]:
# Audible cue that the notebook has finished: a sine tone played automatically.
duration = 2           # length of the tone, in seconds
sampling_rate = 44100  # samples per second
frequency = 440.0      # tone frequency, in Hz

# Sample instants over [0, duration), end point excluded.
t = np.linspace(0, duration, num=int(sampling_rate * duration), endpoint=False)
# Sine wave scaled to half amplitude.
wave = 0.5 * np.sin(2 * np.pi * frequency * t)

audio = ipd.Audio(wave, rate=sampling_rate, autoplay=True)
ipd.display(audio)