In [1]:
import os
# Set in its own cell so it is in the environment before the heavy imports in
# the next cell run. NOTE(review): presumably this is meant to disable the
# HuggingFace tokenizers thread pool / silence its fork warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import pickle

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.dimensionality import BaseDimensionalityReduction

from sklearn.feature_extraction.text import CountVectorizer
from cuml.manifold import UMAP
from cuml.decomposition import PCA, TruncatedSVD 
import random
import time
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import KeyedVectors, Word2Vec, Doc2Vec

import torch
from itertools import combinations

import numpy as np
import fasttext

from cuml.cluster import HDBSCAN, AgglomerativeClustering, DBSCAN, KMeans
from sklearn.cluster import SpectralClustering

from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

import IPython.display as ipd
import gc
import warnings
warnings.filterwarnings(action='ignore')

2025-05-10 15:59:50.244376: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 15:59:50.257008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746871190.271213   81381 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746871190.274986   81381 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 15:59:50.288672: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# Sanity check: the 'st' embedding path in topic_modeling requests device='cuda',
# so confirm PyTorch can see a CUDA device before running anything expensive.
print(torch.cuda.is_available())

True


## Data Prep

In [4]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this project / a trusted source.

# Corpus to be topic-modeled. Each entry is later tokenized with str.split(),
# so entries are presumably plain strings - TODO confirm against the pickling code.
with open('./pickled/banfake_string.pickle', 'rb') as f:
    docs = pickle.load(f)

# Stop-word collection handed to CountVectorizer(stop_words=...) in topic_modeling.
with open('./pickled/bn_stopwords.pickle', 'rb') as f:
    bn_stopwords = pickle.load(f)

# Target number of topics; used as the default `nr_topics` of topic_modeling.
num_topics = 12

In [5]:
# Shuffle with a dedicated, seeded generator so that Restart Kernel -> Run All
# always produces the same document order (the original unseeded shuffle gave a
# different order, and therefore different downstream results, on every run).
# The shuffle is still in place: re-running only this cell shuffles the
# already-shuffled list again.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(docs)
len(docs)

48678

## Functions

In [6]:
def bengali_tokenizer(text):
    """Tokenize ``text`` by splitting on runs of whitespace.

    Args:
        text: the string to tokenize.

    Returns:
        list of whitespace-delimited tokens; an empty list when ``text`` is
        empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens

In [7]:
class GensimWord2VecWrapper:
    """Document embedder that averages the word vectors of a gensim-style model.

    The wrapped ``model`` must expose:
      * ``model.wv`` supporting ``word in model.wv`` and ``model.wv[word]``
      * ``model.vector_size`` (dimensionality of the word vectors)
    """

    def __init__(self, model):
        """Keep a reference to the loaded word-vector model."""
        self.model = model

    def transform(self, documents):
        """Embed every document as the mean of its in-vocabulary word vectors.

        Documents are tokenized with ``str.split()``. A document with no
        in-vocabulary token (including the empty string) maps to an all-zero
        vector.

        Args:
            documents: iterable of strings.

        Returns:
            np.ndarray of shape ``(n_documents, vector_size)``; shape
            ``(0, vector_size)`` when ``documents`` is empty.
        """
        word_vectors = self.model.wv
        dim = self.model.vector_size

        embeddings = []
        for doc in documents:
            vectors = [word_vectors[word] for word in doc.split() if word in word_vectors]
            if vectors:
                embeddings.append(np.mean(vectors, axis=0))
            else:
                # Decide on the fallback *before* averaging: np.mean of an empty
                # list returns NaN (size 1), so a post-hoc ``size == 0`` check
                # never fires and the NaN row breaks the final vstack.
                # float32 is assumed to match gensim's vector dtype.
                embeddings.append(np.zeros(dim, dtype=np.float32))

        if not embeddings:
            # np.vstack raises on an empty sequence.
            return np.empty((0, dim), dtype=np.float32)
        return np.vstack(embeddings)

In [8]:
def topic_modeling(docs, embedding, model_path, dim_reduc, clustering, nr_topics=num_topics):
    """Build and fit a BERTopic pipeline from short configuration keys.

    Args:
        docs: documents to model; passed unchanged to ``BERTopic.fit_transform``.
        embedding: embedding backend key -
            'w2v' (gensim Word2Vec, word vectors averaged per document),
            'd2v' (gensim Doc2Vec, its word vectors averaged per document),
            'glove' (word2vec-format text file without header, loaded as KeyedVectors),
            'ft' (fastText model file),
            'st' (SentenceTransformer on CUDA).
        model_path: path or model name handed to the selected loader.
        dim_reduc: 'umap', 'pca' or 'svd'; any other value selects BERTopic's
            ``BaseDimensionalityReduction()``.
        clustering: 'km', 'agg', 'db', 'hdb' or 'spec'.
        nr_topics: number of clusters for 'km' / 'agg' / 'spec', and the
            ``nr_topics`` reduction target given to BERTopic.

    Returns:
        The fitted ``BERTopic`` instance.

    Raises:
        ValueError: if ``embedding`` or ``clustering`` is not a supported key.
    """
    # Fail fast, before any model is loaded. Previously an unknown key left the
    # corresponding variable unassigned and surfaced as an UnboundLocalError.
    supported_embeddings = ('w2v', 'd2v', 'glove', 'ft', 'st')
    supported_clusterings = ('km', 'agg', 'db', 'hdb', 'spec')
    if embedding not in supported_embeddings:
        raise ValueError(
            f"Unknown embedding {embedding!r}; expected one of {supported_embeddings}"
        )
    if clustering not in supported_clusterings:
        raise ValueError(
            f"Unknown clustering {clustering!r}; expected one of {supported_clusterings}"
        )

    vectorizer_model = CountVectorizer(
        stop_words=bn_stopwords,
        ngram_range=(1, 1),
        tokenizer=bengali_tokenizer
    )

    representation_model = KeyBERTInspired()

    # Embedding Model
    # NOTE(review): confirm that BERTopic's backend selection recognizes the
    # GensimWord2VecWrapper and fastText objects rather than substituting a
    # default embedding model.
    if embedding == 'w2v':
        embedding_model = GensimWord2VecWrapper(Word2Vec.load(model_path))
    elif embedding == 'd2v':
        embedding_model = GensimWord2VecWrapper(Doc2Vec.load(model_path))
    elif embedding == 'glove':
        embedding_model = KeyedVectors.load_word2vec_format(model_path, binary=False, no_header=True)
    elif embedding == 'ft':
        embedding_model = fasttext.load_model(model_path)
    else:  # 'st'
        embedding_model = SentenceTransformer(model_path, device='cuda')

    # Dimensionality Reduction (passed to BERTopic through its `umap_model` slot)
    if dim_reduc == 'umap':
        umap_model = UMAP(n_components=10, metric='cosine')
    elif dim_reduc == 'pca':
        umap_model = PCA(n_components=128)
    elif dim_reduc == 'svd':
        umap_model = TruncatedSVD(n_components=100)
    else:
        umap_model = BaseDimensionalityReduction()

    # Clustering Models (passed to BERTopic through its `hdbscan_model` slot)
    if clustering == 'km':
        # cuML's KMeans has no `metric` parameter; passing metric='cosine'
        # raised a TypeError, so it is constructed with its own distance.
        clustering_model = KMeans(n_clusters=nr_topics)
    elif clustering == 'agg':
        clustering_model = AgglomerativeClustering(n_clusters=nr_topics, metric='cosine')
    elif clustering == 'db':
        clustering_model = DBSCAN()
    elif clustering == 'hdb':
        clustering_model = HDBSCAN()
    else:  # 'spec'
        clustering_model = SpectralClustering(n_clusters=nr_topics, eigen_solver='amg')

    topic_model = BERTopic(
        embedding_model=embedding_model,
        hdbscan_model=clustering_model,
        umap_model=umap_model,
        representation_model=representation_model,
        vectorizer_model=vectorizer_model,
        nr_topics=nr_topics,
        verbose=True
    )

    # Only the fitted model is returned; the per-document topics/probabilities
    # returned here are not needed by the caller.
    topic_model.fit_transform(docs)

    return topic_model

In [9]:
def evaluate_topic_words(topic_words, docs):
    """Score topic word lists for coherence and diversity.

    Args:
        topic_words: list of topics, each a list of word strings.
        docs: documents used as the reference corpus; each is tokenized with
            ``bengali_tokenizer`` and documents yielding no tokens are dropped.

    Returns:
        dict with keys 'topic_diversity' and 'IRBO', plus 'coherence_c_v' and
        'coherence_npmi' when the tokenized corpus and its dictionary are
        both non-empty.
    """
    # Reference corpus for the coherence measures (empty documents removed).
    texts = [tokens for tokens in map(bengali_tokenizer, docs) if len(tokens) > 0]
    dictionary = Dictionary(texts)

    results = {}

    if texts and dictionary:
        coherence_measures = (
            ('coherence_c_v', 'c_v'),
            ('coherence_npmi', 'c_npmi'),
        )
        for result_key, measure in coherence_measures:
            scorer = CoherenceModel(
                topics=topic_words,
                texts=texts,
                dictionary=dictionary,
                coherence=measure,
            )
            results[result_key] = scorer.get_coherence()

    # OCTIS metrics read the topics from a model-output style dict.
    model_output = {'topics': topic_words}
    diversity_metric = TopicDiversity(topk=10)
    rbo_metric = InvertedRBO(topk=10)

    results['topic_diversity'] = diversity_metric.score(model_output)
    results['IRBO'] = rbo_metric.score(model_output)

    return results

In [10]:
# Free memory before the expensive fit: run Python's garbage collector, then
# ask PyTorch to release its cached, currently unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [11]:
# Time the full pipeline (embedding + reduction + clustering + representation).
# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the result cannot be skewed by system clock adjustments during the run.
start_time = time.perf_counter()

topic_model = topic_modeling(docs=docs,
                             embedding='st',
                             model_path="shihab17/bangla-sentence-transformer",
                             dim_reduc='umap',
                             clustering='hdb')

end_time = time.perf_counter()
runtime = end_time - start_time  # elapsed seconds

2025-05-10 16:00:07,411 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1522 [00:00<?, ?it/s]

2025-05-10 16:05:50,419 - BERTopic - Embedding - Completed ✓
2025-05-10 16:05:50,420 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-10 16:05:53,157 - BERTopic - Dimensionality - Completed ✓
2025-05-10 16:05:53,159 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-10 16:05:54,149 - BERTopic - Cluster - Completed ✓
2025-05-10 16:05:54,150 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-10 16:05:58,663 - BERTopic - Representation - Completed ✓
2025-05-10 16:05:58,670 - BERTopic - Topic reduction - Reducing number of topics
2025-05-10 16:05:58,752 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-10 16:06:04,697 - BERTopic - Representation - Completed ✓
2025-05-10 16:06:04,714 - BERTopic - Topic reduction - Reduced number of topics from 766 to 12


In [12]:
df = topic_model.get_topic_info()

# One word list per row of the topic-info table.
# NOTE(review): with a density-based clusterer the table may include the
# outlier topic (-1) as a row - confirm whether it should be evaluated.
topic_words = df['Representation'].to_list()

# enumerate() gives every row its own position. list.index() rescanned the list
# on each iteration and returned the first match, so two topics with identical
# word lists would both have been printed with the same index.
for position, words in enumerate(topic_words):
    print(f'{position} : {words}')

0 : ['বিএনপি', 'বিএনপির', 'নির্বাচনে', 'নির্বাচন', 'রানে', 'বাংলাদেশের', 'আওয়ামী', 'লীগের', 'বাংলাদেশ', 'রান']
1 : ['বিএনপি', 'বিএনপির', 'নির্বাচন', 'চৌধুরী', 'আওয়ামী', 'জিয়ার', 'রহমান', 'শেখ', 'ম্যাচে', 'রান']
2 : ['ঘূর্ণিঝড়', 'ঘূর্ণিঝড়ে', 'ঘূর্ণিঝড়টি', 'ভূমিকম্পের', 'ভূমিকম্পে', 'নিম্নচাপ', 'ঘূর্ণিঝড়ের', 'বৃষ্টিপাত', 'ভূমিকম্প', 'সমুদ্রবন্দর']
3 : ['ব্যাংকিং', 'আর্থিক', 'ব্যাংকের', 'ব্যাংক', 'অর্থনৈতিক', 'দুর্নীতি', 'টাকা', 'অর্থ', 'দাম', 'বিনিয়োগ']
4 : ['চিনি', 'চামচ', 'মরিচ', 'খাবার', 'খাওয়া', 'হলুদ', 'রস', 'পেঁয়াজ', 'খাওয়ার', 'ডিম']
5 : ['স্মার্টফোন', 'আইফোন', 'অ্যাপ', 'মোবাইল', 'প্রযুক্তি', 'ল্যাপটপ', 'ইন্টারনেট', 'অ্যান্ড্রয়েড', 'অনলাইন', 'ব্যবহারকারীদের']
6 : ['বিজ্ঞানীরা', 'বিজ্ঞানী', 'বিজ্ঞানীর', 'বিজয়ীদের', 'পদার্থবিজ্ঞানী', 'বিজয়ী', 'পুরস্কার', 'পুরস্কারের', 'রসায়ন', 'পদার্থবিদ্যায়']
7 : ['কানাডাকে', 'কানাডাতে', 'কানাডার', 'কানাডা', 'মানবতাবিরোধী', 'রোহিঙ্গা', 'প্রধানমন্ত্রী', 'কমন্স', 'গণহত্যার', 'চিকে']
8 : ['বাবা', 'মোহাম্মদ', 'মুহাম্মদ', 'জনপ্রিয়', 'সন্

In [13]:
# Release memory after fitting: run Python's garbage collector, then ask
# PyTorch to release its cached, currently unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [14]:
topic_words

[['বিএনপি',
  'বিএনপির',
  'নির্বাচনে',
  'নির্বাচন',
  'রানে',
  'বাংলাদেশের',
  'আওয়ামী',
  'লীগের',
  'বাংলাদেশ',
  'রান'],
 ['বিএনপি',
  'বিএনপির',
  'নির্বাচন',
  'চৌধুরী',
  'আওয়ামী',
  'জিয়ার',
  'রহমান',
  'শেখ',
  'ম্যাচে',
  'রান'],
 ['ঘূর্ণিঝড়',
  'ঘূর্ণিঝড়ে',
  'ঘূর্ণিঝড়টি',
  'ভূমিকম্পের',
  'ভূমিকম্পে',
  'নিম্নচাপ',
  'ঘূর্ণিঝড়ের',
  'বৃষ্টিপাত',
  'ভূমিকম্প',
  'সমুদ্রবন্দর'],
 ['ব্যাংকিং',
  'আর্থিক',
  'ব্যাংকের',
  'ব্যাংক',
  'অর্থনৈতিক',
  'দুর্নীতি',
  'টাকা',
  'অর্থ',
  'দাম',
  'বিনিয়োগ'],
 ['চিনি',
  'চামচ',
  'মরিচ',
  'খাবার',
  'খাওয়া',
  'হলুদ',
  'রস',
  'পেঁয়াজ',
  'খাওয়ার',
  'ডিম'],
 ['স্মার্টফোন',
  'আইফোন',
  'অ্যাপ',
  'মোবাইল',
  'প্রযুক্তি',
  'ল্যাপটপ',
  'ইন্টারনেট',
  'অ্যান্ড্রয়েড',
  'অনলাইন',
  'ব্যবহারকারীদের'],
 ['বিজ্ঞানীরা',
  'বিজ্ঞানী',
  'বিজ্ঞানীর',
  'বিজয়ীদের',
  'পদার্থবিজ্ঞানী',
  'বিজয়ী',
  'পুরস্কার',
  'পুরস্কারের',
  'রসায়ন',
  'পদার্থবিদ্যায়'],
 ['কানাডাকে',
  'কানাডাতে',
  'কানাডার',
  'কানাডা',
  'মানবতাবির

In [15]:
runtime

362.98661160469055

In [16]:
# results = evaluate_topic_words(topic_words, docs)
# results['runtime'] = runtime
# results

In [17]:
# Synthesize a 2-second, 440 Hz sine tone and autoplay it in the notebook
# (presumably an audible "long run finished" notification).
# duration is in seconds, sampling_rate in samples per second, frequency in Hz.
duration = 2  
sampling_rate = 44100
frequency = 440.0 
# Sample times; the final positional False is linspace's `endpoint` argument,
# so the sample at t == duration is excluded.
t = np.linspace(0, duration, int(sampling_rate * duration), False)
# Sine wave at half amplitude.
wave = 0.5 * np.sin(2 * np.pi * frequency * t)

audio = ipd.Audio(wave, rate=sampling_rate, autoplay=True)
display(audio)

In [18]:
#topic_model.visualize_topics()

In [19]:
#topic_model.visualize_documents(docs)

In [20]:
#topic_model.visualize_barchart()

In [21]:
#topic_model.visualize_heatmap()