In [1]:
#!conda install -c rapidsai -c nvidia -c conda-forge cuml=24.04

In [2]:
#!pip install pyamg

In [3]:
import os
# Set in its own cell so it takes effect before the heavy imports in the next cell run.
# NOTE(review): presumably silences the Hugging Face `tokenizers` parallelism/fork
# warning triggered via sentence-transformers — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import pickle
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import random
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

from sklearn.cluster import SpectralClustering
import time
import warnings
warnings.filterwarnings(action='ignore')

from scipy.sparse import coo_matrix
from cuml.manifold import UMAP as cuUMAP
from cuml.decomposition import PCA as cuPCA, TruncatedSVD as cuTruncatedSVD
from cuml.neighbors import NearestNeighbors as cuNearestNeighbors

import cupy as cp
import numpy as np
import torch
import gc

from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO
import IPython.display as ipd

2025-05-10 01:36:14.814561: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 01:36:15.071744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746819375.177699  335077 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746819375.212401  335077 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 01:36:15.459588: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [5]:
# Sanity check: topic_modeling() below hard-codes device='cuda' and uses cuML/CuPy,
# so this should print True before continuing.
print(torch.cuda.is_available())

True


## Data Prep

In [6]:
# Corpus to model. Each entry is later passed to SentenceTransformer.encode and
# split on whitespace, so it is presumably a list of pre-cleaned strings — TODO confirm.
# SECURITY: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted step of this project.
with open('./pickled/banfake_string.pickle', 'rb') as f:
    docs = pickle.load(f)

# Stop-word collection handed to CountVectorizer(stop_words=...) in topic_modeling().
with open('./pickled/bn_stopwords.pickle', 'rb') as f:
    bn_stopwords = pickle.load(f)

# Default for topic_modeling(nr_topics=...): used both as the number of spectral
# clusters and as BERTopic's nr_topics.
num_topics = 12

In [7]:
# Shuffle the corpus in place with a dedicated, seeded generator so that a
# Restart & Run All visits the documents in the same order every time (the
# original unseeded shuffle made every run start from a different ordering).
# A private Random instance is used so the global `random` state is untouched.
random.Random(42).shuffle(docs)

In [8]:
#docs= docs[:5000]

## Functions

In [9]:
def bengali_tokenizer(text):
    """Tokenize ``text`` by splitting on runs of whitespace.

    Leading/trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

In [10]:
def _make_reducer(dim_reduc):
    """Return the cuML reducer selected by ``dim_reduc``, or ``None``.

    Recognised values are ``'umap'``, ``'pca'`` and ``'svd'``. Any other value
    (including ``None``) yields ``None``, meaning the kNN graph is built on the
    raw embeddings.
    """
    if dim_reduc == 'umap':
        # Left unseeded, as in the original: runs are not bit-reproducible.
        return cuUMAP(n_components=10, metric='cosine')
    if dim_reduc == 'pca':
        return cuPCA(n_components=128)
    if dim_reduc == 'svd':
        return cuTruncatedSVD(n_components=100)
    return None


def _knn_affinity(indices, distances, n_graph_neighbors):
    """Build a symmetric sparse affinity matrix from k-nearest-neighbour results.

    Parameters
    ----------
    indices, distances : numpy.ndarray of shape (n_samples, k)
        Neighbour ids and cosine distances as returned by ``kneighbors``.
        Column 0 is skipped; it is assumed to be the query point itself.
    n_graph_neighbors : int
        How many neighbours per point (columns 1..n_graph_neighbors) become
        graph edges. Fewer are used if ``k`` is smaller.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_samples, n_samples)
        Each edge (i, j) is inserted in both directions with weight
        ``1 - distance``; duplicate entries are summed by the COO->CSR
        conversion, so mutual neighbours end up with a doubled weight.
    """
    n_samples = indices.shape[0]
    neighbor_ids = indices[:, 1:n_graph_neighbors + 1]
    # Cosine distance may exceed 1, which would give a negative similarity;
    # clip at 0 so the affinity matrix stays non-negative for spectral clustering.
    sims = np.clip(1 - distances[:, 1:n_graph_neighbors + 1], 0, None)

    src = np.repeat(np.arange(n_samples), neighbor_ids.shape[1])
    dst = neighbor_ids.ravel()
    weights = sims.ravel()

    rows = np.concatenate([src, dst])
    cols = np.concatenate([dst, src])
    data = np.concatenate([weights, weights])
    return coo_matrix((data, (rows, cols)), shape=(n_samples, n_samples)).tocsr()


def topic_modeling(docs, model_path, dim_reduc=None, nr_topics=num_topics,
                   n_neighbors=15, n_graph_neighbors=9):
    """Fit a BERTopic model guided by spectral-clustering labels.

    Pipeline: embed ``docs`` on the GPU, optionally reduce the embeddings with
    a cuML reducer, build a cosine kNN graph, run sklearn SpectralClustering on
    that graph, and fit BERTopic with the resulting labels passed as ``y``
    (BERTopic's target for (semi-)supervised modeling).

    Parameters
    ----------
    docs : list of str
        Documents to model.
    model_path : str
        Name or path of the SentenceTransformer model.
    dim_reduc : {'umap', 'pca', 'svd'} or None
        Reducer applied before the kNN graph is built; see ``_make_reducer``.
    nr_topics : int
        Number of spectral clusters, also forwarded to ``BERTopic(nr_topics=...)``.
    n_neighbors : int, default 15
        Neighbours queried per point (capped at the number of documents).
    n_graph_neighbors : int, default 9
        Neighbours per point actually used as graph edges (previously the
        hard-coded ``range(1, 10)``).

    Returns
    -------
    BERTopic
        The fitted topic model.
    """
    vectorizer_model = CountVectorizer(
        stop_words=bn_stopwords,
        ngram_range=(1, 1),
        tokenizer=bengali_tokenizer
    )
    representation_model = KeyBERTInspired()

    # Generate embeddings on GPU; keep a float32 host copy for BERTopic.
    embedder = SentenceTransformer(model_path, device='cuda')
    embeddings = embedder.encode(docs, device='cuda').astype(np.float32)
    gdf_data = cp.asarray(embeddings)

    reducer = _make_reducer(dim_reduc)
    gdf_reduced = reducer.fit_transform(gdf_data) if reducer is not None else gdf_data

    # Cosine kNN on the (possibly reduced) embeddings.
    n_samples = embeddings.shape[0]
    nn = cuNearestNeighbors(n_neighbors=min(n_neighbors, n_samples), metric="cosine")
    nn.fit(gdf_reduced)
    distances, indices = nn.kneighbors(gdf_reduced)
    indices = indices.get()
    distances = distances.get()

    # Clean up GPU memory before the CPU-side clustering. The CuPy arrays live
    # in CuPy's own memory pool, which torch.cuda.empty_cache() does not touch,
    # so release both allocators.
    del gdf_data, gdf_reduced, nn
    gc.collect()
    cp.get_default_memory_pool().free_all_blocks()
    torch.cuda.empty_cache()

    affinity = _knn_affinity(indices, distances, n_graph_neighbors)

    # Clean up
    del indices, distances
    gc.collect()

    clustering = SpectralClustering(
        n_clusters=nr_topics,
        eigen_solver='amg',
        affinity='precomputed',
        random_state=42
    ).fit(affinity)
    labels = clustering.labels_

    # NOTE(review): when `reducer` is None, BERTopic presumably falls back to
    # its own default UMAP rather than skipping reduction — confirm. The
    # notebook imports BaseDimensionalityReduction but never uses it.
    topic_model = BERTopic(
        embedding_model=embedder,
        umap_model=reducer,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        nr_topics=nr_topics,
        verbose=True
    )

    # `embeddings` is already a host NumPy array (see the astype above).
    topic_model.fit_transform(docs, embeddings=embeddings, y=labels)

    return topic_model

In [11]:
def evaluate_topic_words(topic_words, docs):
    """Score topic keyword lists for coherence and diversity.

    Parameters
    ----------
    topic_words : list of list of str
        One keyword list per topic.
    docs : iterable of str
        Reference corpus; tokenized with ``bengali_tokenizer``.

    Returns
    -------
    dict
        ``'coherence_c_v'`` and ``'coherence_npmi'`` (only when the tokenized
        corpus and its dictionary are non-empty), plus ``'topic_diversity'``
        and ``'IRBO'`` computed over the top 10 words of each topic.
    """
    # Drop documents that tokenize to nothing.
    texts = [tokens for tokens in map(bengali_tokenizer, docs) if len(tokens) > 0]
    dictionary = Dictionary(texts)

    results = {}

    if texts and dictionary:
        for key, measure in (('coherence_c_v', 'c_v'), ('coherence_npmi', 'c_npmi')):
            model = CoherenceModel(
                topics=topic_words,
                texts=texts,
                dictionary=dictionary,
                coherence=measure,
            )
            results[key] = model.get_coherence()

    # OCTIS metrics take the model output as a dict with a 'topics' entry.
    octis = {'topics': topic_words}
    results['topic_diversity'] = TopicDiversity(topk=10).score(octis)
    results['IRBO'] = InvertedRBO(topk=10).score(octis)

    return results

In [None]:
# Fit the model and record the wall-clock time of the whole pipeline
# (embedding, reduction, kNN graph, spectral clustering, BERTopic fit).
start_time = time.time()
topic_model = topic_modeling(
    docs=docs,
    model_path="shihab17/bangla-sentence-transformer",
    dim_reduc='umap',
)
end_time = time.time()

runtime = end_time - start_time

In [None]:
# One row per topic; the 'Representation' column holds each topic's keyword list.
df = topic_model.get_topic_info()
topic_words = df['Representation'].tolist()

In [None]:
# Print each topic's keyword list next to its row position in `topic_words`.
# enumerate() replaces topic_words.index(words), which rescans the list on
# every iteration and returns the FIRST match, so a topic whose keyword list
# equals an earlier one would be printed with the wrong number.
for position, words in enumerate(topic_words):
    print(f'{position} : {words}')

In [None]:
# Free memory before the CPU-heavy evaluation below.
# Collect Python garbage first so objects that are no longer referenced are
# dropped, then ask PyTorch to release its cached, unused GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Coherence/diversity scores for the fitted topics, with the fit runtime
# (seconds) attached as an extra entry; the dict is the cell's output.
results = {**evaluate_topic_words(topic_words, docs), 'runtime': runtime}
results

In [None]:
#topic_model.visualize_topics()

In [None]:
#topic_model.visualize_documents(docs)

In [None]:
#topic_model.visualize_barchart()

In [None]:
#topic_model.visualize_heatmap()

In [None]:
###########################################################################################################################################

In [None]:
# Audible "run finished" signal: a 2-second 440 Hz sine tone that autoplays
# when this cell executes.
sampling_rate = 44100  # samples per second
duration = 2           # seconds
frequency = 440.0      # Hz

# Sample times in [0, duration), endpoint excluded.
t = np.linspace(0, duration, int(sampling_rate * duration), endpoint=False)
wave = 0.5 * np.sin(2 * np.pi * frequency * t)

audio = ipd.Audio(wave, rate=sampling_rate, autoplay=True)
display(audio)