In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import NMF
import torch
from torch_geometric.data import Data
import pickle
import torch.nn as nn
from torch_geometric.nn.norm import GraphNorm
from torch_geometric.utils import dropout_edge
import torch.nn.functional as F
from torch_geometric.nn import ClusterGCNConv
from gensim.models.coherencemodel import CoherenceModel
import random
from gensim.corpora.dictionary import Dictionary
import os
from torch_geometric.loader import ClusterData, ClusterLoader
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

In [2]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)  

    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(42)

## GloVe and Data Load

In [3]:
glove = {}
with open('./model/bn_glove.39M.300d.txt', encoding='utf8') as f:
    for L in f:
        parts = L.rstrip().split()
        glove[parts[0]] = np.array(parts[1:], dtype=float)
emb_dim = len(next(iter(glove.values())))

In [4]:
with open('./data/nctbtext_processed.pickle', 'rb') as f:
    docs = pickle.load(f)
random.shuffle(docs)

num_topics = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utilities

In [5]:
def bengali_tokenizer(text):
    return text.split()

In [6]:
def evaluate_topic_words(topic_words, texts):
    
    dictionary = Dictionary(texts)

    results = {}
    
    if texts and dictionary:
        
        coherence_c_v = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
        coherence_npmi = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_npmi').get_coherence()
        
        results['coherence_c_v'] = coherence_c_v
        results['coherence_npmi'] = coherence_npmi
    
    octis = {
        'topics': topic_words  
    }
    
    td = TopicDiversity(topk=10)
    irbo = InvertedRBO(topk=10)
    
    results['topic_diversity'] = td.score(octis)
    results['IRBO'] = irbo.score(octis)
    
    return results

In [7]:
def build_knn_graph(embeddings: np.ndarray, k: int):
    """
    embeddings: [num_nodes, 300] numpy array of text embeddings
    k: number of neighbors for k-NN
    returns: edge_index tensor for PyG
    """
    A = kneighbors_graph(embeddings, n_neighbors=k, mode='distance', metric='cosine', n_jobs=-1, include_self=True)      

    A = A.tocoo()
    edge_index = torch.tensor([A.row, A.col], dtype=torch.long)
    x = torch.tensor(embeddings, dtype=torch.float)
    
    return Data(x=x, edge_index=edge_index)

In [8]:
class ClusterGCNNet(torch.nn.Module):
    def __init__(
        self,
        in_dim: int,
        hidden_dim: int,
        out_dim: int,
        dropout: float = 0.4,
        edge_dropout: float = 0.2,      
    ):
        super().__init__()
        self.input_drop = nn.Dropout(p=dropout)            
    
        self.convs = nn.ModuleList([
            ClusterGCNConv(in_dim, hidden_dim),
            ClusterGCNConv(hidden_dim, hidden_dim),
            ClusterGCNConv(hidden_dim, hidden_dim),
            ClusterGCNConv(hidden_dim, out_dim),
        ])
    
        self.norms = nn.ModuleList([
            GraphNorm(hidden_dim),
            GraphNorm(hidden_dim),
            GraphNorm(hidden_dim),
            GraphNorm(out_dim),
        ])                                                
        self.dropout = dropout
        self.edge_dropout = edge_dropout

    def forward(self, x, edge_index):

        for i, conv in enumerate(self.convs):
        
            edge_index, _ = dropout_edge(
                edge_index, p=self.edge_dropout, training=self.training
            )                                             
 
            h = conv(x, edge_index)

            h = self.norms[i](h)

            if i != len(self.convs) - 1:          
                h = F.relu(h)
                h = F.dropout(h, p=self.dropout, training=self.training)

            if h.shape == x.shape:
                h = h + x                                            

            x = h

        return x  

In [9]:
def get_cluster_loader(data, num_clusters, batch_size):
    """
    data: PyG Data object
    num_clusters: into how many partitions to split the full graph
    batch_size: how many clusters per mini-batch
    """
    
    cluster_data = ClusterData(data, num_parts=num_clusters, keep_inter_cluster_edges=True)   
    loader = ClusterLoader(cluster_data, batch_size=batch_size, shuffle=True)
    
    return loader

In [10]:
def hinge_edge_loss(z, edge_index, num_neg_samples=5, margin=1.0):
    """
    z: [N, out_dim] node embeddings
    edge_index: [2, E] positive edges in this mini-batch
    For each positive edge (i,j), sample `num_neg_samples` negative j'.
    """
    
    src, dst = edge_index     

    pos_score = F.cosine_similarity(z[src], z[dst], dim=1)

    E = dst.size(0)
    neg_dst = dst[torch.randint(0, E, (E * num_neg_samples,))]

    neg_src = src.repeat_interleave(num_neg_samples)   

    neg_score = F.cosine_similarity(z[neg_src], z[neg_dst], dim=1)

    loss = torch.relu(margin - pos_score.repeat(num_neg_samples) + neg_score)

    return loss.mean()


def contrastive_loss(z, temperature=0.5):
    """
    z: [N_b, d] embeddings for the batch
    Returns NT-Xent loss that pulls each embedding towards itself under 
    two random augmentations (here we simply treat each node once).
    """

    z_norm = F.normalize(z, p=2, dim=1)                         

    sim = torch.matmul(z_norm, z_norm.T) / temperature         

    mask = torch.eye(sim.size(0), device=sim.device).bool()
    sim.masked_fill_(mask, -9e15)

    labels = torch.arange(sim.size(0), device=sim.device)

    return F.cross_entropy(sim, labels)                        


def joint_loss(z, edge_index):
  
    loss_hinge = hinge_edge_loss(z, edge_index)

    loss_con = contrastive_loss(z)

    return loss_hinge + loss_con

In [11]:
def train(model, loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        z = model(batch.x, batch.edge_index)
        loss = joint_loss(z,batch.edge_index)

        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch.num_nodes
    return total_loss / len(loader.dataset)

def extract_embeddings(model, data):
    model.eval()                                   
    data = data.to(device)
    with torch.no_grad():                         
        z = model(data.x, data.edge_index)  
        
    return z.cpu().numpy()

## Config

In [12]:
hidden_dim = 32
output_dim = 64

n_neighbors = 15
num_clusters = 8

epochs = 100
lr=0.005
batch_size = 32

## GHTM Pipeline

In [13]:
vectorizer = TfidfVectorizer(
    preprocessor=None,
    token_pattern=None,
    lowercase=False,
    tokenizer=bengali_tokenizer,
    ngram_range=(1,1)
)

docs_as_strings = [' '.join(doc) for doc in docs]

sparse = vectorizer.fit_transform(docs_as_strings)
terms = vectorizer.get_feature_names_out()

In [14]:
G = np.array([glove.get(t, np.zeros(emb_dim)) for t in terms])
doc_embeds = sparse.dot(G)

In [15]:
data = build_knn_graph(doc_embeds, k=n_neighbors)

loader = get_cluster_loader(data, num_clusters=num_clusters, batch_size=batch_size)
model = ClusterGCNNet(doc_embeds.shape[1], hidden_dim=hidden_dim, out_dim=output_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for epoch in range(epochs):
    loss = train(model, loader, optimizer, device)
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.2f}")

  edge_index = torch.tensor([A.row, A.col], dtype=torch.long)
Computing METIS partitioning...
Done!


Epoch 0, Loss: 9731271859529318400.00
Epoch 10, Loss: 9731271859529318400.00
Epoch 20, Loss: 9731271859529318400.00
Epoch 30, Loss: 9731271859529318400.00
Epoch 40, Loss: 9731271859529318400.00
Epoch 50, Loss: 9731271859529318400.00
Epoch 60, Loss: 9731271859529318400.00
Epoch 70, Loss: 9731271859529318400.00
Epoch 80, Loss: 9731271859529318400.00
Epoch 90, Loss: 9731271859529318400.00


In [16]:
reduced = extract_embeddings(model, data)
non_neg_embeddings = np.abs(reduced)

In [17]:
topic_words = []
nmf = NMF(n_components=num_topics, random_state=42, init='nndsvda', max_iter=500)
doc_topic = nmf.fit_transform(non_neg_embeddings)       

In [18]:
topn_words = 10
for t in range(num_topics):
    best_docs = np.argsort(doc_topic[:, t])[::-1][:10]
    
    topic_term_weights = np.asarray(sparse[best_docs].sum(axis=0)).ravel()
    
    top_terms = terms[np.argsort(topic_term_weights)[::-1][:topn_words]]
    topic_words.append(top_terms.tolist())
    print(f"Topic {t:>2}:", top_terms.tolist())

Topic  0: ['প্রবৃদ্ধি', 'দেশ', 'উন্নয়ন', 'খাত', 'শতাংশ', 'হার', 'অর্থনৈতিক', 'উন্নয়নশীল', 'জিডিপি', 'কৃষি']
Topic  1: ['টাকা', 'ব্যাংক', 'হিসাব', 'নগদ', 'আমানত', 'লেনদেন', 'ট্রেডার্স', 'নগদান', 'প্রাক্কলিত', 'চেক']
Topic  2: ['ভিটামিন', 'মাংস', 'মাশরুম', 'খাদ্য', 'উৎস', 'মাছ', 'আটা', 'দ্রবণীয়', 'পরিমাণ', 'খাবার']
Topic  3: ['লীগ', 'পাকিস্তান', 'নির্বাচন', 'মুসলিম', 'আওয়ামী', 'দল', 'সাল', 'আসন', 'পরিষদ', 'যুক্তফ্রন্ট']
Topic  4: ['কুরআন', 'আল', 'হাদিস', 'নবি', 'আল্লাহ', 'কিতাব', 'রাসুল', 'তায়ালা', 'হযরত', 'ওহি']
Topic  5: ['বৃষ্টিপাত', 'তাপমাত্রা', 'বাংলাদেশ', 'বন্যা', 'জলবায়ু', 'বায়ু', 'মৌসুমি', 'সেলসিয়াস', 'শীতকাল', 'বর্ষাকাল']
Topic  6: ['ফোন', 'মোবাইল', 'কার্ড', 'প্রজন্ম', 'প্রযুক্তি', 'নেটওয়ার্ক', 'যোগাযোগ', 'সুইচিং', 'ডিভাইস', 'ওয়্যারলেস']
Topic  7: ['জাত', 'বীজ', 'ধান', 'ব্রি', 'বপন', 'চারা', 'ফসল', 'চাষ', 'জমি', 'রোপণ']


## Evaluation

In [19]:
results = evaluate_topic_words(topic_words, docs)
results

{'coherence_c_v': 0.8375407594239899,
 'coherence_npmi': 0.24323283050042696,
 'topic_diversity': 1.0,
 'IRBO': 1.0}