In [1]:
from read_data_utils import get_documents_data


# CoNLL-U corpus loaded with the helper's default location and language
# (presumably English, given the variable name -- confirm in read_data_utils).
en_conll = get_documents_data(use_ud=True)

# Urdu CoNLL-U corpus.
# NOTE(review): '.conllu' is passed as the first positional argument and the
# helper logs it as the source location ("Loaded ... from .conllu"), so it may
# have been meant as the file extension -- confirm against the helper signature.
urdu_conll = get_documents_data('.conllu', use_ud=True, language='ur')

# Urdu plain-text documents read from dataset/marsiya-all.
urdu_marsiya = get_documents_data('dataset/marsiya-all', file_extension='.txt', language='ur')

  import pkg_resources
[nltk_data] Downloading package punkt_tab to /home/sali/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 16622 documents from None with extension .conllu.


Preprocessing documents:   0%|          | 0/16622 [00:00<?, ?it/s]

Loaded 5130 documents from .conllu with extension .conllu.


Preprocessing documents:   0%|          | 0/5130 [00:00<?, ?it/s]

Loaded 241 documents from dataset/marsiya-all with extension .txt.


Preprocessing documents:   0%|          | 0/241 [00:00<?, ?it/s]

In [48]:
import kagglehub
import pandas as pd


# Download the latest version of the dataset and report where it landed.
path = kagglehub.dataset_download("saurabhshahane/urdu-news-dataset")
print("Path to dataset files:", path)
file_path = "urdu-news-dataset-1M.csv"
sample_path = f"{path}/{file_path}"

# Decode explicitly as UTF-8 so the Urdu text does not depend on the platform's
# default encoding; errors="ignore" silently drops any undecodable bytes.
with open(sample_path, "r", encoding="utf-8", errors="ignore") as f:
    df = pd.read_csv(
        f,
        engine="python",       # Python parsing engine (slower, more feature-complete)
        on_bad_lines="skip",   # skip rows with the wrong number of fields
        skip_blank_lines=True,
    )

print(df.shape)  # (rows kept after skipping malformed lines, num_columns)
print(df.head())

Path to dataset files: /home/sali/.cache/kagglehub/datasets/saurabhshahane/urdu-news-dataset/versions/1
(111861, 8)
   Index                                           Headline  \
0      0  عالمی بینک عسکریت پسندی سے متاثرہ خاندانوں کی ...   
1      1  مالی سال 2020 ریٹرن فائل کرنے والوں کی تعداد م...   
2      2  جاپان کو سندھ کے خصوصی اقتصادی زون میں سرمایہ ...   
3      3   برامدات 767 فیصد بڑھ کر ارب 16 کروڑ ڈالر سے زائد   
4      4  کے الیکٹرک کو اضافی بجلی گیس کی فراہمی کے قانو...   

                                           News Text              Category  \
0  اسلام باد عالمی بینک خیبرپختونخوا کے قبائلی اض...  Business & Economics   
1  اسلام باد فیڈرل بورڈ ریونیو ایف بی نے دسمبر کی...  Business & Economics   
2  اسلام باد بورڈ انویسٹمنٹ بی او ئی کے چیئرمین ع...  Business & Economics   
3  اسلام اباد پاکستان میں ماہ نومبر میں مسلسل تیس...  Business & Economics   
4  اسلام باد نیشنل ٹرانسمیشن اینڈ ڈسپیچ کمپنی این...  Business & Economics   

         Date                       

In [49]:
# Keep only rows whose Category is a string (a missing Category is not a str,
# so those rows are dropped). Iterating the columns directly avoids two full
# DataFrame.iterrows() passes, which build a Series object for every row.
urdu_news_docs = [
    f"{headline} {text}"
    for headline, text, category in zip(df['Headline'], df['News Text'], df['Category'])
    if isinstance(category, str)
]
urdu_news_categories = [category for category in df['Category'] if isinstance(category, str)]
len(urdu_news_docs), len(urdu_news_categories)

(111860, 111860)

In [4]:
from collections import Counter
# Tally how many news documents fall under each category label.
category_counts = Counter(urdu_news_categories)
# Bare expression so the notebook displays the counts.
category_counts

Counter({'Sports': 44829,
         'Entertainment': 34901,
         'Business & Economics': 24131,
         'Science & Technology': 7999})

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1,3'  # Expose only GPUs 1 and 3 to CUDA; set before any CUDA-using library initializes

In [4]:
# Candidate embedding models; run_bert_topic selects from this list by index.
multilingual_models = [
    # Was trained on 104 languages including Urdu
    "google-bert/bert-base-multilingual-cased",
    # Smaller model, but good for multilingual tasks
    "paraphrase-multilingual-MiniLM-L12-v2",
    # Picked by run_bert_topic whenever the language is not 'ur'
    "all-MiniLM-L6-v2",
]

In [45]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from read_data_utils import get_urdu_stopwords
from nltk.corpus import stopwords as nltk_stopwords


def run_bert_topic(
    docs,
    language='ur',
    *,
    embedding_model=None,
    min_cluster_size=200,
    min_samples=100,
    random_state=None,
    calculate_probabilities=True,
):
    """Fit a BERTopic model on ``docs`` and return it with its assignments.

    Args:
        docs: Sized collection of document strings (one string per document).
        language: ``'ur'`` selects the Urdu stop words and
            ``multilingual_models[0]``; any other value selects the NLTK
            English stop words and ``multilingual_models[2]``.
        embedding_model: Optional override for the embedding model passed to
            BERTopic. When ``None`` the language-based default is used.
        min_cluster_size: HDBSCAN ``min_cluster_size``. The default of 200 is
            tuned for the large corpora; pass a smaller value for corpora of
            only a few hundred documents, otherwise few or no clusters can
            form.
        min_samples: HDBSCAN ``min_samples``.
        random_state: Seed forwarded to UMAP. ``None`` (the default) keeps the
            original unseeded behaviour; pass an int for repeatable runs.
        calculate_probabilities: Forwarded to BERTopic. Disable for very large
            corpora if the per-topic probability matrix is not needed.

    Returns:
        Tuple ``(model, topics, probs)`` where ``topics`` and ``probs`` are the
        values returned by ``BERTopic.fit_transform``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    if len(docs) == 0:
        raise ValueError("run_bert_topic() needs at least one document")

    is_urdu = language == 'ur'

    if embedding_model is None:
        embedding_model = multilingual_models[0] if is_urdu else multilingual_models[2]

    umap_model = UMAP(
        n_neighbors=3,
        n_components=3,
        min_dist=0.05,
        random_state=random_state,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        gen_min_span_tree=True,
        prediction_data=True,
    )

    stopwords = get_urdu_stopwords() if is_urdu else list(nltk_stopwords.words('english'))

    # Drop stop words (and very rare / near-ubiquitous terms) so they do not
    # pollute the topic representations.
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 3),
        stop_words=stopwords,
        max_df=0.9,
        min_df=2,
    )

    model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        top_n_words=5,
        calculate_probabilities=calculate_probabilities,
        verbose=True,
    )
    topics, probs = model.fit_transform(docs)
    return model, topics, probs


In [None]:
# Fit on the en_conll corpus: each document is a sequence of strings that is
# joined into one space-separated string before being passed to BERTopic.
# Note: this rebinds the shared names `dataset`, `model`, `topics`, `probs`.
dataset = en_conll
model, topics, probs = run_bert_topic(list(map(" ".join, dataset)), language='en')

In [6]:
# BERTopic bar-chart view for whichever `model` is currently bound in the kernel.
model.visualize_barchart()

In [8]:
# BERTopic hierarchy view for whichever `model` is currently bound in the kernel.
model.visualize_hierarchy()

In [9]:
# BERTopic heatmap view for whichever `model` is currently bound in the kernel.
model.visualize_heatmap()

In [15]:
# Fit on the urdu_conll corpus with the function's default language ('ur').
# Note: this rebinds the shared names `dataset`, `model`, `topics`, `probs`.
dataset = urdu_conll
model, topics, probs = run_bert_topic(list(map(" ".join, dataset)))

2025-07-04 23:25:28,027 - BERTopic - Embedding - Transforming documents to embeddings.
No sentence-transformers model found with name google-bert/bert-base-multilingual-cased. Creating a new one with mean pooling.


Batches:   0%|          | 0/161 [00:00<?, ?it/s]

2025-07-04 23:25:33,003 - BERTopic - Embedding - Completed ✓
2025-07-04 23:25:33,005 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 23:25:35,730 - BERTopic - Dimensionality - Completed ✓
2025-07-04 23:25:35,732 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 23:25:37,029 - BERTopic - Cluster - Completed ✓
2025-07-04 23:25:37,035 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 23:25:37,347 - BERTopic - Representation - Completed ✓


In [16]:
# BERTopic bar-chart view for whichever `model` is currently bound in the kernel.
model.visualize_barchart()

In [17]:
# Show the first 10 rows of the fitted model's topic summary table.
model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1473,-1_مےں_ما_کہنا_پرنا,"[مےں, ما, کہنا, پرنا, کیا]",[سینا پینا ائی ایم الیکشن کمیشن روبرو اےک درخو...
1,0,115,0_نہا_سکنا_کہنا_ما,"[نہا, سکنا, کہنا, ما, چاہنا]",[ما علاقہ ما عسکریت پسند عام شہری اپیل کہنا ما...
2,1,108,1_کہنا_کہنا کہنا_انہا_لینا,"[کہنا, کہنا کہنا, انہا, لینا, ما]",[انٹرویو انھا ازراہ مذاق مسکرانا کہنا کہنا پرب...
3,2,96,2_ٹیم_کھلاڑی_کرکٹ_کپ,"[ٹیم, کھلاڑی, کرکٹ, کپ, مقابلہ]",[واضح سری لنکائی کرکٹ بورڈ ورلڈ کپ اختتم پرنا ...
4,3,93,3_کیا_رپورٹ_کانگریس_کہنا,"[کیا, رپورٹ, کانگریس, کہنا, پرنا]",[ترنمول کانگریس سربراہ ممت بنرجی الزام عائد کی...
5,4,89,4_محمد_مولانا_سید_ڈاکٹر,"[محمد, مولانا, سید, ڈاکٹر, مسجد]",[مولانا محمد خواجہ محی الدین معتمد بموجب تحریک...
6,5,88,5_نہا_ما_کہنا ما_کہنا,"[نہا, ما, کہنا ما, کہنا, ما ما]",[وقت کہنا امت مسلمہ ما بھٹکنا انسانیت صحیح راہ...
7,6,84,6_مےں_ہےں_کہنا_نہا,"[مےں, ہےں, کہنا, نہا, اےک]",[سپریم کورٹ اےک تنظیم درخواست پرنا ما ریمارک ک...
8,7,84,7_گندھاری_اےک_کیا_مےں,"[گندھاری, اےک, کیا, مےں, کہنا]",[اےک عینی شاہد دھماکہ وقت کیفہ مےں بچنا کہنا ک...
9,8,80,8_اقلیت_ما_مشاعرہ_کہنا,"[اقلیت, ما, مشاعرہ, کہنا, اضافہ]",[مےں گزشتہ کالم مےں ما به اظہار کیا کہنا فرقہ ...


In [None]:
# Preview only the first few joined documents; displaying the whole corpus
# would flood the notebook output.
[" ".join(d) for d in dataset][:5]

In [25]:
# Fit on the urdu_marsiya corpus with the function's default language ('ur').
# Note: this rebinds the shared names `dataset`, `model`, `topics`, `probs`.
dataset = urdu_marsiya
model, topics, probs = run_bert_topic(list(map(" ".join, dataset)))

2025-07-04 23:29:54,391 - BERTopic - Embedding - Transforming documents to embeddings.
No sentence-transformers model found with name google-bert/bert-base-multilingual-cased. Creating a new one with mean pooling.


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2025-07-04 23:29:57,996 - BERTopic - Embedding - Completed ✓
2025-07-04 23:29:57,997 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-04 23:29:58,167 - BERTopic - Dimensionality - Completed ✓
2025-07-04 23:29:58,168 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-04 23:29:58,182 - BERTopic - Cluster - Completed ✓
2025-07-04 23:29:58,185 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-04 23:30:00,922 - BERTopic - Representation - Completed ✓


In [26]:
# BERTopic bar-chart view for whichever `model` is currently bound in the kernel.
model.visualize_barchart()

In [27]:
# BERTopic hierarchy view for whichever `model` is currently bound in the kernel.
model.visualize_hierarchy()

In [None]:
# Fit on the Urdu news documents (already plain strings, so no joining step).
# 'ur' is the function's default language; it is spelled out here for clarity.
model, topics, probs = run_bert_topic(urdu_news_docs, language='ur')

2025-07-04 23:47:35,652 - BERTopic - Embedding - Transforming documents to embeddings.
No sentence-transformers model found with name google-bert/bert-base-multilingual-cased. Creating a new one with mean pooling.


Batches:   0%|          | 0/3496 [00:00<?, ?it/s]