*Installing Dependencies*

In [1]:
pip install pandas urduhack nltk bertopic umap-learn hdbscan matplotlib wordcloud


Collecting urduhack
  Downloading urduhack-1.1.1-py3-none-any.whl.metadata (7.2 kB)
Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting tf2crf (from urduhack)
  Downloading tf2crf-0.1.33-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-datasets~=3.1 (from urduhack)
  Downloading tensorflow_datasets-3.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting Click~=7.1 (from urduhack)
  Downloading click-7.1.2-py2.py3-none-any.whl.metadata (2.9 kB)
INFO: pip is looking at multiple versions of tf2crf to determine which version is compatible with other requirements. This could take a while.
Collecting tf2crf (from urduhack)
  Downloading tf2crf-0.1.32-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading tf2crf-0.1.31-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading tf2crf-0.1.30-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading tf2crf-0.1.29-py2.py3-none-any.whl.metadata (3.6 kB)
  Downloading tf2crf-0.1.28-py2.py3-none-any.whl.metadat

*Data Reading*

In [2]:
import pandas as pd

data = []
with open("ur.kanzuliman.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Split the line by '|' and check if it has 3 parts
        parts = line.strip().split("|")
        if len(parts) == 3:
            surah, ayah, text = parts
            data.append([int(surah), int(ayah), text])
        else:
            # Optionally, print a warning or log the problematic line
            print(f"Skipping line with unexpected format: {line.strip()}")


df = pd.DataFrame(data, columns=["Surah", "Ayah", "Text"])
print(df.head())

Skipping line with unexpected format: 
Skipping line with unexpected format: 
Skipping line with unexpected format: # --------------------------------------------------------------------
Skipping line with unexpected format: #
Skipping line with unexpected format: #  Quran Translation
Skipping line with unexpected format: #  Name: احمد رضا خان
Skipping line with unexpected format: #  Translator: Ahmed Raza Khan
Skipping line with unexpected format: #  Language: Urdu
Skipping line with unexpected format: #  ID: ur.kanzuliman
Skipping line with unexpected format: #  Last Update: March 17, 2011
Skipping line with unexpected format: #  Source: Tanzil.net
Skipping line with unexpected format: #
Skipping line with unexpected format: # --------------------------------------------------------------------
   Surah  Ayah                                           Text
0      1     1    اللہ کے نام سے شروع جو بہت مہربان رحمت والا
1      1     2  سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا،
2     

*Full View*

In [4]:
df

Unnamed: 0,Surah,Ayah,Text
0,1,1,اللہ کے نام سے شروع جو بہت مہربان رحمت والا
1,1,2,سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا،
2,1,3,بہت مہربان رحمت والا،
3,1,4,روز جزا کا مالک،
4,1,5,ہم تجھی کو پوجیں اور تجھی سے مدد چاہیں،
...,...,...,...
6231,114,2,سب لوگوں کا بادشاہ
6232,114,3,سب لوگوں کا خدا
6233,114,4,اس کے شر سے جو دل میں برے خطرے ڈالے اور دبک رہے
6234,114,5,وہ جو لوگوں کے دلوں میں وسوسے ڈالتے ہیں،


*Searching*

In [5]:
df[df['Surah']==2]

Unnamed: 0,Surah,Ayah,Text
7,2,1,الم
8,2,2,وہ بلند رتبہ کتاب (قرآن) کوئی شک کی جگہ نہیں، ...
9,2,3,وہ جو بے دیکھے ایمان لائیں اور نماز قائم رکھیں...
10,2,4,اور وہ کہ ایمان لائیں اس پر جو اے محبوب تمہاری...
11,2,5,وہی لوگ اپنے رب کی طرف سے ہدایت پر ہیں اور وہی...
...,...,...,...
288,2,282,اے ایمان والو! جب تم ایک مقرر مدت تک کسی دین ک...
289,2,283,اور اگر تم سفر میں ہو اور لکھنے والا نہ پاؤ تو...
290,2,284,اللہ ہی کا ہے جو کچھ آسمانوں میں ہے اور جو کچھ...
291,2,285,سب نے مانا اللہ اور اس کے فرشتوں اور اس کی کتا...


*Stop-Words Removal*

In [3]:
import re



# Urdu stopwords (custom list bana sakhte ho)
urdu_stopwords = ["آئی","آئے","آج","آخر","آخرکبر","آدهی","آًب","آٹھ","آیب","اة","اخبزت","اختتبم","ادھر","ارد","اردگرد","ارکبى","اش","اضتعوبل","اضتعوبلات","اضطرذ","اضکب","اضکی","اضکے","اطراف","اغیب","افراد","الگ","اور","اوًچب","اوًچبئی","اوًچی","اوًچے","اى","اً","اًذر","اًہیں","اٹھبًب","اپٌب","اپٌے","اچھب","اچھی","اچھے","اکثر","اکٹھب","اکٹھی","اکٹھے","اکیلا","اکیلی","اکیلے","اگرچہ","اہن","ایطے","ایک","ب","ت","تبزٍ","تت","تر","ترتیت","تریي","تعذاد","تن","تو","توبم","توہی","توہیں","تٌہب","تک","تھب","تھوڑا","تھوڑی","تھوڑے","تھی","تھے","تیي","ثب","ثبئیں","ثبترتیت","ثبری","ثبرے","ثبعث","ثبلا","ثبلترتیت","ثبہر","ثدبئے","ثرآں","ثراں","ثرش","ثعذ","ثغیر","ثلٌذ","ثلٌذوثبلا","ثلکہ","ثي","ثٌب","ثٌبرہب","ثٌبرہی","ثٌبرہے","ثٌبًب","ثٌذ","ثٌذکرو","ثٌذکرًب","ثٌذی","ثڑا","ثڑوں","ثڑی","ثڑے","ثھر","ثھرا","ثھراہوا","ثھرپور","ثھی","ثہت","ثہتر","ثہتری","ثہتریي","ثیچ","ج","خب","خبرہب","خبرہی","خبرہے","خبهوظ","خبًب","خبًتب","خبًتی","خبًتے","خبًٌب","خت","ختن","خجکہ","خص","خططرذ","خلذی","خو","خواى","خوًہی","خوکہ","خٌبة","خگہ","خگہوں","خگہیں","خیطب","خیطبکہ","در","درخبت","درخہ","درخے","درزقیقت","درضت","دش","دفعہ","دلچطپ","دلچطپی","دلچطپیبں","دو","دور","دوراى","دوضرا","دوضروں","دوضری","دوضرے","دوًوں","دکھبئیں","دکھبتب","دکھبتی","دکھبتے","دکھبو","دکھبًب","دکھبیب","دی","دیب","دیتب","دیتی","دیتے","دیر","دیٌب","دیکھو","دیکھٌب","دیکھی","دیکھیں","دے","ر","راضتوں","راضتہ","راضتے","رریعہ","رریعے","رکي","رکھ","رکھب","رکھتب","رکھتبہوں","رکھتی","رکھتے","رکھی","رکھے","رہب","رہی","رہے","ز","زبصل","زبضر","زبل","زبلات","زبلیہ","زصوں","زصہ","زصے","زقبئق","زقیتیں","زقیقت","زکن","زکویہ","زیبدٍ","صبف","صسیر","صفر","صورت","صورتسبل","صورتوں","صورتیں","ض","ضبت","ضبتھ","ضبدٍ","ضبرا","ضبرے","ضبل","ضبلوں","ضت","ضرور","ضرورت","ضروری","ضلطلہ","ضوچ","ضوچب","ضوچتب","ضوچتی","ضوچتے","ضوچو","ضوچٌب","ضوچی","ضوچیں","ضکب","ضکتب","ضکتی","ضکتے","ضکٌب","ضکی","ضکے","ضیذھب","ضیذھی","ضیذھے","ضیکٌڈ","ضے","طرف","طریق","طریقوں","طریقہ","طریقے","طور","طورپر","ظبہر","ع","عذد","عظین","علاقوں","علاقہ","علاقے","علاوٍ","عووهی","غبیذ","غخص","غذ","غروع","غروعبت","غے","فرد","فی","ق","قجل","قجیلہ","قطن","لئے","لا","لازهی","لو","لوجب","لوجی","لوجے","لوسبت","لوسہ","لوگ","لوگوں","لڑکپي","لگتب","لگتی","لگتے","لگٌب","لگی","لگیں","لگے","لی","لیب","لیٌب","لیں","لے","ه","هتعلق","هختلف","هسترم","هسترهہ","هسطوش","هسیذ","هطئلہ","هطئلے","هطبئل","هطتعول","هطلق","هعلوم","هػتول","هلا","هوکي","هوکٌبت","هوکٌہ","هٌبضت","هڑا","هڑًب","هڑے","هکول","هگر","هہرثبى","هیرا","هیری","هیرے","هیں","و","وار","والے","وٍ","ًئی","ًئے","ًب","ًبپطٌذ","ًبگسیر","ًطجت","ًقطہ","ًو","ًوخواى","ًکبلٌب","ًکتہ","ًہ","ًہیں","ًیب","ًے","ٓ آش","ٹھیک","پبئے","پبش","پبًب","پبًچ","پر","پراًب","پطٌذ","پل","پورا","پوچھب","پوچھتب","پوچھتی","پوچھتے","پوچھو","پوچھوں","پوچھٌب","پوچھیں","پچھلا","پھر","پہلا","پہلی","پہلےضی","پہلےضے","پہلےضےہی","پیع","چبر","چبہب","چبہٌب","چبہے","چلا","چلو","چلیں","چلے","چکب","چکی","چکیں","چکے","چھوٹب","چھوٹوں","چھوٹی","چھوٹے","چھہ","چیسیں","ڈھوًڈا","ڈھوًڈلیب","ڈھوًڈو","ڈھوًڈًب","ڈھوًڈی","ڈھوًڈیں","ک","کئی","کئے","کب","کبفی","کبم","کت","کجھی","کرا","کرتب","کرتبہوں","کرتی","کرتے","کرتےہو","کررہب","کررہی","کررہے","کرو","کرًب","کریں","کرے","کطی","کل","کن","کوئی","کوتر","کورا","کوروں","کورٍ","کورے","کوطي","کوى","کوًطب","کوًطی","کوًطے","کھولا","کھولو","کھولٌب","کھولی","کھولیں","کھولے","کہ","کہب","کہتب","کہتی","کہتے","کہو","کہوں","کہٌب","کہی","کہیں","کہے","کی","کیب","کیطب","کیطرف","کیطے","کیلئے","کیوًکہ","کیوں","کیے","کے","کےثعذ","کےرریعے","گئی","گئے","گب","گرد","گروٍ","گروپ","گروہوں","گٌتی","گی","گیب","گے","ہر","ہن","ہو","ہوئی","ہوئے","ہوا","ہوبرا","ہوبری","ہوبرے","ہوتب","ہوتی","ہوتے","ہورہب","ہورہی","ہورہے","ہوضکتب","ہوضکتی","ہوضکتے","ہوًب","ہوًی","ہوًے","ہوچکب","ہوچکی","ہوچکے","ہوگئی","ہوگئے","ہوگیب","ہوں","ہی","ہیں","ہے","ی","یقیٌی","یہ","یہبں"]


def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)   # remove punctuation
    text = text.replace("ي", "ی").replace("ہ", "ہ")  # normalization
    tokens = text.split()
    tokens = [word for word in tokens if word not in urdu_stopwords]
    return " ".join(tokens)

df["Cleaned"] = df["Text"].apply(clean_text)
print(df.head())


   Surah  Ayah                                           Text  \
0      1     1    اللہ کے نام سے شروع جو بہت مہربان رحمت والا   
1      1     2  سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا،   
2      1     3                          بہت مہربان رحمت والا،   
3      1     4                               روز جزا کا مالک،   
4      1     5        ہم تجھی کو پوجیں اور تجھی سے مدد چاہیں،   

                                        Cleaned  
0      اللہ نام سے شروع جو بہت مہربان رحمت والا  
1  سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا  
2                          بہت مہربان رحمت والا  
3                               روز جزا کا مالک  
4            ہم تجھی کو پوجیں تجھی سے مدد چاہیں  


In [4]:
df

Unnamed: 0,Surah,Ayah,Text,Cleaned
0,1,1,اللہ کے نام سے شروع جو بہت مہربان رحمت والا,اللہ نام سے شروع جو بہت مہربان رحمت والا
1,1,2,سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا،,سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا
2,1,3,بہت مہربان رحمت والا،,بہت مہربان رحمت والا
3,1,4,روز جزا کا مالک،,روز جزا کا مالک
4,1,5,ہم تجھی کو پوجیں اور تجھی سے مدد چاہیں،,ہم تجھی کو پوجیں تجھی سے مدد چاہیں
...,...,...,...,...
6231,114,2,سب لوگوں کا بادشاہ,سب کا بادشاہ
6232,114,3,سب لوگوں کا خدا,سب کا خدا
6233,114,4,اس کے شر سے جو دل میں برے خطرے ڈالے اور دبک رہے,اس شر سے جو دل میں برے خطرے ڈالے دبک
6234,114,5,وہ جو لوگوں کے دلوں میں وسوسے ڈالتے ہیں،,وہ جو دلوں میں وسوسے ڈالتے


In [5]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


*Model Training*

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

docs = df["Cleaned"].tolist()
topic_model = BERTopic(language="multilingual")
topics, probs = topic_model.fit_transform(docs)

# Get topic info
print(topic_model.get_topic_info())


*Output Topics*

In [7]:
df["Topic"] = topics
print(df.groupby(["Surah",'Ayah', "Topic"]).size())


Surah  Ayah  Topic
1      1      36      1
       2     -1       1
       3      45      1
       4      16      1
       5     -1       1
                     ..
114    2      24      1
       3      24      1
       4     -1       1
       5     -1       1
       6     -1       1
Length: 6236, dtype: int64


*Topic Visualization*

In [8]:

topic_model.visualize_hierarchy()


In [9]:
topic_model.visualize_topics()


In [10]:
topic_model.visualize_barchart()


*UI-gradio*

In [11]:
!pip install gradio




In [21]:
import gradio as gr

def search_query(query):
    results = []
    for t in topic_model.get_topics().keys():
        if t == -1:  # skip outliers
            continue
        words = [w for w, _ in topic_model.get_topic(t)]
        if any(q in w for q in query.split() for w in words):
            matched = df[df["Topic"] == t]["Text"].tolist()[:5]
            results.extend(matched)
    return "\n\n".join(results) if results else "کوئی آیت نہیں ملی"

demo = gr.Interface(
    fn=search_query,
    inputs=gr.Textbox(lines=2, placeholder="سوال لکھیں..."),
    outputs=gr.Textbox(lines=15, label="نتائج")  # 👈 بڑا آؤٹ پٹ باکس
)

demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://054c3e8caa66f37391.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://054c3e8caa66f37391.gradio.live




In [30]:
import gradio as gr

def search_query(query):
    results = []
    for t in topic_model.get_topics().keys():
        if t == -1:
            continue
        words = [w for w, _ in topic_model.get_topic(t)]
        if any(q in w for q in query.split() for w in words):
            matched = df[df["Topic"] == t][["Surah", "Ayah", "Text"]].head(10)
            for _, row in matched.iterrows():
                results.append(f"سورہ {row['Surah']}، آیت {row['Ayah']}\n{row['Text']}\n")
    return "\n\n".join(results) if results else "کوئی آیت نہیں ملی"

demo = gr.Interface(
    fn=search_query,
    inputs=gr.Textbox(lines=2, placeholder="سوال لکھیں..."),
    outputs=gr.Textbox(lines=20, label="نتائج")  # بڑا ونڈو
)

demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://090220371ffcead65e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://090220371ffcead65e.gradio.live




In [17]:
df[(df['Surah']==2) & (df['Ayah']==163)]

Unnamed: 0,Surah,Ayah,Text,Cleaned,Topic
169,2,163,اور تمہارا معبود ایک معبود ہے اس کے سوا کوئی م...,تمہارا معبود معبود اس سوا معبود نہیں مگر وہی ب...,45


*Hybrid Approach*

In [25]:
from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

# Store separately
embeddings = embedder.encode(df["Text"].tolist(), convert_to_tensor=False)

# Save in numpy array (6236 x 768)
embeddings = np.array(embeddings)


In [26]:
embeddings

array([[-0.00067547,  0.06736731, -0.01688584, ..., -0.03630129,
         0.03276282, -0.10146039],
       [-0.01465214,  0.03913351, -0.01865348, ...,  0.00537281,
        -0.00427388, -0.1022234 ],
       [-0.00086769,  0.05567692, -0.01886087, ..., -0.00973713,
         0.13453846, -0.09718464],
       ...,
       [ 0.02859404,  0.09870513, -0.01945981, ...,  0.03381292,
         0.00835356, -0.06763617],
       [ 0.011991  ,  0.17994003, -0.01901782, ...,  0.10867774,
         0.08355255, -0.01967195],
       [ 0.00825943,  0.10481553, -0.01653994, ...,  0.09239535,
         0.12011007, -0.02558146]], dtype=float32)

In [None]:
import gradio as gr
from sentence_transformers import util

import torch

def search_query(query, top_k=5):
    # Step 1: Find best matching topic using BERTopic words
    candidate_idx = []
    for t in topic_model.get_topics().keys():
        if t == -1:
            continue
        words = [w for w, _ in topic_model.get_topic(t)]
        if any(q in w for q in query.split() for w in words):
            candidate_idx.extend(df[df["Topic"] == t].index.tolist())

    # Step 2: If no topic matched, fallback to full Quran
    if not candidate_idx:
        candidate_idx = df.index.tolist()

    # Step 3: Rank inside selected cluster(s)
    query_emb = embedder.encode(query, convert_to_tensor=True).to("cuda")

    # Ensure candidate embeddings are tensor + GPU
    cand_embeddings = torch.tensor(embeddings[candidate_idx]).to("cuda")

    scores = util.cos_sim(query_emb, cand_embeddings)[0].cpu().numpy()

    # Attach scores back
    candidate_df = df.loc[candidate_idx].copy()
    candidate_df["Score"] = scores

    results = candidate_df.sort_values("Score", ascending=False).head(top_k)

    output = ""
    for _, row in results.iterrows():
        output += f"سورہ {row['Surah']}، آیت {row['Ayah']}\n{row['Text']}\n\n"

    return output if output else "کوئی آیت نہیں ملی"


# Gradio UI
demo = gr.Interface(
    fn=search_query,
    inputs=gr.Textbox(lines=2, placeholder="سوال لکھیں..."),
    outputs=gr.Textbox(lines=20, label="نتائج")
)

demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://cad65b885d1e5e00f9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
