# Islamic Fatwa Hybrid Search — HOML End-to-End
**TF-IDF + AraBERT · MRR@10 · Multi-Dataset · Flask + ngrok UI**

## Setup

In [None]:
# Mount Google Drive at /content/drive (relies on the Colab-only `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install kagglehub sentence-transformers scikit-learn \
           pyarabic matplotlib seaborn scipy flask pyngrok flask-cors -q

In [None]:
import os, re, math, pickle, shutil, threading, time
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns, torch
import pyarabic.araby as araby

from sklearn.base                    import BaseEstimator, TransformerMixin
from sklearn.pipeline                import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise        import cosine_similarity
from sklearn.model_selection         import train_test_split
from scipy.sparse                    import save_npz
from sentence_transformers           import SentenceTransformer, InputExample, losses
from torch.utils.data                import DataLoader
import kagglehub

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Create the working folders up front; existing folders are left alone.
for folder in (
    "data/raw",
    "data/processed",
    "models/tfidf",
    "models/bert",
    "reports/figures",
    "templates",
):
    os.makedirs(folder, exist_ok=True)

print(f"✅ Imports ready  |  device={DEVICE}")

## Dataset Registry — Switch Datasets Here

Change `ACTIVE_DATASET` to any key below, then **Run All**.  
All downstream cells adapt automatically.

| Key | Source | Rows | Columns |
|-----|--------|------|---------|
| `islamweb` | `abdallahelsaadany/fatawa` (IslamWeb) | ~83K | `title, ques, ans` |
| `50k_mixed` | `hazemmosalah/50k-islamic-fatwa-q-and-a-dataset-arabic` | ~51K | `question, answer` |
| `binbaz` | `a5medashraf/bin-baz-fatwas-dataset` | ~7K | `Questions, Answers` |


In [None]:
# ──────────────────────────────────────────────────
# Dataset switch: set this to one of the keys of DATASETS below.
ACTIVE_DATASET = "islamweb"
# ──────────────────────────────────────────────────

# For each dataset: its Kaggle id, the source column names that get
# normalised to question/answer/title, and a human-readable label.
DATASETS = {
    "islamweb": dict(
        kaggle_id="abdallahelsaadany/fatawa",
        col_title="title",  # optional title column (None when absent)
        col_q="ques",       # question column
        col_a="ans",        # answer column
        label="IslamWeb Fatawa (~83K)",
    ),
    "50k_mixed": dict(
        kaggle_id="hazemmosalah/50k-islamic-fatwa-q-and-a-dataset-arabic",
        col_title=None,
        col_q="question",
        col_a="answer",
        label="50K Mixed Fatawa (Bin Baz + IslamQA + IslamWeb)",
    ),
    "binbaz": dict(
        kaggle_id="a5medashraf/bin-baz-fatwas-dataset",
        col_title=None,
        col_q="Questions",
        col_a="Answers",
        label="Bin Baz Fatwas (~7K)",
    ),
}

# Registry entry of the dataset selected above.
DS = DATASETS[ACTIVE_DATASET]

# Hyper-parameters read by the cells below.
CFG = {
    # Sampling and train/test split
    "sample_n": 15_000,
    "test_size": 0.10,
    "seed": 42,
    # Rows are kept only when the cleaned text is longer than these lengths
    "min_q": 5,
    "min_a": 10,
    # TF-IDF vectorisers (question-only and combined "doc") and their weights
    "tfidf_q_max": 20_000,
    "tfidf_d_max": 15_000,
    "ngram": (1, 4),
    "min_df": 2,
    "tfidf_q_w": 0.25,
    "tfidf_d_w": 0.75,
    # Hybrid score = hybrid_t * TF-IDF + hybrid_b * BERT
    "hybrid_t": 0.50,
    "hybrid_b": 0.50,
    # Sentence encoder and fine-tuning
    "bert_name": "aubmindlab/bert-base-arabertv02",
    "bert_batch": 128,
    "ft_batch": 16,
    "ft_epochs": 1,
    "ft_n": 2_000,
    "warmup": 0.10,
    # Retrieval and evaluation
    "top_k": 3,
    "mrr_k": 10,
    "mrr_n": 500,
}

print(f"Active dataset : {DS['label']}")
print(f"   Columns       : title={DS['col_title']}  q={DS['col_q']}  a={DS['col_a']}")

## Big Picture
- **Task:** Arabic fatwa retrieval · **Metric:** MRR@10 on held-out test set
- **Architecture:** 50% TF-IDF + 50% AraBERT cosine similarity
- **Anti-leakage:** index built on `df_train` only; MRR measured on `df_test`

## Getting the Data

In [None]:
def load_dataset(ds_cfg):
    """Download a Kaggle dataset and normalise it to standard columns.

    Args:
        ds_cfg: Dataset description with the keys ``kaggle_id``, ``col_q``,
            ``col_a`` and ``col_title`` (``None`` when the dataset has no
            title column).

    Returns:
        A DataFrame with exactly the columns ``question``, ``answer`` and
        ``title``; missing values are replaced by empty strings.

    Raises:
        FileNotFoundError: If the downloaded dataset contains no ``.csv`` file.
        KeyError: Raised by pandas when the configured question or answer
            column is not present in the CSV.
    """
    # Download the dataset named in the config; a hard-coded id would make
    # every configuration load the same data.
    path = kagglehub.dataset_download(ds_cfg["kaggle_id"])

    # Take the first CSV found anywhere below the download directory.
    csv_file = next(
        (
            os.path.join(root, name)
            for root, _, names in os.walk(path)
            for name in names
            if name.endswith(".csv")
        ),
        None,
    )
    if csv_file is None:
        raise FileNotFoundError(f"No .csv file found under {path!r}")

    df = pd.read_csv(csv_file)
    df = df.rename(columns={
        ds_cfg["col_q"]: "question",
        ds_cfg["col_a"]: "answer",
    })
    if ds_cfg["col_title"] and ds_cfg["col_title"] in df.columns:
        df = df.rename(columns={ds_cfg["col_title"]: "title"})
    else:
        # Datasets without a title still get the column so later cells can rely on it.
        df["title"] = ""

    df = df[["question", "answer", "title"]].copy()
    for col in ("question", "answer", "title"):
        df[col] = df[col].fillna("")
    return df

# Load the dataset described by DS, then print its shape and its first three rows.
df_raw = load_dataset(DS)
print(f"Loaded: {df_raw.shape}  from  [{DS['label']}]")
print(df_raw.head(3).to_string())

## Explore the Data

In [None]:
# Add per-row length features: character counts and whitespace-separated
# word counts for the raw question and answer text.
eda = df_raw.assign(
    q_chars=lambda frame: frame["question"].str.len(),
    a_chars=lambda frame: frame["answer"].str.len(),
    q_words=lambda frame: frame["question"].str.split().str.len(),
    a_words=lambda frame: frame["answer"].str.split().str.len(),
)
length_cols = ["q_chars", "a_chars", "q_words", "a_words"]
print(eda[length_cols].describe().round(1))

In [None]:
# 2x2 grid of histograms, one per length feature. Values are clipped to the
# panel's upper bound for plotting; the median line uses the unclipped data.
fig, axes = plt.subplots(2, 2, figsize=(13, 7))
fig.suptitle(f"EDA — {DS['label']}", fontsize=13, fontweight="bold")

# (column, panel title, bar colour, clip upper bound)
panels = [
    ("q_chars", "Question Char Length", "steelblue", 500),
    ("a_chars", "Answer Char Length", "darkorange", 2000),
    ("q_words", "Question Word Count", "seagreen", 100),
    ("a_words", "Answer Word Count", "mediumpurple", 400),
]
for ax, (col, title, color, clip) in zip(axes.flat, panels):
    values = eda[col]
    median = values.median()
    ax.hist(values.clip(0, clip), bins=50, color=color, edgecolor="white")
    ax.set_title(title)
    ax.axvline(median, color="red", ls="--", label=f"Median={median:.0f}")
    ax.legend(fontsize=8)

plt.tight_layout()
fig_path = f"reports/figures/eda_{ACTIVE_DATASET}.png"
plt.savefig(fig_path, dpi=150)
plt.show()
print(f"Saved → {fig_path}")

## Prepare the Data

In [None]:
class ArabicCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises Arabic text.

    Each value has tatweel and tashkeel stripped (via pyarabic), every
    non-whitespace character outside U+0600–U+06FF replaced by a space, and
    runs of whitespace collapsed to single spaces.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Return a list holding the cleaned form of every item in ``X``."""
        return list(map(self._c, X))

    @staticmethod
    def _c(t):
        """Clean a single value; NaN or blank input yields an empty string."""
        if pd.isna(t):
            return ""
        text = str(t)
        if not text.strip():
            return ""
        text = araby.strip_tatweel(text)
        text = araby.strip_tashkeel(text)
        arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", text)
        return " ".join(arabic_only.split())

class DFCleaner(BaseEstimator, TransformerMixin):
    """DataFrame transformer that adds cleaned text columns and drops short rows."""

    # One shared text cleaner for all instances.
    _c = ArabicCleaner()

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Adds ``question_clean``, ``answer_clean``, ``title_clean`` and ``doc``,
        then keeps only rows whose cleaned question and answer are longer than
        ``CFG["min_q"]`` and ``CFG["min_a"]`` characters. The index is reset.
        """
        out = X.copy()
        for raw_col, clean_col in (
            ("question", "question_clean"),
            ("answer", "answer_clean"),
            ("title", "title_clean"),
        ):
            out[clean_col] = self._c.transform(out[raw_col])

        # doc = title + question (x3 weight) + answer, for richer TF-IDF
        title_part = out["title_clean"] + " "
        question_part = (out["question_clean"] + " ") * 3
        out["doc"] = title_part + question_part + out["answer_clean"]

        question_ok = out["question_clean"].str.len() > CFG["min_q"]
        answer_ok = out["answer_clean"].str.len() > CFG["min_a"]
        return out[question_ok & answer_ok].reset_index(drop=True)

# Clean the raw frame and cap it at CFG["sample_n"] rows (seeded sample).
df_clean = DFCleaner().fit_transform(df_raw)
if len(df_clean) > CFG["sample_n"]:
    df_clean = (
        df_clean
        .sample(CFG["sample_n"], random_state=CFG["seed"])
        .reset_index(drop=True)
    )

# Seeded, shuffled hold-out split; both parts get a fresh 0..n-1 index.
split_parts = train_test_split(
    df_clean,
    test_size=CFG["test_size"],
    random_state=CFG["seed"],
    shuffle=True,
)
df_train, df_test = (part.reset_index(drop=True) for part in split_parts)

# Persist both splits under data/processed.
df_train.to_csv(f"data/processed/{ACTIVE_DATASET}_train.csv", index=False)
df_test.to_csv( f"data/processed/{ACTIVE_DATASET}_test.csv",  index=False)
print(f"Train: {len(df_train):,}  |  Test: {len(df_test):,}   (no leakage)")

## Train & Evaluate (Baseline)

In [None]:
# TF-IDF: one vectoriser fitted on cleaned questions, one on the combined
# "doc" text. Both are fitted on the training split only.
shared_tfidf_args = dict(ngram_range=CFG["ngram"], min_df=CFG["min_df"])
q_vec = TfidfVectorizer(max_features=CFG["tfidf_q_max"], **shared_tfidf_args)
dv = TfidfVectorizer(max_features=CFG["tfidf_d_max"], **shared_tfidf_args)
q_mat = q_vec.fit_transform(df_train["question_clean"])
d_mat = dv.fit_transform(df_train["doc"])
print(f"TF-IDF  Q:{q_mat.shape}  Doc:{d_mat.shape}")

# Persist the fitted vectorisers (pickle) and their sparse matrices (npz).
tfidf_dir = f"models/tfidf/{ACTIVE_DATASET}"
os.makedirs(tfidf_dir, exist_ok=True)
for name, obj in {"q_vec": q_vec, "dv": dv}.items():
    with open(f"{tfidf_dir}/{name}.pkl", "wb") as f:
        pickle.dump(obj, f)
save_npz(f"{tfidf_dir}/q_mat.npz", q_mat)
save_npz(f"{tfidf_dir}/d_mat.npz", d_mat)

# BERT baseline embeddings — TRAIN corpus only (cleaned questions)
bert = SentenceTransformer(CFG["bert_name"], device=DEVICE)
train_questions = df_train["question_clean"].tolist()
emb = bert.encode(
    train_questions,
    batch_size=CFG["bert_batch"],
    show_progress_bar=True,
    device=DEVICE,
)
bert_dir = f"models/bert/{ACTIVE_DATASET}"
os.makedirs(bert_dir, exist_ok=True)
np.save(f"{bert_dir}/embeddings.npy", emb)
print(f"Embeddings: {emb.shape}  ")

In [None]:
def _norm(s):
    """Min-max scale array ``s``; an (almost) constant array maps to all zeros.

    The denominator carries a 1e-8 epsilon, so the largest value ends up
    marginally below 1.
    """
    lo, hi = s.min(), s.max()
    span = hi - lo
    if span < 1e-8:
        return np.zeros_like(s)
    return (s - lo) / (span + 1e-8)

# Shared cleaner: queries get the same ArabicCleaner normalisation as the
# cleaned corpus columns.
_ac = ArabicCleaner()


class Engine:
    """Hybrid retriever combining two TF-IDF views with embedding similarity.

    Holds a corpus DataFrame together with the fitted question/doc TF-IDF
    vectorisers, their matrices, a sentence encoder and the corpus embeddings.
    Row ``i`` of every matrix is expected to describe row ``i`` of ``df``
    (after ``reset_index``).
    """

    def __init__(self, df, q_vec, q_mat, dv, d_mat, bert, emb, device):
        self.df = df.reset_index(drop=True)
        self.q_vec = q_vec
        self.q_mat = q_mat
        self.dv = dv
        self.d_mat = d_mat
        self.bert = bert
        self.emb = emb
        self.device = device

    def search(self, q, top_k=3):
        """Return the ``top_k`` best matches for query ``q``, best first.

        Args:
            q: Raw query text; it is cleaned before scoring.
            top_k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            ``[{"error": "Query too short"}]`` when the cleaned query has fewer
            than 3 characters; otherwise a list of dicts with the keys
            ``question``, ``answer``, ``title``, ``confidence``, ``tfidf``,
            ``bert`` (percent strings, min-max normalised over the corpus for
            this query) and ``_idx`` (row position in the corpus).
        """
        c = _ac._c(q)
        if len(c) < 3:
            return [{"error": "Query too short"}]
        if top_k <= 0:
            # score.argsort()[-0:] would select the whole corpus, not nothing.
            return []

        tfidf = (
            CFG["tfidf_q_w"] * cosine_similarity(self.q_vec.transform([c]), self.q_mat).flatten()
            + CFG["tfidf_d_w"] * cosine_similarity(self.dv.transform([c]), self.d_mat).flatten()
        )
        bs = cosine_similarity(self.bert.encode([c], device=self.device), self.emb).flatten()

        # Normalise each signal once and reuse it for both the score and the report.
        tfidf_n = _norm(tfidf)
        bert_n = _norm(bs)
        score = CFG["hybrid_t"] * tfidf_n + CFG["hybrid_b"] * bert_n

        results = []
        for i in score.argsort()[-top_k:][::-1]:
            row = self.df.iloc[i]
            results.append({
                "question": row["question"],
                "answer": row["answer"],
                "title": row.get("title", ""),
                "confidence": f"{score[i]*100:.1f}%",
                "tfidf": f"{tfidf_n[i]*100:.1f}%",
                "bert": f"{bert_n[i]*100:.1f}%",
                "_idx": int(i),
            })
        return results

def mrr(df_corpus, df_eval, engine, k=10, n=500, seed=99):
    """Mean Reciprocal Rank@k of ``engine`` on a seeded sample of ``df_eval``.

    The ground truth for an evaluation row is the corpus row whose cleaned
    answer shares its first 80 characters. A row scores ``1/rank`` when that
    corpus row appears in the engine's top ``k`` results, and 0 otherwise
    (including when no corpus row matches at all).

    NOTE(review): when ``df_corpus`` and ``df_eval`` are disjoint splits, only
    evaluation rows whose answer prefix also occurs in the corpus can score;
    all other rows count as 0 in the mean.

    Args:
        df_corpus: Frame the engine indexes; needs ``answer_clean``.
        df_eval: Held-out frame; needs ``question_clean`` and ``answer_clean``.
        engine: Object with ``search(query, top_k)`` returning result dicts.
        k: Cut-off rank.
        n: Maximum number of evaluation rows to sample.
        seed: Random state for the sample.

    Returns:
        The mean reciprocal rank as a float; 0.0 when ``df_eval`` is empty.
    """
    # Map answer prefix -> row position. Positions (not index labels) are what
    # the engine reports in "_idx". Later duplicates overwrite earlier ones.
    ans_idx = {
        answer[:80]: pos
        for pos, answer in enumerate(df_corpus["answer_clean"])
    }
    sample = (
        df_eval.sample(min(n, len(df_eval)), random_state=seed)
        if len(df_eval)
        else df_eval
    )

    rrs, hits = [], 0
    for question, answer in zip(sample["question_clean"], sample["answer_clean"]):
        gt = ans_idx.get(answer[:80])
        rr = 0.0
        if gt is not None:
            for rank, res in enumerate(engine.search(question, k), 1):
                # .get: the engine may return an error dict without "_idx".
                if res.get("_idx") == gt:
                    rr = 1 / rank
                    hits += 1
                    break
        rrs.append(rr)

    n_ = len(rrs)
    if n_ == 0:
        # Nothing to evaluate: report zero instead of dividing by zero.
        print(f"  Hits@{k}: 0/0 (0.0%)")
        return 0.0
    print(f"  Hits@{k}: {hits}/{n_} ({hits/n_*100:.1f}%)")
    return float(np.mean(rrs))

# Baseline: build the engine over the training split and score it on the
# held-out test split before any fine-tuning.
eng = Engine(df_train, q_vec, q_mat, dv, d_mat, bert, emb, DEVICE)
print(f"Baseline MRR@{CFG['mrr_k']} — {DS['label']}")
mrr_before = mrr(df_train, df_test, eng, CFG["mrr_k"], CFG["mrr_n"])
print(f"\nBaseline MRR@{CFG['mrr_k']} = {mrr_before:.4f}  ({mrr_before*100:.2f}%)")

## Fine-Tune AraBERT (MNRL)

In [None]:
# Fine-tune the encoder on (question, answer) pairs with
# MultipleNegativesRankingLoss, then re-embed the training questions.
# Never request more rows than exist: DataFrame.sample raises when n exceeds
# the frame length (sampling without replacement).
ft_n = min(CFG["ft_n"], len(df_train))
ft_rows = df_train.sample(ft_n, random_state=CFG["seed"])
examples = [
    InputExample(texts=[question, answer])
    for question, answer in zip(ft_rows["question_clean"], ft_rows["answer_clean"])
]
loader   = DataLoader(examples, shuffle=True, batch_size=CFG["ft_batch"])
warmup   = math.ceil(len(loader)*CFG["ft_epochs"]*CFG["warmup"])
ft_path  = f"models/bert/{ACTIVE_DATASET}/finetuned"

print(f"Fine-tuning  n={len(examples)}  batch={CFG['ft_batch']}  warmup={warmup}")
# NOTE(review): fit() trains `bert` in place, so any engine built earlier with
# this same model object now encodes queries with the fine-tuned weights while
# still holding its old embeddings — confirm that is acceptable.
bert.fit(train_objectives=[(loader, losses.MultipleNegativesRankingLoss(bert))],
         epochs=CFG["ft_epochs"], warmup_steps=warmup,
         show_progress_bar=True, output_path=ft_path)

# Re-embed the corpus with the updated weights; this overwrites the
# embeddings.npy file in the dataset's model folder.
emb = bert.encode(df_train["question_clean"].tolist(),
                  batch_size=CFG["bert_batch"], show_progress_bar=True, device=DEVICE)
np.save(f"models/bert/{ACTIVE_DATASET}/embeddings.npy", emb)

eng_ft     = Engine(df_train, q_vec, q_mat, dv, d_mat, bert, emb, DEVICE)
mrr_after  = mrr(df_train, df_test, eng_ft, CFG["mrr_k"], CFG["mrr_n"])
print(f"\n Fine-Tuned MRR@{CFG['mrr_k']} = {mrr_after:.4f}  ({mrr_after*100:.2f}%)")

## Present the Solution

In [None]:
# Text summary: baseline vs fine-tuned MRR and the signed difference.
delta = mrr_after - mrr_before
verdict = "✅ improvement" if delta > 0 else "❌ no improvement"
rule = "=" * 55
print(rule)
print(f"  Dataset    : {DS['label']}")
print(f"  Baseline   MRR@{CFG['mrr_k']} : {mrr_before:.4f}  ({mrr_before*100:.2f}%)")
print(f"  Fine-Tuned MRR@{CFG['mrr_k']} : {mrr_after:.4f}  ({mrr_after*100:.2f}%)")
print(f"  Δ MRR                 : {delta:+.4f}  ({verdict})")
print(rule)

# Bar chart of the two scores, each bar annotated with its value.
stage_names = ["Baseline", "Fine-Tuned"]
stage_scores = [mrr_before, mrr_after]
fig, ax = plt.subplots(figsize=(5, 4))
bars = ax.bar(
    stage_names,
    stage_scores,
    color=["steelblue", "seagreen"],
    width=0.4,
    edgecolor="white",
)
ax.set_ylim(0, 1)
ax.set_ylabel(f"MRR@{CFG['mrr_k']}")
ax.set_title(f"MRR@{CFG['mrr_k']} — {DS['label']}", fontweight="bold", fontsize=9)
for bar, value in zip(bars, stage_scores):
    x_mid = bar.get_x() + bar.get_width() / 2
    ax.text(x_mid, value + 0.02, f"{value:.4f}", ha="center", fontweight="bold")
plt.tight_layout()
plt.savefig(f"reports/figures/mrr_{ACTIVE_DATASET}.png", dpi=150)
plt.show()

In [None]:
# Run a few sample queries through the fine-tuned engine and print each hit
# with its scores plus truncated question and answer text.
demo_queries = ("حكم صلاة الجمعة", "ما هو الزكاة", "هل يجوز الصيام في السفر")
for query in demo_queries:
    print(f"\nSearch {query}")
    found = eng_ft.search(query, CFG["top_k"])
    for hit in found:
        # Show the title tag only when the row has a non-empty title.
        title_tag = ""
        if hit["title"]:
            title_tag = f'[{hit["title"][:40]}] '
        print(f"  {title_tag}[{hit['confidence']}] TF:{hit['tfidf']} BERT:{hit['bert']}")
        print(f"  Q: {hit['question'][:80]}...")
        print(f"  A: {hit['answer'][:100]}...")

## Launch: Flask + ngrok UI

*  Laptop: set `COLAB_URL` in `laptop_server.py` to the ngrok URL printed by the Colab API cell → run `python laptop_server.py` → open `http://localhost:8080`



In [None]:
# Cell A — Write laptop_server.py (run the generated file on your LAPTOP)
#
# The generated Flask app serves templates/index.html and forwards /ask and
# /health to the Colab API whose public URL is held in COLAB_URL.
srv = '''"""Local Flask front-end that proxies search requests to the Colab API."""
from flask import Flask, render_template, request, jsonify
import requests

app = Flask(__name__, template_folder="templates")
COLAB_URL = ""  # paste the ngrok URL here, or POST it to /update_url


def _json_body():
    """Return the request's JSON object, or {} if it is missing or not an object."""
    body = request.get_json(silent=True)
    return body if isinstance(body, dict) else {}


@app.route("/")
def home():
    return render_template("index.html")


@app.route("/ask", methods=["POST"])
def ask():
    d = _json_body()
    q = d.get("question", "")
    q = q.strip() if isinstance(q, str) else ""
    if not q:
        return jsonify({"error": "Empty question"}), 400
    if not COLAB_URL:
        return jsonify({"error": "COLAB_URL is not set"}), 500
    try:
        r = requests.post(f"{COLAB_URL}/ask",
                          json={"question": q, "top_k": d.get("top_k", 3)}, timeout=60)
    except requests.ConnectionError:
        return jsonify({"error": "Cannot reach Colab — is Cell B running?"}), 500
    except requests.Timeout:
        return jsonify({"error": "Colab timeout"}), 500
    # Explicit branches: "return A if ok else B, 500" parses as
    # "(A if ok else B), 500" and would answer 500 even on success.
    if not r.ok:
        return jsonify({"error": f"Colab {r.status_code}"}), 500
    try:
        return jsonify(r.json())
    except ValueError:
        return jsonify({"error": "Colab returned invalid JSON"}), 500


@app.route("/health")
def health():
    try:
        r = requests.get(f"{COLAB_URL}/health", timeout=5)
        if not r.ok:
            return jsonify({"status": "disconnected"}), 500
        colab = r.json()
    except (requests.RequestException, ValueError):
        # Covers an unset or invalid URL, network failures and non-JSON replies.
        return jsonify({"status": "disconnected"}), 500
    return jsonify({"status": "connected", "colab": colab})


@app.route("/update_url", methods=["POST"])
def update_url():
    global COLAB_URL
    url = _json_body().get("url", "")
    COLAB_URL = url.strip().rstrip("/") if isinstance(url, str) else ""
    return jsonify({"status": "ok", "url": COLAB_URL})


if __name__ == "__main__":
    print(f"UI → http://localhost:8080  |  Colab: {COLAB_URL}")
    # The debugger can execute arbitrary code, so bind to loopback only.
    app.run(host="127.0.0.1", port=8080, debug=True)
'''
# Explicit UTF-8: the source contains non-ASCII characters and Python reads
# source files as UTF-8 by default.
with open("laptop_server.py", "w", encoding="utf-8") as f:
    f.write(srv)
print(" laptop_server.py written")

In [None]:
# Cell B — Start Colab Flask API + ngrok
#
# Serves /health and /ask from a background thread inside the notebook
# process and exposes the port through an ngrok tunnel.
import threading
import time
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok

# Stop any ngrok process pyngrok started earlier (e.g. a previous run of this cell).
ngrok.kill()

# https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_TOKEN = ""
ngrok.set_auth_token(NGROK_TOKEN)

api = Flask(__name__)
CORS(api)

# Upper bound for the client-supplied top_k; the endpoint is publicly reachable.
MAX_TOP_K = 50

@api.route("/health")
def health():
    return jsonify({"status":"ok", "device":DEVICE})

@api.route("/ask", methods=["POST"])
def ask():
    # force=True parses the body regardless of Content-Type; silent=True turns
    # a malformed body into None so it can be rejected with a JSON error.
    d = request.get_json(force=True, silent=True)
    if not isinstance(d, dict):
        return jsonify({"error":"Invalid JSON body"}), 400
    q = d.get("question","")
    q = q.strip() if isinstance(q, str) else ""
    if len(q)<2: return jsonify({"error":"Query too short"}), 400

    # Validate and clamp top_k: a non-integer would otherwise surface as a 500,
    # and zero or a very large value would return far more rows than intended.
    try:
        top_k = int(d.get("top_k",3))
    except (TypeError, ValueError):
        return jsonify({"error":"top_k must be an integer"}), 400
    top_k = max(1, min(top_k, MAX_TOP_K))

    # eng_ft is the fine-tuned search engine built in an earlier cell.
    raw = eng_ft.search(q, top_k)
    return jsonify({
        "analysis": {"intent":q},
        "results" : [{"id":i+1, **r} for i,r in enumerate(raw)],
    })
PORT = 5001

# Run Flask in a daemon thread so the notebook cell returns; the reloader is
# disabled because it cannot run outside the main thread.
threading.Thread(target=lambda: api.run(host="0.0.0.0", port=PORT,
                                        use_reloader=False, debug=False),
                 daemon=True).start()

# Give the server a moment to start before opening the tunnel.
time.sleep(2)


url = ngrok.connect(PORT).public_url
print(f"\n Colab API live → {url}")
print(f" 1. Edit laptop_server.py → COLAB_URL = '{url}'")

In [None]:
# Smoke test — call the local Flask API directly, bypassing the ngrok tunnel.
import requests as req

# Use the port the API was actually started on (PORT) instead of a hard-coded
# number that can drift from it.
base_url = f"http://localhost:{PORT}"
print("Health:", req.get(f"{base_url}/health", timeout=10).json())
r = req.post(f"{base_url}/ask",
             json={"question":"حكم صلاة الجمعة","top_k":2}, timeout=60).json()
for res in r.get("results",[]):
    print(f"  [{res['confidence']}] {res['title'][:50] or res['question'][:50]}...")

In [None]:
# Stop ngrok: kill the ngrok process managed by pyngrok, then confirm on stdout.
from pyngrok import ngrok; ngrok.kill(); print("Tunnel closed")