In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


#TFIDF

In [None]:


# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk, contractions
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import datefinder
import country_converter as coco
from datetime import datetime
from tqdm import tqdm

# تحميل موارد NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# إعداد المسارات
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_antique9.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_antique9.tsv"
CLEANED_TSV_PATH = "cleaned_docs_antique9.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_antique9.joblib"
Json_file = os.path.join(BASE_PATH, "qas.search.json")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def processing(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = expand_contractions(text, contractions_dict)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\b\d{1,2}\b", "", text)
    text = re.sub(r"\b\d{5,}\b", "", text)
    return text

def tokenize(text):
    tokens = word_tokenize(text)
    tokens_pos = pos_tag(tokens)
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tokens_pos
        if word not in stop_words and len(word) > 1
    ]
    return lemmatized

def clean_text(text):
    preprocessing = processing(text)
    tokenizing = tokenize(preprocessing)
    return tokenizing

def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out="cleaned_docs_antique9.tsv"):
    cleaned_data = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
        doc_id = row["doc_id"]
        raw_text = row["text"]
        processed_tokens = clean_text(raw_text)

        if not processed_tokens:  # تجاهل المستندات التي أصبحت فارغة
            continue

        processed_text = " ".join(processed_tokens)
        light_text = light_clean(raw_text)

        cleaned_data.append({
            "doc_id": doc_id,
            "text": raw_text,
            "processed_text": processed_text,
            "dataset_name": dataset_name,
            "light_clean_text": light_text
        })

    cleaned_df = pd.DataFrame(cleaned_data)
    cleaned_df.to_csv(path_out, sep="\t", index=False)


def load_dataset(path):
    df = pd.read_csv(path, sep="\t", names=["doc_id", "text"], header=None, skiprows=1)
    df["text"] = df["text"].fillna("")
    return df

df = load_dataset(DOCS_PATH)
save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out="cleaned_docs_antique9.tsv")

def build_inverted_index_from_tsv(tsv_path=CLEANED_TSV_PATH):
    df = pd.read_csv(tsv_path, sep="\t")
    index = defaultdict(set)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🔧 بناء الفهرس"):
        doc_id = str(row["doc_id"])
        tokens = str(row["processed_text"]).split()
        for token in tokens:
            index[token].add(doc_id)
    os.makedirs(os.path.dirname(INVERTED_PATH), exist_ok=True)
    with open(INVERTED_PATH, "w", encoding="utf-8") as f:
        json.dump({k: list(v) for k, v in index.items()}, f, ensure_ascii=False, indent=2)
    return index

inverted_index = build_inverted_index_from_tsv()

# def load_and_clean_queries(path):
#     data = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             parts = line.strip().split("\t")
#             if len(parts) >= 2:
#                 query_id = parts[0]
#                 text = " ".join(parts[1:])
#                 data.append((query_id, text))
#     df = pd.DataFrame(data, columns=["query_id", "text"])
#     df["clean_text"] = df["text"].apply(clean_text)
#     with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
#         for _, row in df.iterrows():
#             f.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")
#     return df


def load_and_clean_queries(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        next(f)  # تخطي السطر الأول (الرأس)
        for line in f:
            parts = line.strip().split("\t", maxsplit=1)
            if len(parts) == 2:
                query_id = parts[0]
                text = parts[1].replace('\t', ' ')  # لو كان النص فيه كلمات مفصولة بـ tab
                data.append((query_id, text))

    # إنشاء DataFrame
    df = pd.DataFrame(data, columns=["query_id", "text"])

    # تنظيف النص
    df["clean_text"] = df["text"].apply(clean_text)

    # حفظ النتائج إلى ملف جديد
    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")

    return df

queries_df = load_and_clean_queries(QUERIES_PATH)

def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict



queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    docs = set()
    for token in query_tokens:
        docs |= set(index.get(token, []))
    return list(docs)

def load_inverted_index(path):
    with open(path, encoding="utf-8") as f:
        return json.load(f)

inverted_index = load_inverted_index(INVERTED_PATH)

tqdm.pandas(desc=">> تجهيز النصوص")

def custom_preprocessor(text):
    return processing(text)

def custom_tokenizer(text):
    return tokenize(text)

if os.path.exists(tfidf_model_path):
    tfidf_data = joblib.load(tfidf_model_path)
    vectorizer, X, tfidf_doc_ids = tfidf_data["vectorizer"], tfidf_data["vectors"], tfidf_data["doc_ids"]
else:

    vectorizer = TfidfVectorizer(
    lowercase=False,
    preprocessor=custom_preprocessor,
    tokenizer=custom_tokenizer,

    )


    X = vectorizer.fit_transform(df["text"])
    tfidf_doc_ids = df["doc_id"].astype(str).tolist()
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump({"vectorizer": vectorizer, "vectors": X, "doc_ids": tfidf_doc_ids}, tfidf_model_path)

loaded_model = joblib.load(tfidf_model_path)
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    return vectorizer.transform([" ".join(tokens)])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=100):
    id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    indices = [id_to_idx[doc_id] for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not indices:
        return []
    subset_vectors = doc_vectors[indices]
    sims = cosine_similarity(query_vec, subset_vectors).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(retrieved_ids[i], sims[i]) for i in ranked_idx]

retrieved_docs_dict = {}
for i, (qid, tokens) in enumerate(tqdm(queries_tokens.items(), desc="🔍 استرجاع وترتيب المستندات"), start=1):
    retrieved = retrieve_docs(tokens, inverted_index)
    query_vec = represent_query(tokens)
    if query_vec is None:
      print("استعلام فارغ، تم تجاهله")
      continue
    ranked = rank_docs(query_vec, retrieved, X, tfidf_doc_ids, top_k=10)
    retrieved_docs_dict[qid] = ranked

# def get_qrels(path=QRELS_PATH):
#     df = pd.read_csv(path, sep="\t")
#     df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
#     qrel_dict = defaultdict(dict)
#     relevant_set = defaultdict(set)
#     for _, row in df.iterrows():
#         qid = str(row["query_id"])
#         docid = str(row["doc_id"])
#         rel = row["relevance"]
#         qrel_dict[qid][docid] = rel
#         if rel > 0:
#             relevant_set[qid].add(docid)
#     return qrel_dict, relevant_set



def get_qrels(path=QRELS_PATH):
    # نقرأ الملف لكن نتعامل مع الأسطر الغلط ونتجاهلها
    try:
        df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], skiprows=1, engine='python')
    except Exception as e:
        print(f"⚠️ خطأ أثناء قراءة الملف: {e}")
        return {}, {}

    # حذف الأسطر اللي فيها أعمدة ناقصة أو فيها قيم ناقصة
    df = df.dropna(subset=["query_id", "doc_id", "relevance"])

    # تحويل العمود لقيم صحيحة والتعامل مع القيم الخاطئة
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)

    for _, row in df.iterrows():
        qid = str(row["query_id"]).strip()
        docid = str(row["doc_id"]).strip()
        rel = row["relevance"]

        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)

    return qrel_dict, relevant_set


def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0:
            continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(dataset_name="antique"):
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = {qid: [str(docid) for docid in docs] for qid, docs in get_retrieved_docs_formatted(retrieved_docs_dict).items()}
    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, recall_scores = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, precision_scores = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

cal_evaluations("antique")










[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
🧼 تنظيف وتخزين المستندات: 100%|██████████| 403458/403458 [26:16<00:00, 255.93it/s]
🔧 بناء الفهرس: 100%|██████████| 400862/400862 [00:41<00:00, 9578.45it/s] 
🔍 استرجاع وترتيب المستندات: 100%|██████████| 200/200 [01:12<00:00,  2.77it/s]



== Evaluation for dataset: antique ==
MAP: 10.27 %
MRR: 76.37 %
Mean Precision: 40.05 %
Mean Recall: 12.84 %
Mean Precision@10: 38.4 %


#BM25

In [None]:
# تثبيت الحزم (تحتاج مرة واحدة فقط)
!pip install nltk==3.8.1 contractions scikit-learn tqdm rank_bm25

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk, contractions
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from tqdm import tqdm

# تحميل موارد NLTK (مرة واحدة)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# إعداد المسارات كما في كودك
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_antique.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_antique.tsv"
CLEANED_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/cleaned_docs_antique.tsv"
BM25_MODEL_PATH = f"/content/drive/MyDrive/utils/bm25_model/{DATASET_NAME}_bm25.joblib"

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def processing(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = expand_contractions(text, contractions_dict)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\b\d{1,2}\b", "", text)
    text = re.sub(r"\b\d{5,}\b", "", text)
    return text

def tokenize(text):
    tokens = word_tokenize(text)
    tokens_pos = pos_tag(tokens)
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tokens_pos
        if word not in stop_words and len(word) > 1
    ]
    return lemmatized

def clean_text(text):
    preprocessing = processing(text)
    tokenizing = tokenize(preprocessing)
    return tokenizing

def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
#     cleaned_data = []
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
#         doc_id = row["doc_id"]
#         raw_text = row["text"]
#         processed_tokens = clean_text(raw_text)
#         processed_text = " ".join(processed_tokens)
#         light_text = light_clean(raw_text)

#         cleaned_data.append({
#             "doc_id": doc_id,
#             "text": raw_text,
#             "processed_text": processed_text,
#             "dataset_name": dataset_name,
#             "light_clean_text": light_text
#         })

#     cleaned_df = pd.DataFrame(cleaned_data)
#     cleaned_df.to_csv(path_out, sep="\t", index=False)

def load_dataset(path):
    df = pd.read_csv(path, sep="\t", names=["doc_id", "text"], header=None, skiprows=1)
    df["text"] = df["text"].fillna("")
    return df

# === بناء نموذج BM25 ===
def build_bm25_model(cleaned_docs_tsv=CLEANED_TSV_PATH, save_path=BM25_MODEL_PATH):
    df = pd.read_csv(cleaned_docs_tsv, sep="\t")
    df = df[df['processed_text'].notna() & (df['processed_text'].str.strip() != "")]

    corpus = [doc.split() for doc in df["processed_text"]]
    doc_ids = df["doc_id"].astype(str).tolist()

    bm25 = BM25Okapi(corpus)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    joblib.dump({"bm25": bm25, "doc_ids": doc_ids, "corpus": corpus}, save_path)
    print("تم بناء وحفظ نموذج BM25")

def load_bm25_model(path=BM25_MODEL_PATH):
    if not os.path.exists(path):
        raise FileNotFoundError("نموذج BM25 غير موجود، قم ببنائه أولاً.")
    return joblib.load(path)

def clean_and_tokenize_query(raw_query):
    return clean_text(raw_query)

def retrieve_bm25_docs(query_tokens, bm25_data, top_k=10):
    bm25 = bm25_data["bm25"]
    doc_ids = bm25_data["doc_ids"]
    scores = bm25.get_scores(query_tokens)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    results = [(doc_ids[i], scores[i]) for i in ranked_indices]
    return results

# === تحميل وتنظيف البيانات ===
df = load_dataset(DOCS_PATH)
# save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH)

# === بناء نموذج BM25 ===
build_bm25_model(CLEANED_TSV_PATH, BM25_MODEL_PATH)

# === تحميل واستدعاء النموذج ===
bm25_data = load_bm25_model(BM25_MODEL_PATH)

# === تحميل وتنظيف الاستعلامات ===
def load_and_clean_queries(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                query_id = parts[0]
                text = " ".join(parts[1:])
                data.append((query_id, text))
    df = pd.DataFrame(data, columns=["query_id", "text"])
    df["clean_text"] = df["text"].apply(clean_text)
    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")
    return df

queries_df = load_and_clean_queries(QUERIES_PATH)

def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip(): continue
            parts = line.strip().split("\t")
            if len(parts) < 2: continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict

queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# === استرجاع المستندات باستخدام BM25 ===
retrieved_docs_dict = {}
for qid, tokens in tqdm(queries_tokens.items(), desc="🔍 استرجاع المستندات باستخدام BM25"):
    ranked = retrieve_bm25_docs(tokens, bm25_data, top_k=10)
    retrieved_docs_dict[qid] = ranked

# === دوال التقييم نفسها من كودك الأصلي ===

def get_qrels(path=QRELS_PATH):
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], skiprows=1)
    df = df.dropna(subset=["query_id", "doc_id", "relevance"])
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict, relevant_set = defaultdict(dict), defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]).strip(), str(row["doc_id"]).strip(), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0: relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0:
            continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall
    return np.mean(list(recall_scores.values())) * 100

# === تحميل الـ qrels ===
qrel_dict, real_relevant = get_qrels(QRELS_PATH)

# === استرجاع المستندات بدون الدرجات فقط ===
retrieved_only_docs = get_retrieved_docs_formatted(retrieved_docs_dict)

# === تقييم الأداء ===
map_score = calculate_map(qrel_dict, retrieved_only_docs)
mrr_score = calculate_mrr(qrel_dict, retrieved_only_docs)
mean_precision = calculate_mean_precision(qrel_dict, retrieved_only_docs)
mean_recall = calculate_mean_recall(qrel_dict, real_relevant, retrieved_only_docs)

print(f"MAP: {map_score:.2f}%")
print(f"MRR: {mrr_score:.2f}%")
print(f"Mean Precision: {mean_precision:.2f}%")
print(f"Mean Recall: {mean_recall:.2f}%")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


تم بناء وحفظ نموذج BM25


🔍 استرجاع المستندات باستخدام BM25: 100%|██████████| 200/200 [02:35<00:00,  1.29it/s]


MAP: 22.49%
MRR: 93.60%
Mean Precision: 73.20%
Mean Recall: 24.05%


#TFIDF WITH TOPK=100

In [None]:
#تثبيت الحزم (مرة واحدة فقط)
print(">> تثبيت الحزم (إذا لم تكن منصبة)...")
!pip install nltk==3.8.1 contractions scikit-learn tqdm


# استيراد المكتبات
print(">> استيراد المكتبات...")
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk, contractions
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# تحميل موارد NLTK
print(">> تحميل موارد NLTK...")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# إعداد المسارات
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index12.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens12.tsv"

print(">> تهيئة التنظيف والمعالجة...")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# ربط الوسوم الصرفية بووردنت
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# قائمة الاختصارات الشائعة
contractions_dict = {
    "u": "you",
    "r": "are",
    "wanna": "want to",
    "can't": "cannot",
    "don't": "do not",
    "didn't": "did not",
    "it's": "it is",
    "i'm": "i am",
    # يمكن إضافة المزيد
}

# توسيع الاختصارات
def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)


# المعالجة الأساسية للنص
def processing(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = expand_contractions(text, contractions_dict)
    # text = normalize_date(text)
    text = text.encode("ascii", errors="ignore").decode()  # إزالة الرموز غير القابلة للترميز
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)     # حذف الروابط
    text = re.sub(r"\S+@\S+", "", text)                     # حذف الإيميلات
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # حذف علامات الترقيم
    text = re.sub(r"\b\d{1,2}\b", "", text)                 # حذف الأرقام الصغيرة فقط
    text = re.sub(r"\b\d{5,}\b", "", text)                  # حذف أرقام طويلة جداً (ممكن ID أو رقم هاتف)
    return text

# توكنيزيشن وتلميتايزيشن
def tokenize(text):
    tokens = word_tokenize(text)
    tokens_pos = pos_tag(tokens)
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tokens_pos
        if word not in stop_words and len(word) > 1
    ]
    return lemmatized

# الدالة النهائية
def clean_text(text):
    preprocessing = processing(text)
    tokenizing = tokenize(preprocessing)
    return tokenizing

def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def save_cleaned_docs(df, dataset_name="antique", path_out="cleaned_docs12.tsv"):
    """
    تنظيف المستندات وتخزينها في ملف TSV مع الأعمدة التالية:
    doc_id, text, processed_text, dataset_name, light_clean_text
    """
    cleaned_data = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
        doc_id = row["doc_id"]
        raw_text = row["text"]

        # التنظيف العميق
        processed_tokens = clean_text(raw_text)
        processed_text = " ".join(processed_tokens)

        # التنظيف الخفيف
        light_text = light_clean(raw_text)

        cleaned_data.append({
            "doc_id": doc_id,
            "text": raw_text,
            "processed_text": processed_text,
            "dataset_name": dataset_name,
            "light_clean_text": light_text
        })

    cleaned_df = pd.DataFrame(cleaned_data)
    cleaned_df.to_csv(path_out, sep="\t", index=False)
    print(f"✅ تم حفظ الملف في: {path_out}")


print(">> تحميل البيانات...")
def load_dataset(path):
    df = pd.read_csv(path, sep="\t", names=["doc_id", "text"], header=None, skiprows=1)
    df["text"] = df["text"].fillna("")
    return df

df = load_dataset(DOCS_PATH)
print(f"   - عدد المستندات: {len(df)}")

save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out="cleaned_docs12.tsv")



print(">> بناء الفهرس العكسي (Inverted Index)...")
def build_inverted_index(df):
    index = defaultdict(set)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🔧 بناء الفهرس"):
        for token in clean_text(row["text"]):
            index[token].add(str(row["doc_id"]))
    os.makedirs(os.path.dirname(INVERTED_PATH), exist_ok=True)
    with open(INVERTED_PATH, "w") as f:
        json.dump({k: list(v) for k, v in index.items()}, f)
    return index
inverted_index = build_inverted_index(df)
print(f"   - عدد كلمات الفهرس: {len(inverted_index)}")

print(">> تحميل وتنظيف الاستعلامات...")

def load_and_clean_queries(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                query_id = parts[0]
                text = " ".join(parts[1:])
                data.append((query_id, text))

    df = pd.DataFrame(data, columns=["query_id", "text"])
    df["clean_text"] = df["text"].apply(clean_text)

    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")

    return df


queries_df = load_and_clean_queries(QUERIES_PATH)
print(f"   - عدد الاستعلامات: {len(queries_df)}")

def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip(): continue
            parts = line.strip().split("\t")
            if len(parts) < 2: continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    docs = set()
    for token in query_tokens:
        docs |= set(index.get(token, []))
    return list(docs)

print(">> تحميل الفهرس العكسي من الملف...")
def load_inverted_index(path):
    with open(path, encoding="utf-8") as f:
        return json.load(f)
inverted_index = load_inverted_index(INVERTED_PATH)

print(">> استرجاع المستندات واستعمال TF-IDF...")
print("from file")
tqdm.pandas(desc=">> تجهيز النصوص")

# نظف النصوص مسبقاً
# df["clean_text"] = df["text"].progress_apply(lambda x: " ".join(clean_text(x)))

# # نموذج TF-IDF
# tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf12.joblib"

# if os.path.exists(tfidf_model_path):
#     tfidf_data = joblib.load(tfidf_model_path)
#     vectorizer, X, tfidf_doc_ids = tfidf_data["vectorizer"], tfidf_data["vectors"], tfidf_data["doc_ids"]
# else:
#     print(">> من البداية: تدريب نموذج TF-IDF")
#     vectorizer = TfidfVectorizer(lowercase=False)  # لا حاجة لـ tokenizer/preprocessor الآن
#     X = vectorizer.fit_transform(df["clean_text"])
#     tfidf_doc_ids = df["doc_id"].astype(str).tolist()
#     os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
#     joblib.dump({"vectorizer": vectorizer, "vectors": X, "doc_ids": tfidf_doc_ids}, tfidf_model_path)

df["clean_text"] = df["text"].progress_apply(lambda x: " ".join(clean_text(x)))

if os.path.exists(tfidf_model_path):
    tfidf_data = joblib.load(tfidf_model_path)
    vectorizer, X, tfidf_doc_ids = tfidf_data["vectorizer"], tfidf_data["vectors"], tfidf_data["doc_ids"]
else:
    print(">> من البداية: تدريب نموذج TF-IDF")
     vectorizer = TfidfVectorizer(
     preprocessor=processing,
     tokenizer=tokenize,
     lowercase=False,
     token_pattern=None
    )
    X = vectorizer.fit_transform(df["clean_text"])
    tfidf_doc_ids = df["doc_id"].astype(str).tolist()
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump({"vectorizer": vectorizer, "vectors": X, "doc_ids": tfidf_doc_ids}, tfidf_model_path)


def represent_query(tokens): return vectorizer.transform([" ".join(tokens)])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10):
    id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    indices = [id_to_idx[doc_id] for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not indices: return []
    subset_vectors = doc_vectors[indices]
    sims = cosine_similarity(query_vec, subset_vectors).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(retrieved_ids[i], sims[i]) for i in ranked_idx if sims[i] > 0]

print(">> استرجاع وترتيب المستندات لكل الاستعلامات...")
retrieved_docs_dict = {}
for i, (qid, tokens) in enumerate(tqdm(queries_tokens.items(), desc="🔍 استرجاع وترتيب المستندات"), start=1):
    retrieved = retrieve_docs(tokens, inverted_index)
    query_vec = represent_query(tokens)
    ranked = rank_docs(query_vec, retrieved, X, tfidf_doc_ids, top_k=100)
    retrieved_docs_dict[qid] = ranked

    if i % 10 == 0 or i == len(queries_tokens):
        print(f"   - تمت معالجة {i} من {len(queries_tokens)} استعلامات")

# التقييم
print(">> تحميل qrels وتحضير التقييم...")
def get_qrels(path=QRELS_PATH):
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], skiprows=1)
    df = df.dropna(subset=["query_id", "doc_id", "relevance"])
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict, relevant_set = defaultdict(dict), defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]).strip(), str(row["doc_id"]).strip(), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0: relevant_set[qid].add(docid)
    return qrel_dict, relevant_set


def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        # if num_relevant == 0:
        #     continue  # تجاهل الاستعلامات التي ليس لها مستندات ذات صلة
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0


def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

print(">> بدء التقييم النهائي...")
def cal_evaluations(dataset_name="antique"):
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs = get_retrieved_docs_formatted(retrieved_docs_dict)

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs), 2), "%")
    mean_recall, recall_scores = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, precision_scores = calculate_precision_at_k(qrel_dict, retrieved_docs)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

cal_evaluations("antique")



>> تثبيت الحزم (إذا لم تكن منصبة)...
>> استيراد المكتبات...
>> تحميل موارد NLTK...
>> تهيئة التنظيف والمعالجة...
>> تحميل البيانات...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   - عدد المستندات: 403458


🧼 تنظيف وتخزين المستندات: 100%|██████████| 403458/403458 [17:14<00:00, 389.84it/s]


✅ تم حفظ الملف في: cleaned_docs12.tsv
>> بناء الفهرس العكسي (Inverted Index)...


🔧 بناء الفهرس: 100%|██████████| 403458/403458 [17:24<00:00, 386.35it/s]


   - عدد كلمات الفهرس: 160415
>> تحميل وتنظيف الاستعلامات...
   - عدد الاستعلامات: 200
>> تحميل الفهرس العكسي من الملف...
>> استرجاع المستندات واستعمال TF-IDF...
from file


>> تجهيز النصوص: 100%|██████████| 403458/403458 [15:48<00:00, 425.40it/s]


>> من البداية: تدريب نموذج TF-IDF
>> استرجاع وترتيب المستندات لكل الاستعلامات...


🔍 استرجاع وترتيب المستندات:   6%|▌         | 11/200 [00:02<00:35,  5.26it/s]

   - تمت معالجة 10 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  10%|█         | 21/200 [00:04<00:28,  6.24it/s]

   - تمت معالجة 20 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  15%|█▌        | 30/200 [00:06<00:39,  4.34it/s]

   - تمت معالجة 30 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  20%|██        | 41/200 [00:08<00:27,  5.71it/s]

   - تمت معالجة 40 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  26%|██▌       | 51/200 [00:10<00:30,  4.86it/s]

   - تمت معالجة 50 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  30%|███       | 61/200 [00:12<00:25,  5.41it/s]

   - تمت معالجة 60 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  35%|███▌      | 70/200 [00:13<00:22,  5.67it/s]

   - تمت معالجة 70 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  40%|████      | 81/200 [00:15<00:19,  6.18it/s]

   - تمت معالجة 80 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  45%|████▌     | 90/200 [00:17<00:18,  5.87it/s]

   - تمت معالجة 90 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  50%|█████     | 101/200 [00:19<00:19,  5.17it/s]

   - تمت معالجة 100 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  56%|█████▌    | 111/200 [00:21<00:16,  5.24it/s]

   - تمت معالجة 110 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  60%|██████    | 120/200 [00:23<00:22,  3.48it/s]

   - تمت معالجة 120 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  65%|██████▌   | 130/200 [00:26<00:12,  5.47it/s]

   - تمت معالجة 130 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  70%|███████   | 141/200 [00:28<00:09,  6.12it/s]

   - تمت معالجة 140 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  75%|███████▌  | 150/200 [00:29<00:08,  5.67it/s]

   - تمت معالجة 150 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  80%|████████  | 161/200 [00:31<00:05,  7.09it/s]

   - تمت معالجة 160 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  86%|████████▌ | 171/200 [00:33<00:05,  5.77it/s]

   - تمت معالجة 170 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  90%|█████████ | 181/200 [00:35<00:03,  6.31it/s]

   - تمت معالجة 180 من 200 استعلامات


🔍 استرجاع وترتيب المستندات:  96%|█████████▌| 191/200 [00:38<00:02,  4.22it/s]

   - تمت معالجة 190 من 200 استعلامات


🔍 استرجاع وترتيب المستندات: 100%|██████████| 200/200 [00:40<00:00,  4.94it/s]


   - تمت معالجة 200 من 200 استعلامات
>> تحميل qrels وتحضير التقييم...
>> بدء التقييم النهائي...

== Evaluation for dataset: antique ==
MAP: 20.09 %
MRR: 76.6 %
Mean Precision: 16.36 %
Mean Recall: 44.08 %
Mean Precision@10: 38.45 %


#EMBEDDING

In [2]:
import os, re, string, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# ======== تحميل موارد NLTK ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== إعداد المسارات ========
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_TSV_PATH =f"/content/drive/MyDrive/utils/clean_docs/antique_cleaned_docs_antique"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
CLEAN_QUERIES_PATH = "/content/drive/MyDrive/utils/queries_tokens/light_cleaned_queries_antique.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_antique.joblib")
TOP_K = 50

bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== الدوال المساعدة للنصوص ========
def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ======== تحميل المستندات وتمثيلها ========
def load_or_generate_doc_embeddings():
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    if "light_clean_text" not in df_docs.columns:
        print("⚠️ عمود 'light_clean_text' غير موجود — سيتم توليده.")
        df_docs["light_clean_text"] = df_docs["light_clean_text"].fillna("").astype(str)

    print("🧪 عينة من المستندات المنظفة:")
    print(df_docs[["doc_id", "light_clean_text"]].head())

    doc_embeddings = bert_model.encode(
        df_docs["light_clean_text"].tolist(),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    doc_ids = df_docs["doc_id"].astype(str).tolist()
    raw_docs = dict(zip(doc_ids, df_docs["text"]))

    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump({
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }, EMBEDDING_PATH)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return {
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }


def load_and_clean_queries(path):
    print(f"📥 تحميل الاستعلامات من: {path}")

    # معالجة الملف يدويًا لأن التنسيق غير منتظم (الكلمات موزعة على أعمدة كثيرة)
    cleaned_queries = []

    with open(path, 'r', encoding='utf-8') as f:
        next(f)  # تخطي الرأس
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                query_id = parts[0]
                text = ' '.join(parts[1:])
                cleaned_queries.append((query_id, text))

    # إنشاء DataFrame
    df = pd.DataFrame(cleaned_queries, columns=['query_id', 'text'])

    # تنظيف النصوص
    df["light_clean_text"] = df["text"].apply(light_clean)

    print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(df)}")

    print("🧪 عينة استعلامات:")
    print(df.head(3))

    # حفظ النسخة النظيفة إلى ملف TSV بصيغة: query_id \t light_clean_text
    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

    return df

# def load_and_clean_queries(path):
#     print(f"📥 تحميل الاستعلامات من: {path}")
#     df = pd.read_csv(path, sep="\t", header=0, dtype=str)
#     print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(df)}")

#     df["light_clean_text"] = df["text"].apply(light_clean)

#     print("🧪 عينة استعلامات:")
#     print(df.head(3))

#     with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as f:
#         for _, row in df.iterrows():
#             f.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

#     return df


# ======== تمثيل الاستعلامات ========
def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            tokens = parts[1].split()
            if not all(isinstance(t, str) for t in tokens):
                continue
            tokens_dict[str(parts[0])] = tokens
    return tokens_dict


def encode_queries(queries_tokens_dict, model):
    query_ids = []
    query_texts = []
    for qid, tokens in queries_tokens_dict.items():
        if isinstance(tokens, list) and all(isinstance(t, str) for t in tokens):
            query_ids.append(qid)
            query_texts.append(" ".join(tokens))
    print(f"🧠 تم تمثيل {len(query_texts)} استعلام")
    embeddings = model.encode(query_texts, convert_to_numpy=True, normalize_embeddings=True)
    return dict(zip(query_ids, embeddings))

# ======== الاسترجاع ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    results = {}
    for qid, q_embed in query_embeddings_dict.items():
        sims = cosine_similarity([q_embed], doc_embeddings).flatten()
        ranked_idx = np.argsort(-sims)[:top_k]
        results[qid] = [(doc_ids[i], float(sims[i])) for i in ranked_idx]
    return results

# ======== تحميل QRELS ========
# def get_qrels(path=QRELS_PATH):
#     df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
#     df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

#     qrel_dict = defaultdict(dict)
#     relevant_set = defaultdict(set)
#     for _, row in df.iterrows():
#         qid, docid, rel = str(row["query_id"]), str(row["doc_id"]), row["relevance"]
#         qrel_dict[qid][docid] = rel
#         if rel > 0:
#             relevant_set[qid].add(docid)
#     return qrel_dict, relevant_set

def get_qrels(path=QRELS_PATH):
    # قراءة الملف مع السماح بأي مسافات أو تاب كفاصل
    df = pd.read_csv(path, sep=r'\s+', header=0, names=["query_id", "doc_id", "relevance"])

    # تأكد أن عمود relevance رقمي
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]), str(row["doc_id"]), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set


# ======== التقييم ========
def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0:
            continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

# ======== التقييم النهائي ========
def cal_evaluations(retrieved_docs, dataset_name="antique"):
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = {qid: [str(docid) for docid in docs] for qid, docs in get_retrieved_docs_formatted(retrieved_docs).items()}
    print(f"📌 استعلامات للتقييم: {len(retrieved_docs_str)}")
    print("🔎 عدد استعلامات qrels:", len(qrel_dict))
    print("🔎 عدد استعلامات الاسترجاع:", len(retrieved_docs_str))
    print("🔗 عدد الاستعلامات المشتركة بين qrels والاسترجاع:", len(set(qrel_dict.keys()) & set(retrieved_docs_str.keys())))

# ⛔ تحليل المستندات المفقودة في التمثيل
    all_qrels_doc_ids = {docid for rels in qrel_dict.values() for docid in rels}
    missing_doc_ids = all_qrels_doc_ids - set(doc_ids)
    print(f"❗ عدد المستندات المشار لها في qrels لكنها غير موجودة في التمثيل: {len(missing_doc_ids)}")

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# ======== تشغيل كامل ========
if __name__ == "__main__":
    embedding_data = load_or_generate_doc_embeddings()
    doc_embeddings = embedding_data["embeddings"]
    doc_ids = embedding_data["doc_ids"]

    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام")

    cal_evaluations(retrieved_docs, dataset_name="antique")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/antique_embedding_antique.joblib
📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
🔎 عدد الاستعلامات بعد التنظيف: 200
🧪 عينة استعلامات:
  query_id                                               text  \
0  3990512          how can we get concentration onsomething?   
1   714612  Why doesn't the water fall off earth if it's r...   
2  2528767  How do I determine the charge of the iron ion ...   

                                    light_clean_text  
0           how can we get concentration onsomething  
1  why doesn t the water fall off earth if it s r...  
2  how do i determine the charge of the iron ion ...  
🧠 تم تمثيل 200 استعلام
🧠 تم تمثيل 200 استعلام
📥 تم استرجاع نتائج لـ 200 استعلام

📊 بدء التقييم...
📌 استعلامات للتقييم: 200
🔎 عدد استعلامات qrels: 200
🔎 عدد استعلامات الاسترجاع: 200
🔗 عدد الاستعلامات المشتركة بين qrels والاسترجاع: 200
❗ عدد المستندات المشار لها في qrels لكنها غير موج

#HYBRID

In [13]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder
!pip install -U sentence-transformers

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from datetime import datetime
from tqdm import tqdm
import nltk

# تحميل موارد NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# إعداد المسارات
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_antique1.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_antique.joblib"
embedding_file_path = f"/content/drive/MyDrive/embedding_model_joblib_file/{DATASET_NAME}_embedding_antique.joblib"

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'): return wordnet.ADJ
    elif treebank_tag.startswith('V'): return wordnet.VERB
    elif treebank_tag.startswith('N'): return wordnet.NOUN
    elif treebank_tag.startswith('R'): return wordnet.ADV
    return wordnet.NOUN

def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def processing(text):
    if not isinstance(text, str): return ""
    text = text.lower()
    text = expand_contractions(text, contractions_dict)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    tokens = word_tokenize(text)
    tokens_pos = pos_tag(tokens)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(pos))
            for word, pos in tokens_pos
            if word not in stop_words and len(word) > 1]

def clean_text(text):
    return tokenize(processing(text))

def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def load_and_clean_queries(path):
    print(f"📥 تحميل الاستعلامات من: {path}")

    # نقرأ الملف نصياً ثم نحول \t إلى تبويب حقيقي
    with open(path, encoding="utf-8") as f:
        lines = [line.encode().decode("unicode_escape").strip() for line in f if line.strip()]

    query_ids = []
    texts = []

    for i, line in enumerate(lines, 1):
        parts = line.split('\t', 1)  # نقسم فقط على أول تبويب
        if len(parts) != 2:
            print(f"⚠️ تجاهل السطر {i}: يحتوي على {len(parts)} أعمدة بدلاً من 2")
            continue
        query_ids.append(parts[0])
        texts.append(parts[1])

    df = pd.DataFrame({'query_id': query_ids, 'text': texts})
    print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(df)}")

    df["light_clean_text"] = df["text"].apply(light_clean)

    print("🧪 عينة استعلامات:")
    print(df.head(3))

    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

    return df


queries_df = load_and_clean_queries(QUERIES_PATH)

def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) != 2:
                print(f"⚠️ تجاهل السطر {i}: فيه {len(parts)} جزء بدل 2")
                continue
            tokens = parts[1].split()
            if not all(isinstance(t, str) for t in tokens):
                continue
            tokens_dict[str(parts[0])] = tokens
    return tokens_dict

# تحميل بيانات الاستعلامات
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# تحميل نموذج TF-IDF
tfidf_data = joblib.load(tfidf_model_path)
vectorizer = tfidf_data["vectorizer"]
X = tfidf_data["vectors"]
tfidf_doc_ids = tfidf_data["doc_ids"]

# تحميل نموذج Embedding والتمثيلات
embedding_model_path = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_path)
embedding_data = joblib.load(embedding_file_path)
doc_embeddings = embedding_data["embeddings"]
embedding_doc_ids = embedding_data["doc_ids"]

def represent_query_tfidf(tokens):
    return vectorizer.transform([" ".join(tokens)])

def represent_query_embedding(query_text):
    return embedding_model.encode([query_text], convert_to_numpy=True)[0]

def hybrid_rank(query_tokens, query_text, top_k=10, alpha=0.5):
    tfidf_vec = represent_query_tfidf(query_tokens)
    embedding_vec = represent_query_embedding(query_text)

    tfidf_sims = cosine_similarity(tfidf_vec, X).flatten()

    embedding_sims = cosine_similarity([embedding_vec], doc_embeddings).flatten()
    embedding_sims = embedding_sims[1:]

    hybrid_sims = alpha * tfidf_sims + (1 - alpha) * embedding_sims
    top_indices = np.argsort(-hybrid_sims)[:top_k]
    return [(tfidf_doc_ids[i], hybrid_sims[i]) for i in top_indices]

# تطبيق الاسترجاع لجميع الاستعلامات
retrieved_docs_dict = {}
for qid, tokens in tqdm(queries_tokens.items(), desc="Hybrid Retrieval"):
    text = " ".join(tokens)
    results = hybrid_rank(tokens, text, top_k=10, alpha=0.5)
    retrieved_docs_dict[qid] = results

# التقييم (مثل السابق)
def get_qrels(path=QRELS_PATH):
    df = pd.read_csv(path, sep=r'\s+', header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]), str(row["doc_id"]), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0: continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, []))
        if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(dataset_name="antique"):
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = get_retrieved_docs_formatted(retrieved_docs_dict)

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, recall_scores = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, precision_scores = calculate_precision_at_k(qrel_dict, retrieved_docs_str, k=10)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# استدعاء التقييم
cal_evaluations("antique")





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
⚠️ تجاهل السطر 1: يحتوي على 1 أعمدة بدلاً من 2
🔎 عدد الاستعلامات بعد التنظيف: 200
🧪 عينة استعلامات:
  query_id                                               text  \
0  3990512     how\tcan\twe\tget\tconcentration\tonsomething?   
1   714612  Why\tdoesn't\tthe\twater\tfall\toff\tearth\tif...   
2  2528767  How\tdo\tI\tdetermine\tthe\tcharge\tof\tthe\ti...   

                                    light_clean_text  
0           how can we get concentration onsomething  
1  why doesn t the water fall off earth if it s r...  
2  how do i determine the charge of the iron ion ...  


Hybrid Retrieval: 100%|██████████| 200/200 [03:22<00:00,  1.01s/it]



== Evaluation for dataset: antique ==
MAP: 8.32 %
MRR: 72.42 %
Mean Precision: 35.35 %
Mean Recall: 11.29 %
Mean Precision@10: 35.35 %


#EMBEDDING WITH VECTOR STORE

In [4]:
!pip install nltk==3.8.1 contractions scikit-learn tqdm faiss-cpu
!pip install -U sentence-transformers

import os, re, string, time, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# ======== تحميل موارد NLTK ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== إعداد المسارات ========
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/antique_cleaned_docs_antique"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
CLEAN_QUERIES_PATH = "light_cleaned_queries.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_antique.joblib")
FAISS_INDEX_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding.index")
FAISS_META_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_meta.json")
TOP_K = 50
RETRIEVAL_MODE = "faiss"  # اختر بين "cosine" أو "faiss"

bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== تنظيف نصوص ========
def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ======== تحميل أو توليد تمثيلات المستندات ========
def load_or_generate_doc_embeddings():
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    if "light_clean_text" not in df_docs.columns:
        df_docs["light_clean_text"] = df_docs["text"].astype(str).apply(light_clean)

    doc_embeddings = bert_model.encode(
        df_docs["light_clean_text"].tolist(),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    doc_ids = df_docs["doc_id"].astype(str).tolist()
    raw_docs = dict(zip(doc_ids, df_docs["text"]))

    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump({
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }, EMBEDDING_PATH)

    # حفظ metadata لـ FAISS لاحقاً
    with open(FAISS_META_PATH, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return {
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }


def build_and_save_faiss_index(embeddings, index_path, meta_path, doc_ids):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)  # استخدم نوع الفهرس المناسب
    index.add(embeddings.astype("float32"))
    faiss.write_index(index, index_path)

    # حفظ الـ metadata (doc_ids)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم بناء وحفظ فهرس FAISS في: {index_path}")


# ======== تحميل الاستعلامات ========

def load_and_clean_queries(path):
    print(f"📥 تحميل الاستعلامات من: {path}")

    # نقرأ الملف نصياً ثم نحول \t إلى تبويب حقيقي
    with open(path, encoding="utf-8") as f:
        lines = [line.encode().decode("unicode_escape").strip() for line in f if line.strip()]

    query_ids = []
    texts = []

    for i, line in enumerate(lines, 1):
        parts = line.split('\t', 1)  # نقسم فقط على أول تبويب
        if len(parts) != 2:
            print(f"⚠️ تجاهل السطر {i}: يحتوي على {len(parts)} أعمدة بدلاً من 2")
            continue
        query_ids.append(parts[0])
        texts.append(parts[1])

    df = pd.DataFrame({'query_id': query_ids, 'text': texts})
    print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(df)}")

    df["light_clean_text"] = df["text"].apply(light_clean)

    print("🧪 عينة استعلامات:")
    print(df.head(3))

    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

    return df


queries_df = load_and_clean_queries(QUERIES_PATH)


def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict

def encode_queries(queries_tokens_dict, model):
    query_ids = list(queries_tokens_dict.keys())
    query_texts = [" ".join(tokens) for tokens in queries_tokens_dict.values()]
    embeddings = model.encode(query_texts, convert_to_numpy=True, normalize_embeddings=True)
    return dict(zip(query_ids, embeddings))

# ======== استرجاع ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    results = {}
    for qid, q_embed in query_embeddings_dict.items():
        sims = cosine_similarity([q_embed], doc_embeddings).flatten()
        ranked_idx = np.argsort(-sims)[:top_k]
        results[qid] = [(doc_ids[i], float(sims[i])) for i in ranked_idx]
    return results

def retrieve_with_faiss(query_embeddings_dict, index_path, doc_ids, top_k=10):
    index = faiss.read_index(index_path)
    query_ids = list(query_embeddings_dict.keys())
    query_embeddings = np.array([query_embeddings_dict[qid] for qid in query_ids]).astype("float32")
    scores, indices = index.search(query_embeddings, top_k)

    results = {}
    for i, qid in enumerate(query_ids):
        results[qid] = [(doc_ids[idx], float(scores[i][j])) for j, idx in enumerate(indices[i])]
    return results

def load_doc_ids_from_meta(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# ======== تحميل QRELS ========
def get_qrels(path=QRELS_PATH):
    df = pd.read_csv(path, sep=r'\s+', header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]), str(row["doc_id"]), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

# ======== التقييم ========
def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0:
            continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(retrieved_docs, dataset_name="antique"):
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = {qid: [str(docid) for docid in docs] for qid, docs in get_retrieved_docs_formatted(retrieved_docs).items()}

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# ======== التشغيل الكامل ========
if __name__ == "__main__":
    embedding_data = load_or_generate_doc_embeddings()
    build_and_save_faiss_index(embedding_data["embeddings"], FAISS_INDEX_PATH, FAISS_META_PATH, embedding_data["doc_ids"])
    doc_embeddings = embedding_data["embeddings"]
    doc_ids = embedding_data["doc_ids"]

    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    if RETRIEVAL_MODE == "cosine":
        print("\n🚀 الاسترجاع باستخدام cosine_similarity...")
        start = time.time()
        retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    elif RETRIEVAL_MODE == "faiss":
        print("\n⚡ الاسترجاع باستخدام FAISS...")
        doc_ids = load_doc_ids_from_meta(FAISS_META_PATH)
        start = time.time()
        retrieved_docs = retrieve_with_faiss(query_embeddings, FAISS_INDEX_PATH, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    else:
        raise ValueError(f"❌ Retrieval mode '{RETRIEVAL_MODE}' not supported.")

    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام في {round(duration, 2)} ثانية")

    cal_evaluations(retrieved_docs, dataset_name=f"{DATASET_NAME}-{RETRIEVAL_MODE}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
⚠️ تجاهل السطر 1: يحتوي على 1 أعمدة بدلاً من 2
🔎 عدد الاستعلامات بعد التنظيف: 200
🧪 عينة استعلامات:
  query_id                                               text  \
0  3990512     how\tcan\twe\tget\tconcentration\tonsomething?   
1   714612  Why\tdoesn't\tthe\twater\tfall\toff\tearth\tif...   
2  2528767  How\tdo\tI\tdetermine\tthe\tcharge\tof\tthe\ti...   

                                    light_clean_text  
0           how can we get concentration onsomething  
1  why doesn t the water fall off earth if it s r...  
2  how do i determine the charge of the iron ion ...  
📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/antique_embedding_antique.joblib
✅ تم بناء وحفظ فهرس FAISS في: /content/drive/MyDrive/embedding_model_joblib_file/antique_embedding.index
📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
⚠️ تجاهل السطر 1: يحتوي على 1 أعمدة بدلاً من 2
🔎 عدد ا

#EMBEDDING WITH COSINE

In [5]:
!pip install nltk==3.8.1 contractions scikit-learn tqdm faiss-cpu
!pip install -U sentence-transformers

import os, re, string, time, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# ======== تحميل موارد NLTK ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== إعداد المسارات ========
DATASET_NAME = "antique"
BASE_PATH = f"/content/drive/MyDrive/datasets1/{DATASET_NAME}"
DOCS_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/antique_cleaned_docs_antique"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
CLEAN_QUERIES_PATH = "light_cleaned_queries.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_antique.joblib")
FAISS_INDEX_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding.index")
FAISS_META_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_meta.json")
TOP_K = 50
RETRIEVAL_MODE = "cosine"  # اختر بين "cosine" أو "faiss"

bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== تنظيف نصوص ========
def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ======== تحميل أو توليد تمثيلات المستندات ========
def load_or_generate_doc_embeddings():
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    if "light_clean_text" not in df_docs.columns:
        df_docs["light_clean_text"] = df_docs["text"].astype(str).apply(light_clean)

    doc_embeddings = bert_model.encode(
        df_docs["light_clean_text"].tolist(),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    doc_ids = df_docs["doc_id"].astype(str).tolist()
    raw_docs = dict(zip(doc_ids, df_docs["text"]))

    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump({
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }, EMBEDDING_PATH)

    # حفظ metadata لـ FAISS لاحقاً
    with open(FAISS_META_PATH, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return {
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }


def build_and_save_faiss_index(embeddings, index_path, meta_path, doc_ids):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)  # استخدم نوع الفهرس المناسب
    index.add(embeddings.astype("float32"))
    faiss.write_index(index, index_path)

    # حفظ الـ metadata (doc_ids)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم بناء وحفظ فهرس FAISS في: {index_path}")


# ======== تحميل الاستعلامات ========

def load_and_clean_queries(path):
    print(f"📥 تحميل الاستعلامات من: {path}")

    # نقرأ الملف نصياً ثم نحول \t إلى تبويب حقيقي
    with open(path, encoding="utf-8") as f:
        lines = [line.encode().decode("unicode_escape").strip() for line in f if line.strip()]

    query_ids = []
    texts = []

    for i, line in enumerate(lines, 1):
        parts = line.split('\t', 1)  # نقسم فقط على أول تبويب
        if len(parts) != 2:
            print(f"⚠️ تجاهل السطر {i}: يحتوي على {len(parts)} أعمدة بدلاً من 2")
            continue
        query_ids.append(parts[0])
        texts.append(parts[1])

    df = pd.DataFrame({'query_id': query_ids, 'text': texts})
    print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(df)}")

    df["light_clean_text"] = df["text"].apply(light_clean)

    print("🧪 عينة استعلامات:")
    print(df.head(3))

    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

    return df


queries_df = load_and_clean_queries(QUERIES_PATH)


def load_queries_tokens_from_tsv(path):
    tokens_dict = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict

def encode_queries(queries_tokens_dict, model):
    query_ids = list(queries_tokens_dict.keys())
    query_texts = [" ".join(tokens) for tokens in queries_tokens_dict.values()]
    embeddings = model.encode(query_texts, convert_to_numpy=True, normalize_embeddings=True)
    return dict(zip(query_ids, embeddings))

# ======== استرجاع ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    results = {}
    for qid, q_embed in query_embeddings_dict.items():
        sims = cosine_similarity([q_embed], doc_embeddings).flatten()
        ranked_idx = np.argsort(-sims)[:top_k]
        results[qid] = [(doc_ids[i], float(sims[i])) for i in ranked_idx]
    return results

def retrieve_with_faiss(query_embeddings_dict, index_path, doc_ids, top_k=10):
    index = faiss.read_index(index_path)
    query_ids = list(query_embeddings_dict.keys())
    query_embeddings = np.array([query_embeddings_dict[qid] for qid in query_ids]).astype("float32")
    scores, indices = index.search(query_embeddings, top_k)

    results = {}
    for i, qid in enumerate(query_ids):
        results[qid] = [(doc_ids[idx], float(scores[i][j])) for j, idx in enumerate(indices[i])]
    return results

def load_doc_ids_from_meta(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# ======== تحميل QRELS ========
def get_qrels(path=QRELS_PATH):
    df = pd.read_csv(path, sep=r'\s+', header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    for _, row in df.iterrows():
        qid, docid, rel = str(row["query_id"]), str(row["doc_id"]), row["relevance"]
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

# ======== التقييم ========
def get_retrieved_docs_formatted(retrieved):
    return {qid: [doc_id for doc_id, _ in docs] for qid, docs in retrieved.items()}

def calculate_map(qrels, retrieved):
    ap_list = []
    for qid, rel_docs in qrels.items():
        num_relevant = sum(1 for r in rel_docs.values() if r > 0)
        if num_relevant == 0:
            continue
        ret_docs = retrieved.get(qid, [])
        hits, score = 0, 0
        for i, d in enumerate(ret_docs, 1):
            if d in rel_docs and rel_docs[d] > 0:
                hits += 1
                score += hits / i
        ap_list.append(score / num_relevant)
    return np.mean(ap_list) * 100 if ap_list else 0

def calculate_mrr(qrels, retrieved):
    scores = []
    for qid, rel_docs in qrels.items():
        for i, d in enumerate(retrieved.get(qid, []), 1):
            if d in rel_docs and rel_docs[d] > 0:
                scores.append(1 / i)
                break
        else:
            scores.append(0)
    return np.mean(scores) * 100

def calculate_mean_precision(qrels, retrieved):
    return np.mean([
        sum(1 for d in retrieved.get(qid, []) if d in rel_docs and rel_docs[d] > 0) / len(retrieved.get(qid, [])) if retrieved.get(qid) else 0
        for qid, rel_docs in qrels.items()
    ]) * 100

def calculate_mean_recall(qrels, real_relevant, retrieved):
    recall_scores = {}
    for qid in retrieved:
        ret_docs = set(retrieved[qid])
        rel_docs = real_relevant.get(qid, set())
        recall = len(ret_docs & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    precisions = {}
    for qid in retrieved:
        ret_docs = retrieved[qid][:k]
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(retrieved_docs, dataset_name="antique"):
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = {qid: [str(docid) for docid in docs] for qid, docs in get_retrieved_docs_formatted(retrieved_docs).items()}

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# ======== التشغيل الكامل ========
if __name__ == "__main__":
    embedding_data = load_or_generate_doc_embeddings()
    build_and_save_faiss_index(embedding_data["embeddings"], FAISS_INDEX_PATH, FAISS_META_PATH, embedding_data["doc_ids"])
    doc_embeddings = embedding_data["embeddings"]
    doc_ids = embedding_data["doc_ids"]

    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    if RETRIEVAL_MODE == "cosine":
        print("\n🚀 الاسترجاع باستخدام cosine_similarity...")
        start = time.time()
        retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    elif RETRIEVAL_MODE == "faiss":
        print("\n⚡ الاسترجاع باستخدام FAISS...")
        doc_ids = load_doc_ids_from_meta(FAISS_META_PATH)
        start = time.time()
        retrieved_docs = retrieve_with_faiss(query_embeddings, FAISS_INDEX_PATH, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    else:
        raise ValueError(f"❌ Retrieval mode '{RETRIEVAL_MODE}' not supported.")

    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام في {round(duration, 2)} ثانية")

    cal_evaluations(retrieved_docs, dataset_name=f"{DATASET_NAME}-{RETRIEVAL_MODE}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
⚠️ تجاهل السطر 1: يحتوي على 1 أعمدة بدلاً من 2
🔎 عدد الاستعلامات بعد التنظيف: 200
🧪 عينة استعلامات:
  query_id                                               text  \
0  3990512     how\tcan\twe\tget\tconcentration\tonsomething?   
1   714612  Why\tdoesn't\tthe\twater\tfall\toff\tearth\tif...   
2  2528767  How\tdo\tI\tdetermine\tthe\tcharge\tof\tthe\ti...   

                                    light_clean_text  
0           how can we get concentration onsomething  
1  why doesn t the water fall off earth if it s r...  
2  how do i determine the charge of the iron ion ...  
📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/antique_embedding_antique.joblib
✅ تم بناء وحفظ فهرس FAISS في: /content/drive/MyDrive/embedding_model_joblib_file/antique_embedding.index
📥 تحميل الاستعلامات من: /content/drive/MyDrive/datasets1/antique/queries.tsv
⚠️ تجاهل السطر 1: يحتوي على 1 أعمدة بدلاً من 2
🔎 عدد ا