In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


TFIDF_EVALUATION

In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from tqdm import tqdm
import nltk

# Download the NLTK resources used by the tokenizer, POS tagger and lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration (everything lives under the mounted Google Drive)
DATASET_NAME = "quora"
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
# Artifacts written by this cell: inverted index (JSON), cleaned query tokens
# (TSV), cleaned documents (TSV) and the fitted TF-IDF model (joblib).
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared text-processing state used by tokenize()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J/V/N/R map to adjective/verb/noun/adverb; any other
    tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# Informal shorthand and contractions mapped to their expanded form.
# Keys are lower-case; they are matched as whole words by expand_contractions().
contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def expand_contractions(text, contractions_dict):
    """Replace every whole-word occurrence of a key with its mapped expansion.

    Args:
        text: Input string (matching is case-sensitive).
        contractions_dict: Mapping of contraction -> expansion.

    Returns:
        The text with all matches replaced; unchanged if the mapping is empty.
    """
    # An empty mapping would compile to r'\b()\b', which matches the empty
    # string at every word boundary and then fails the dictionary lookup.
    if not contractions_dict:
        return text
    # Longest keys first: regex alternation picks the first alternative that
    # matches, and "'" is a word boundary, so "can" would otherwise win over
    # "can't".
    keys = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')
    return pattern.sub(lambda match: contractions_dict[match.group()], text)

def processing(text: str) -> str:
    """Normalize raw text before tokenization.

    Lower-cases, expands contractions, drops non-ASCII characters, removes
    URLs and e-mail addresses, turns punctuation into spaces, and removes
    standalone numbers of 1-2 or 5+ digits. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    expanded = expand_contractions(text.lower(), contractions_dict)
    cleaned = expanded.encode("ascii", errors="ignore").decode()
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize(text):
    """Tokenize, POS-tag and lemmatize text, skipping stop words and 1-char tokens.

    Returns the list of lemmas in their original order.
    """
    lemmas = []
    for word, tag in pos_tag(word_tokenize(text)):
        if word in stop_words or len(word) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run the full pipeline: processing() followed by tokenize(); returns tokens."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly normalize text while keeping the sentence readable.

    Lower-cases, strips URLs and e-mail addresses, drops non-ASCII characters,
    replaces punctuation with spaces and collapses whitespace. Non-string
    input yields "".
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_text = without_emails.encode("ascii", errors="ignore").decode()
    spaced = re.sub(f"[{re.escape(string.punctuation)}]", " ", ascii_text)
    return re.sub(r"\s+", " ", spaced).strip()




def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
    """Clean every document in ``df`` and write the result to a TSV file.

    Args:
        df: DataFrame with ``doc_id`` and ``text`` columns.
        dataset_name: Value stored in the ``dataset_name`` column of each row.
        path_out: Destination file; its parent directory is created if missing.

    The output has the columns doc_id, text, processed_text (space-joined
    tokens from clean_text), dataset_name and light_clean_text.
    """
    columns = ["doc_id", "text", "processed_text", "dataset_name", "light_clean_text"]
    cleaned_data = []
    # Iterate the two needed columns directly; iterrows() builds a Series per
    # row, which is far slower on a corpus of this size.
    rows = zip(df["doc_id"], df["text"])
    for doc_id, raw_text in tqdm(rows, total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
        cleaned_data.append({
            "doc_id": doc_id,
            "text": raw_text,
            "processed_text": " ".join(clean_text(raw_text)),
            "dataset_name": dataset_name,
            "light_clean_text": light_clean(raw_text),
        })

    # Create the target directory first so a fresh Drive does not lose the
    # whole cleaning run at the final write.
    out_dir = os.path.dirname(path_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit columns keep the header present even for an empty input frame.
    cleaned_df = pd.DataFrame(cleaned_data, columns=columns)
    cleaned_df.to_csv(path_out, sep="\t", index=False)


def load_dataset(path):
    """Read the documents TSV into a DataFrame with ``doc_id`` and ``text`` columns.

    The first line of the file is skipped and missing texts become "".
    """
    column_names = ["doc_id", "text"]
    docs = pd.read_csv(path, sep="\t", header=None, skiprows=1, names=column_names)
    docs["text"] = docs["text"].fillna("")
    return docs

# Load the raw documents, then clean them and persist the cleaned TSV.
df = load_dataset(DOCS_PATH)
save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH)



def build_inverted_index_from_tsv(tsv_path=CLEANED_TSV_PATH):
    """Build a token -> set(doc_id) index from the cleaned TSV and save it as JSON.

    Args:
        tsv_path: Cleaned-documents TSV with ``doc_id`` and ``processed_text``.

    Returns:
        defaultdict(set) mapping each token to the string ids of the documents
        containing it. The same mapping is written to INVERTED_PATH with
        sorted posting lists.
    """
    # Keep processed_text as plain strings: with default NA parsing an empty
    # cell becomes NaN (and str(NaN) would index a bogus "nan" token), and real
    # tokens such as "nan", "na" or "null" would be turned into NaN as well.
    df = pd.read_csv(
        tsv_path,
        sep="\t",
        dtype={"processed_text": str},
        keep_default_na=False,
    )
    index = defaultdict(set)
    rows = zip(df["doc_id"], df["processed_text"])
    for raw_id, processed_text in tqdm(rows, total=len(df), desc="🔧 بناء الفهرس"):
        doc_id = str(raw_id)
        for token in processed_text.split():
            index[token].add(doc_id)

    out_dir = os.path.dirname(INVERTED_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(INVERTED_PATH, "w", encoding="utf-8") as f:
        # Sets have no stable order; sort so the saved file is reproducible.
        json.dump({k: sorted(v) for k, v in index.items()}, f, ensure_ascii=False, indent=2)
    return index

# Build the inverted index from the cleaned documents and save it to INVERTED_PATH.
inverted_index = build_inverted_index_from_tsv()



def load_and_clean_queries(path):
    """Read the queries TSV, clean each query and save the tokens to CLEAN_QUERIES.

    Args:
        path: TSV file with the query id in the first field and the query text
            in the remaining field(s).

    Returns:
        DataFrame with ``query_id``, ``text`` and ``clean_text`` (token list).
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f):
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            query_id = parts[0]
            # Skip the column-header row so it is not scored as a real query.
            if line_no == 0 and query_id.lstrip("\ufeff").strip().lower() == "query_id":
                continue
            data.append((query_id, " ".join(parts[1:])))
    df = pd.DataFrame(data, columns=["query_id", "text"])
    df["clean_text"] = df["text"].apply(clean_text)

    out_dir = os.path.dirname(CLEAN_QUERIES)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for query_id, tokens in zip(df["query_id"], df["clean_text"]):
            f.write(f"{query_id}\t{' '.join(tokens)}\n")
    return df

# Clean the queries and write their tokens to CLEAN_QUERIES.
queries_df = load_and_clean_queries(QUERIES_PATH)




def load_queries_tokens_from_tsv(path):
    """Load ``query_id<TAB>tokens`` lines into a dict of query id -> token list.

    Blank lines and lines without a second field are ignored; only the second
    field is split into tokens.
    """
    tokens_by_query = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                tokens_by_query[str(fields[0])] = fields[1].split()
    return tokens_by_query

# Reload the cleaned query tokens from disk as {query_id: [token, ...]}.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)




def retrieve_docs(query_tokens, index):
    """Return the ids of all documents that contain at least one query token.

    ``index`` maps token -> iterable of doc ids; unknown tokens contribute
    nothing. The result is a list with no duplicates and no defined order.
    """
    candidates = set()
    for term in query_tokens:
        postings = set(index.get(term, []))
        candidates.update(postings)
    return list(candidates)



def load_inverted_index(path):
    """Read the JSON inverted index stored at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

# Reload the index from JSON so posting lists are plain lists keyed by token.
inverted_index = load_inverted_index(INVERTED_PATH)


tqdm.pandas(desc=">> تجهيز النصوص")

# Reuse a previously fitted TF-IDF model when one exists; otherwise fit it on
# the raw document text (the vectorizer applies processing/tokenize itself)
# and cache it. The values are used directly in both branches, so the large
# model file is not read from disk a second time.
if os.path.exists(tfidf_model_path):
    tfidf_data = joblib.load(tfidf_model_path)
    vectorizer, X, tfidf_doc_ids = tfidf_data["vectorizer"], tfidf_data["vectors"], tfidf_data["doc_ids"]
else:
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    X = vectorizer.fit_transform(df["text"])
    tfidf_doc_ids = df["doc_id"].astype(str).tolist()
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump({"vectorizer": vectorizer, "vectors": X, "doc_ids": tfidf_doc_ids}, tfidf_model_path)



def represent_query(tokens):
    """Join the tokens with spaces and transform them with the fitted TF-IDF vectorizer."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])


def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10, id_to_idx=None):
    """Rank candidate documents by cosine similarity to the query vector.

    Args:
        query_vec: Query vector (one row) in the same space as ``doc_vectors``.
        retrieved_ids: Candidate document ids.
        doc_vectors: Matrix of document vectors, row-aligned with ``doc_ids``.
        doc_ids: Document ids in row order of ``doc_vectors``.
        top_k: Maximum number of results returned.
        id_to_idx: Optional precomputed ``{doc_id: row}`` map. Pass it when
            ranking many queries so the map is not rebuilt for each call.

    Returns:
        List of ``(doc_id, similarity)`` pairs, best first; empty when none of
        the candidates is known.
    """
    if id_to_idx is None:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # Keep the ids that survive the filter: similarities are aligned with this
    # list, not with retrieved_ids, so indexing retrieved_ids would attach
    # scores to the wrong documents whenever a candidate is unknown.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    rows = [id_to_idx[doc_id] for doc_id in known_ids]
    sims = cosine_similarity(query_vec, doc_vectors[rows]).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]


# For every query: collect candidates from the inverted index, vectorize the
# query, and keep the 10 candidates with the highest cosine similarity.
retrieved_docs_dict = {}
for i, (qid, tokens) in enumerate(tqdm(queries_tokens.items(), desc="🔍 استرجاع وترتيب المستندات"), start=1):
    retrieved = retrieve_docs(tokens, inverted_index)
    query_vec = represent_query(tokens)
    ranked = rank_docs(query_vec, retrieved, X, tfidf_doc_ids, top_k=10)
    retrieved_docs_dict[qid] = ranked



def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a TSV file.

    Args:
        path: TSV with a header row and query id, doc id, relevance columns.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict[qid][docid]`` is the
        integer relevance and ``relevant_set[qid]`` holds the doc ids with
        relevance > 0. All ids are strings; unparsable relevance counts as 0.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate column-wise: iterrows() packs each row into one Series and
    # upcasts mixed int/float columns, which can stringify ids as "7.0".
    for query_id, doc_id, rel in zip(df["query_id"], df["doc_id"], df["relevance"]):
        qid, docid = str(query_id), str(doc_id)
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set


def get_retrieved_docs_formatted(retrieved):
    """Drop the scores: turn {qid: [(doc_id, score), ...]} into {qid: [doc_id, ...]}."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted



def calculate_map(qrels, retrieved):
    """Mean Average Precision, in percent, over queries with at least one relevant doc.

    ``qrels`` maps qid -> {doc_id: relevance}; ``retrieved`` maps qid -> ranked
    doc ids. Returns 0 when no query has a relevant document.
    """
    average_precisions = []
    for qid, judgments in qrels.items():
        relevant_total = len([grade for grade in judgments.values() if grade > 0])
        if not relevant_total:
            continue
        found = 0
        precision_sum = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc in judgments and judgments[doc] > 0:
                found += 1
                precision_sum += found / rank
        average_precisions.append(precision_sum / relevant_total)
    if not average_precisions:
        return 0
    return np.mean(average_precisions) * 100


def calculate_mrr(qrels, retrieved):
    """Mean Reciprocal Rank, in percent, over every query in ``qrels``.

    A query contributes 1/rank of its first relevant retrieved document, or 0
    when none is retrieved. Returns 0 for empty ``qrels`` (instead of NaN with
    a NumPy warning), matching calculate_map.
    """
    scores = []
    for qid, rel_docs in qrels.items():
        reciprocal_rank = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if rel_docs.get(doc, 0) > 0:
                reciprocal_rank = 1 / rank
                break
        scores.append(reciprocal_rank)
    return np.mean(scores) * 100 if scores else 0


def calculate_mean_precision(qrels, retrieved):
    """Mean precision, in percent, over every query in ``qrels``.

    Precision of a query is relevant-retrieved / retrieved; queries with no
    retrieved documents count as 0. Returns 0 for empty ``qrels`` (instead of
    NaN with a NumPy warning), matching calculate_map.
    """
    precisions = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid, [])
        if not ret_docs:
            precisions.append(0)
            continue
        hits = sum(1 for doc in ret_docs if rel_docs.get(doc, 0) > 0)
        precisions.append(hits / len(ret_docs))
    return np.mean(precisions) * 100 if precisions else 0


def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Mean recall, in percent, over every query in ``retrieved``.

    Args:
        qrels: Unused; kept for interface compatibility.
        real_relevant: Mapping qid -> set of relevant doc ids.
        retrieved: Mapping qid -> retrieved doc ids.

    Returns:
        ``(mean_recall, per_query_recall)``. Queries without relevant documents
        score 0. Returns ``(0, {})`` when nothing was retrieved (instead of NaN
        with a NumPy warning).
    """
    recall_scores = {}
    for qid, ret_docs in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recall = len(set(ret_docs) & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    if not recall_scores:
        return 0, recall_scores
    return np.mean(list(recall_scores.values())), recall_scores



def calculate_precision_at_k(qrels, retrieved, k=10):
    """Mean Precision@k, in percent, over every query in ``retrieved``.

    The hit count among the first ``k`` results is divided by ``k`` even when
    fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision_at_k, per_query_precision)``; ``(0, {})`` when
        nothing was retrieved (instead of NaN with a NumPy warning).
    """
    precisions = {}
    for qid, ret_docs in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for doc in ret_docs[:k] if rel_docs.get(doc, 0) > 0)
        precisions[qid] = (hits / k) * 100
    if not precisions:
        return 0, precisions
    return np.mean(list(precisions.values())), precisions


def cal_evaluations(dataset_name="quora"):
    """Print MAP, MRR, mean precision, mean recall and P@10 for the TF-IDF run.

    Reads the judgments through get_qrels() and the rankings from the
    module-level ``retrieved_docs_dict``; ``dataset_name`` is only used in the
    printed header.
    """
    judgments, relevant_by_query = get_qrels()
    ranked_ids = {}
    for qid, docs in get_retrieved_docs_formatted(retrieved_docs_dict).items():
        ranked_ids[qid] = [str(docid) for docid in docs]

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    simple_metrics = (
        ("MAP:", calculate_map),
        ("MRR:", calculate_mrr),
        ("Mean Precision:", calculate_mean_precision),
    )
    for label, metric in simple_metrics:
        print(label, round(metric(judgments, ranked_ids), 2), "%")
    recall_mean, _recall_by_query = calculate_mean_recall(judgments, relevant_by_query, ranked_ids)
    print("Mean Recall:", round(recall_mean, 2), "%")
    p_at_k_mean, _p_at_k_by_query = calculate_precision_at_k(judgments, ranked_ids)
    print("Mean Precision@10:", round(p_at_k_mean, 2), "%")

# Print the evaluation metrics for the TF-IDF retrieval results.
cal_evaluations("quora")











Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
🧼 تنظيف وتخزين المستندات: 100%|██████████| 522770/522770 [09:26<00:00, 922.42it/s]
🔧 بناء الفهرس: 100%|██████████| 522770/522770 [00:32<00:00, 16055.12it/s]
🔍 استرجاع وترتيب المستندات: 100%|██████████| 5000/5000 [17:29<00:00,  4.76it/s]



== Evaluation for dataset: quora ==
MAP: 66.82 %
MRR: 70.9 %
Mean Precision: 11.05 %
Mean Recall: 81.78 %
Mean Precision@10: 10.94 %


EMBEDDING_EVALUATION

In [None]:
import os, re, string, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# ======== Download NLTK resources ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== Path configuration ========
DATASET_NAME = "quora"
BASE_PATH = "/content/drive/MyDrive/dataset_quora_dev/"
# Cleaned-documents TSV; read by load_or_generate_doc_embeddings()
DOCS_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/quora_cleaned_docs_beir"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
# Relative path: the lightly cleaned queries go to the current working directory
CLEAN_QUERIES_PATH = "light_cleaned_queries.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_beir.joblib")
# Number of documents retrieved per query
TOP_K = 50

bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== Text helper functions ========
def light_clean(text):
    """Lightly normalize text while keeping the sentence readable.

    Lower-cases, strips URLs and e-mail addresses, drops non-ASCII characters,
    replaces punctuation with spaces and collapses whitespace. Non-string
    input yields "".
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_text = without_emails.encode("ascii", errors="ignore").decode()
    spaced = re.sub(f"[{re.escape(string.punctuation)}]", " ", ascii_text)
    return re.sub(r"\s+", " ", spaced).strip()

# ======== Load the documents and embed them ========
def load_or_generate_doc_embeddings():
    """Return the cached document embeddings, computing and saving them if absent.

    Returns:
        dict with ``embeddings`` (normalized vectors from ``bert_model``),
        ``doc_ids`` (string ids, row-aligned with the embeddings) and
        ``raw_docs`` (doc_id -> original text).
    """
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    if "light_clean_text" not in df_docs.columns:
        print("⚠️ عمود 'light_clean_text' غير موجود — سيتم توليده.")
        # fillna first so missing texts are cleaned as "" rather than "nan".
        df_docs["light_clean_text"] = df_docs["text"].fillna("").astype(str).apply(light_clean)

    # read_csv turns empty cells into NaN floats; the encoder needs strings.
    df_docs["light_clean_text"] = df_docs["light_clean_text"].fillna("").astype(str)

    print("🧪 عينة من المستندات المنظفة:")
    print(df_docs[["doc_id", "light_clean_text"]].head())

    doc_embeddings = bert_model.encode(
        df_docs["light_clean_text"].tolist(),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    doc_ids = df_docs["doc_id"].astype(str).tolist()
    raw_docs = dict(zip(doc_ids, df_docs["text"]))

    # Build the payload once so the saved and returned objects cannot drift.
    embedding_data = {
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": raw_docs
    }
    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump(embedding_data, EMBEDDING_PATH)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return embedding_data



def load_and_clean_queries(path):
    """Read the queries TSV, add a ``light_clean_text`` column and save it.

    The file is read with its first row as header and all values as strings.
    Each ``query_id<TAB>light_clean_text`` pair is written to
    CLEAN_QUERIES_PATH; the DataFrame is returned.
    """
    print(f"📥 تحميل الاستعلامات من: {path}")
    queries = pd.read_csv(path, sep="\t", header=0, dtype=str)
    print(f"🔎 عدد الاستعلامات بعد التنظيف: {len(queries)}")

    queries["light_clean_text"] = queries["text"].apply(light_clean)

    print("🧪 عينة استعلامات:")
    print(queries.head(3))

    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as out_file:
        for query_id, cleaned in zip(queries["query_id"], queries["light_clean_text"]):
            out_file.write(f"{query_id}\t{cleaned}\n")

    return queries


# ======== Query representation ========
def load_queries_tokens_from_tsv(path):
    """Load ``query_id<TAB>text`` lines into a dict of query id -> token list.

    Blank lines and lines without a second field are ignored; only the second
    field is split on whitespace.
    """
    tokens_by_query = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                tokens_by_query[str(fields[0])] = fields[1].split()
    return tokens_by_query

def encode_queries(queries_tokens_dict, model):
    """Embed every query and return {query_id: embedding}.

    Tokens of each query are joined with spaces and encoded in one batch with
    ``model.encode`` (normalized, as NumPy output).
    """
    query_ids = []
    query_texts = []
    for qid, tokens in queries_tokens_dict.items():
        query_ids.append(qid)
        query_texts.append(" ".join(tokens))
    vectors = model.encode(query_texts, convert_to_numpy=True, normalize_embeddings=True)
    return {qid: vector for qid, vector in zip(query_ids, vectors)}

# ======== Retrieval ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    """Rank all documents for each query by cosine similarity of embeddings.

    Returns {query_id: [(doc_id, similarity), ...]} with at most ``top_k``
    entries per query, best first.
    """
    ranked_by_query = {}
    for query_id in query_embeddings_dict:
        query_vector = query_embeddings_dict[query_id]
        scores = cosine_similarity([query_vector], doc_embeddings).flatten()
        best_rows = np.argsort(-scores)[:top_k]
        hits = []
        for row in best_rows:
            hits.append((doc_ids[row], float(scores[row])))
        ranked_by_query[query_id] = hits
    return ranked_by_query

# ======== Load QRELS ========
def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a TSV file.

    Args:
        path: TSV with a header row and query id, doc id, relevance columns.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict[qid][docid]`` is the
        integer relevance and ``relevant_set[qid]`` holds the doc ids with
        relevance > 0. All ids are strings; unparsable relevance counts as 0.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate column-wise: iterrows() packs each row into one Series and
    # upcasts mixed int/float columns, which can stringify ids as "7.0".
    for query_id, doc_id, rel in zip(df["query_id"], df["doc_id"], df["relevance"]):
        qid, docid = str(query_id), str(doc_id)
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

# ======== Evaluation ========
def get_retrieved_docs_formatted(retrieved):
    """Drop the scores: turn {qid: [(doc_id, score), ...]} into {qid: [doc_id, ...]}."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted

def calculate_map(qrels, retrieved):
    """Mean Average Precision, in percent, over queries with at least one relevant doc.

    ``qrels`` maps qid -> {doc_id: relevance}; ``retrieved`` maps qid -> ranked
    doc ids. Returns 0 when no query has a relevant document.
    """
    average_precisions = []
    for qid, judgments in qrels.items():
        relevant_total = len([grade for grade in judgments.values() if grade > 0])
        if not relevant_total:
            continue
        found = 0
        precision_sum = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc in judgments and judgments[doc] > 0:
                found += 1
                precision_sum += found / rank
        average_precisions.append(precision_sum / relevant_total)
    if not average_precisions:
        return 0
    return np.mean(average_precisions) * 100

def calculate_mrr(qrels, retrieved):
    """Mean Reciprocal Rank, in percent, over every query in ``qrels``.

    A query contributes 1/rank of its first relevant retrieved document, or 0
    when none is retrieved. Returns 0 for empty ``qrels`` (instead of NaN with
    a NumPy warning), matching calculate_map.
    """
    scores = []
    for qid, rel_docs in qrels.items():
        reciprocal_rank = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if rel_docs.get(doc, 0) > 0:
                reciprocal_rank = 1 / rank
                break
        scores.append(reciprocal_rank)
    return np.mean(scores) * 100 if scores else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision, in percent, over every query in ``qrels``.

    Precision of a query is relevant-retrieved / retrieved; queries with no
    retrieved documents count as 0. Returns 0 for empty ``qrels`` (instead of
    NaN with a NumPy warning), matching calculate_map.
    """
    precisions = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid, [])
        if not ret_docs:
            precisions.append(0)
            continue
        hits = sum(1 for doc in ret_docs if rel_docs.get(doc, 0) > 0)
        precisions.append(hits / len(ret_docs))
    return np.mean(precisions) * 100 if precisions else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Mean recall, in percent, over every query in ``retrieved``.

    Args:
        qrels: Unused; kept for interface compatibility.
        real_relevant: Mapping qid -> set of relevant doc ids.
        retrieved: Mapping qid -> retrieved doc ids.

    Returns:
        ``(mean_recall, per_query_recall)``. Queries without relevant documents
        score 0. Returns ``(0, {})`` when nothing was retrieved (instead of NaN
        with a NumPy warning).
    """
    recall_scores = {}
    for qid, ret_docs in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recall = len(set(ret_docs) & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    if not recall_scores:
        return 0, recall_scores
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    """Mean Precision@k, in percent, over every query in ``retrieved``.

    The hit count among the first ``k`` results is divided by ``k`` even when
    fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision_at_k, per_query_precision)``; ``(0, {})`` when
        nothing was retrieved (instead of NaN with a NumPy warning).
    """
    precisions = {}
    for qid, ret_docs in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for doc in ret_docs[:k] if rel_docs.get(doc, 0) > 0)
        precisions[qid] = (hits / k) * 100
    if not precisions:
        return 0, precisions
    return np.mean(list(precisions.values())), precisions

# ======== Final evaluation ========
def cal_evaluations(retrieved_docs, dataset_name="quora", doc_ids=None):
    """Print coverage diagnostics and retrieval metrics for ``retrieved_docs``.

    Args:
        retrieved_docs: Mapping qid -> [(doc_id, score), ...], best first.
        dataset_name: Label used in the printed header.
        doc_ids: Ids of the embedded documents, used to report qrels documents
            missing from the embeddings. When omitted, the module-level
            ``doc_ids`` is used if it exists; otherwise that report is skipped
            instead of failing with a NameError.
    """
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = {qid: [str(docid) for docid in docs] for qid, docs in get_retrieved_docs_formatted(retrieved_docs).items()}
    print(f"📌 استعلامات للتقييم: {len(retrieved_docs_str)}")
    print("🔎 عدد استعلامات qrels:", len(qrel_dict))
    print("🔎 عدد استعلامات الاسترجاع:", len(retrieved_docs_str))
    print("🔗 عدد الاستعلامات المشتركة بين qrels والاسترجاع:", len(set(qrel_dict.keys()) & set(retrieved_docs_str.keys())))

    # Report the documents referenced by qrels that have no embedding.
    if doc_ids is None:
        # Backward compatible with callers that rely on the global set by the
        # main block.
        doc_ids = globals().get("doc_ids")
    if doc_ids is not None:
        all_qrels_doc_ids = {docid for rels in qrel_dict.values() for docid in rels}
        missing_doc_ids = all_qrels_doc_ids - set(doc_ids)
        print(f"❗ عدد المستندات المشار لها في qrels لكنها غير موجودة في التمثيل: {len(missing_doc_ids)}")

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    print("MAP:", round(calculate_map(qrel_dict, retrieved_docs_str), 2), "%")
    print("MRR:", round(calculate_mrr(qrel_dict, retrieved_docs_str), 2), "%")
    print("Mean Precision:", round(calculate_mean_precision(qrel_dict, retrieved_docs_str), 2), "%")
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# ======== Full run ========
if __name__ == "__main__":
    # Document side: load cached embeddings or compute them.
    embedding_data = load_or_generate_doc_embeddings()
    doc_embeddings = embedding_data["embeddings"]
    # cal_evaluations() reads this module-level name for its missing-document report.
    doc_ids = embedding_data["doc_ids"]

    # Query side: clean, write to CLEAN_QUERIES_PATH, reload as tokens, embed.
    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام")

    cal_evaluations(retrieved_docs, dataset_name="quora")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/quora_embedding_beir.joblib
📥 تحميل الاستعلامات من: /content/drive/MyDrive/dataset_quora_dev/queries.tsv
🔎 عدد الاستعلامات بعد التنظيف: 5000
🧪 عينة استعلامات:
  query_id                                               text  \
0      318                How does Quora look to a moderator?   
1      378  How do I refuse to chose between different thi...   
2      379  Did Ben Affleck shine more than Christian Bale...   

                                    light_clean_text  
0                 how does quora look to a moderator  
1  how do i refuse to chose between different thi...  
2  did ben affleck shine more than christian bale...  
🧠 تم تمثيل 5000 استعلام
📥 تم استرجاع نتائج لـ 5000 استعلام

📊 بدء التقييم...
📌 استعلامات للتقييم: 5000
🔎 عدد استعلامات qrels: 5000
🔎 عدد استعلامات الاسترجاع: 5000
🔗 عدد الاستعلامات المشتركة بين qrels والاسترجاع: 5000
❗ عدد المستندات المشار لها في qrels لكنها غير موجودة في التمثيل: 0

==

#BM25

In [None]:
# تثبيت الحزم (تحتاج مرة واحدة فقط)
!pip install nltk==3.8.1 contractions scikit-learn tqdm rank_bm25

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk, contractions
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from tqdm import tqdm

# Download NLTK resources (once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration, same layout as the TF-IDF cell
DATASET_NAME = "quora"
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/quora_cleaned_docs_beir"
# Where the fitted BM25 model, its doc ids and the tokenized corpus are stored
BM25_MODEL_PATH = f"/content/drive/MyDrive/utils/bm25_model/{DATASET_NAME}_bm25.joblib"

# Shared text-processing state used by tokenize()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J/V/N/R map to adjective/verb/noun/adverb; any other
    tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# Informal shorthand and contractions mapped to their expanded form.
# Keys are lower-case; they are matched as whole words by expand_contractions().
contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def expand_contractions(text, contractions_dict):
    """Replace every whole-word occurrence of a key with its mapped expansion.

    Args:
        text: Input string (matching is case-sensitive).
        contractions_dict: Mapping of contraction -> expansion.

    Returns:
        The text with all matches replaced; unchanged if the mapping is empty.
    """
    # An empty mapping would compile to r'\b()\b', which matches the empty
    # string at every word boundary and then fails the dictionary lookup.
    if not contractions_dict:
        return text
    # Longest keys first: regex alternation picks the first alternative that
    # matches, and "'" is a word boundary, so "can" would otherwise win over
    # "can't".
    keys = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')
    return pattern.sub(lambda match: contractions_dict[match.group()], text)

def processing(text: str) -> str:
    """Normalize raw text before tokenization.

    Lower-cases, expands contractions, drops non-ASCII characters, removes
    URLs and e-mail addresses, turns punctuation into spaces, and removes
    standalone numbers of 1-2 or 5+ digits. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    expanded = expand_contractions(text.lower(), contractions_dict)
    cleaned = expanded.encode("ascii", errors="ignore").decode()
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize(text):
    """Tokenize, POS-tag and lemmatize text, skipping stop words and 1-char tokens.

    Returns the list of lemmas in their original order.
    """
    lemmas = []
    for word, tag in pos_tag(word_tokenize(text)):
        if word in stop_words or len(word) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run the full pipeline: processing() followed by tokenize(); returns tokens."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly normalize text while keeping the sentence readable.

    Lower-cases, strips URLs and e-mail addresses, drops non-ASCII characters,
    replaces punctuation with spaces and collapses whitespace. Non-string
    input yields "".
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_text = without_emails.encode("ascii", errors="ignore").decode()
    spaced = re.sub(f"[{re.escape(string.punctuation)}]", " ", ascii_text)
    return re.sub(r"\s+", " ", spaced).strip()

def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
    """Clean every document in ``df`` and write the result to a TSV file.

    Args:
        df: DataFrame with ``doc_id`` and ``text`` columns.
        dataset_name: Value stored in the ``dataset_name`` column of each row.
        path_out: Destination file; its parent directory is created if missing.

    The output has the columns doc_id, text, processed_text (space-joined
    tokens from clean_text), dataset_name and light_clean_text.
    """
    columns = ["doc_id", "text", "processed_text", "dataset_name", "light_clean_text"]
    cleaned_data = []
    # Iterate the two needed columns directly; iterrows() builds a Series per
    # row, which is far slower on a corpus of this size.
    rows = zip(df["doc_id"], df["text"])
    for doc_id, raw_text in tqdm(rows, total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
        cleaned_data.append({
            "doc_id": doc_id,
            "text": raw_text,
            "processed_text": " ".join(clean_text(raw_text)),
            "dataset_name": dataset_name,
            "light_clean_text": light_clean(raw_text),
        })

    # Create the target directory first so a fresh Drive does not lose the
    # whole cleaning run at the final write.
    out_dir = os.path.dirname(path_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit columns keep the header present even for an empty input frame.
    cleaned_df = pd.DataFrame(cleaned_data, columns=columns)
    cleaned_df.to_csv(path_out, sep="\t", index=False)

def load_dataset(path):
    """Read the documents TSV into a DataFrame with ``doc_id`` and ``text`` columns.

    The first line of the file is skipped and missing texts become "".
    """
    column_names = ["doc_id", "text"]
    docs = pd.read_csv(path, sep="\t", header=None, skiprows=1, names=column_names)
    docs["text"] = docs["text"].fillna("")
    return docs

# === Build the BM25 model ===
def build_bm25_model(cleaned_docs_tsv=CLEANED_TSV_PATH, save_path=BM25_MODEL_PATH):
    """Fit BM25Okapi on the cleaned documents and save it with joblib.

    Args:
        cleaned_docs_tsv: TSV with ``doc_id`` and ``processed_text`` columns.
        save_path: Destination of the saved dict with keys ``bm25``,
            ``doc_ids`` and ``corpus``.

    Raises:
        ValueError: If no document has any processed text.
    """
    # Read processed_text as plain strings: with default NA parsing, documents
    # whose whole text is a token like "nan", "na" or "null" are turned into
    # NaN and then dropped below.
    df = pd.read_csv(
        cleaned_docs_tsv,
        sep="\t",
        dtype={"processed_text": str},
        keep_default_na=False,
    )
    # Documents with no tokens cannot be scored; leave them out of the corpus.
    df = df[df["processed_text"].str.strip() != ""]

    corpus = [doc.split() for doc in df["processed_text"]]
    doc_ids = df["doc_id"].astype(str).tolist()
    if not corpus:
        raise ValueError(f"No non-empty 'processed_text' rows found in {cleaned_docs_tsv}")

    bm25 = BM25Okapi(corpus)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    joblib.dump({"bm25": bm25, "doc_ids": doc_ids, "corpus": corpus}, save_path)
    print("تم بناء وحفظ نموذج BM25")

def load_bm25_model(path=BM25_MODEL_PATH):
    """Load the saved BM25 bundle from ``path``.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    if os.path.exists(path):
        return joblib.load(path)
    raise FileNotFoundError("نموذج BM25 غير موجود، قم ببنائه أولاً.")

def clean_and_tokenize_query(raw_query):
    """Clean a raw query with the same pipeline as the documents; returns its tokens."""
    query_tokens = clean_text(raw_query)
    return query_tokens

def retrieve_bm25_docs(query_tokens, bm25_data, top_k=10):
    """Return the ``top_k`` highest-scoring documents for a tokenized query.

    Args:
        query_tokens: List of query tokens.
        bm25_data: Dict with ``bm25`` (object exposing ``get_scores``) and
            ``doc_ids`` (ids aligned with the scores).
        top_k: Maximum number of results.

    Returns:
        List of ``(doc_id, score)`` pairs, best first; ties keep corpus order.
    """
    bm25 = bm25_data["bm25"]
    doc_ids = bm25_data["doc_ids"]
    scores = np.asarray(bm25.get_scores(query_tokens))
    # A stable argsort on the negated scores gives the same order as a stable
    # descending Python sort, without sorting every document index in Python
    # for each query.
    ranked_indices = np.argsort(-scores, kind="stable")[:top_k]
    return [(doc_ids[i], scores[i]) for i in ranked_indices]

# === Load the raw documents ===
df = load_dataset(DOCS_PATH)
# Cleaning step is disabled here; build_bm25_model() below reads the cleaned
# TSV that must already exist at CLEANED_TSV_PATH.
# save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH)

# === Build the BM25 model ===
build_bm25_model(CLEANED_TSV_PATH, BM25_MODEL_PATH)

# === Load the saved model ===
bm25_data = load_bm25_model(BM25_MODEL_PATH)

# === Load and clean the queries ===
def load_and_clean_queries(path):
    """Read the queries TSV, clean each query and save the tokens to CLEAN_QUERIES.

    Args:
        path: TSV file with the query id in the first field and the query text
            in the remaining field(s).

    Returns:
        DataFrame with ``query_id``, ``text`` and ``clean_text`` (token list).
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f):
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            query_id = parts[0]
            # Skip the column-header row so it is not scored as a real query.
            if line_no == 0 and query_id.lstrip("\ufeff").strip().lower() == "query_id":
                continue
            data.append((query_id, " ".join(parts[1:])))
    df = pd.DataFrame(data, columns=["query_id", "text"])
    df["clean_text"] = df["text"].apply(clean_text)

    out_dir = os.path.dirname(CLEAN_QUERIES)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
        for query_id, tokens in zip(df["query_id"], df["clean_text"]):
            f.write(f"{query_id}\t{' '.join(tokens)}\n")
    return df

# Clean the queries and write their tokens to CLEAN_QUERIES.
queries_df = load_and_clean_queries(QUERIES_PATH)

def load_queries_tokens_from_tsv(path):
    """Load ``query_id<TAB>tokens`` lines into a dict of query id -> token list.

    Blank lines and lines without a second field are ignored; only the second
    field is split into tokens.
    """
    tokens_by_query = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                tokens_by_query[str(fields[0])] = fields[1].split()
    return tokens_by_query

# Reload the cleaned query tokens from disk as {query_id: [token, ...]}.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# === Retrieve documents with BM25 ===
# Keep the 10 best-scoring documents for every query.
retrieved_docs_dict = {}
for qid, tokens in tqdm(queries_tokens.items(), desc="🔍 استرجاع المستندات باستخدام BM25"):
    ranked = retrieve_bm25_docs(tokens, bm25_data, top_k=10)
    retrieved_docs_dict[qid] = ranked

# === Evaluation functions (same as in your original code) ===

def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a tab-separated qrels file.

    The first line of the file is treated as a header and replaced by the
    column names ``query_id``, ``doc_id`` and ``relevance``. Relevance values
    that cannot be parsed as numbers are treated as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: ``qrel_dict[qid][doc_id]`` is the
        integer relevance, and ``relevant_set[qid]`` is the set of doc ids whose
        relevance is greater than 0. Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    parsed_relevance = pd.to_numeric(judgments["relevance"], errors="coerce")
    judgments["relevance"] = parsed_relevance.fillna(0).astype(int)

    graded = defaultdict(dict)
    positives = defaultdict(set)
    for _, record in judgments.iterrows():
        query_key, doc_key = str(record["query_id"]), str(record["doc_id"])
        grade = record["relevance"]
        graded[query_key][doc_key] = grade
        if grade > 0:
            positives[query_key].add(doc_key)
    return graded, positives

def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping the ordered doc ids per query."""
    ids_by_query = {}
    for qid, scored_docs in retrieved.items():
        ordered_ids = []
        for doc_id, _score in scored_docs:
            ordered_ids.append(doc_id)
        ids_by_query[qid] = ordered_ids
    return ids_by_query

def calculate_map(qrels, retrieved):
    """Mean Average Precision, as a percentage.

    Only queries with at least one judged-relevant document (relevance > 0)
    are averaged. Each query's precision sum is divided by its total number of
    relevant documents, so relevant documents that were not retrieved lower
    the score. Returns 0 when no query qualifies.
    """
    average_precisions = []
    for qid, judged in qrels.items():
        total_relevant = sum(1 for grade in judged.values() if grade > 0)
        if not total_relevant:
            continue
        found, precision_sum = 0, 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if judged.get(doc, 0) > 0:
                found += 1
                precision_sum += found / rank
        average_precisions.append(precision_sum / total_relevant)
    if not average_precisions:
        return 0
    return np.mean(average_precisions) * 100

def calculate_mrr(qrels, retrieved):
    """Mean Reciprocal Rank over every query in ``qrels``, as a percentage.

    A query contributes 1/rank of its first retrieved document judged relevant
    (relevance > 0), or 0 when none of its retrieved documents is relevant.
    Returns 0 for empty ``qrels`` (as ``calculate_map`` does) instead of NaN
    plus a NumPy RuntimeWarning.
    """
    scores = []
    for qid, rel_docs in qrels.items():
        reciprocal = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if rel_docs.get(doc, 0) > 0:
                reciprocal = 1 / rank
                break
        scores.append(reciprocal)
    return np.mean(scores) * 100 if scores else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision over every query in ``qrels``, as a percentage.

    Per-query precision is (# retrieved docs judged relevant) / (# retrieved
    docs); a query with nothing retrieved scores 0. Returns 0 for empty
    ``qrels`` (as ``calculate_map`` does) instead of NaN.
    """
    per_query = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid)
        if not ret_docs:
            per_query.append(0)
            continue
        hits = sum(1 for d in ret_docs if rel_docs.get(d, 0) > 0)
        per_query.append(hits / len(ret_docs))
    return np.mean(per_query) * 100 if per_query else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Mean recall over every query in ``retrieved``, as a percentage.

    Args:
        qrels: Unused; kept so existing call sites stay valid.
        real_relevant: Mapping query id -> set of relevant doc ids.
        retrieved: Mapping query id -> list of retrieved doc ids.

    Per-query recall is |retrieved ∩ relevant| / |relevant|, or 0 when the query
    has no relevant documents. Returns 0 for empty ``retrieved`` instead of NaN.
    """
    recalls = []
    for qid, ret_docs in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recalls.append(len(set(ret_docs) & rel_docs) / len(rel_docs) if rel_docs else 0)
    return np.mean(recalls) * 100 if recalls else 0


def calculate_precision_at_k(qrels, retrieved, k=10):
    """Precision@k per query and its mean, both as percentages.

    Only the first ``k`` retrieved docs of each query are considered and the
    hit count is always divided by ``k``, even if fewer docs were retrieved.

    Returns:
        Tuple ``(mean_precision, per_query)`` where ``per_query`` maps query id
        to its precision@k. The mean is 0 for empty ``retrieved`` instead of NaN.
    """
    precisions = {}
    for qid, ranked in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ranked[:k] if rel_docs.get(d, 0) > 0)
        precisions[qid] = (hits / k) * 100
    mean_precision = np.mean(list(precisions.values())) if precisions else 0
    return mean_precision, precisions

# === Load the qrels ===
qrel_dict, real_relevant = get_qrels(QRELS_PATH)

# === Keep only the retrieved doc ids (drop the scores) ===
retrieved_only_docs = get_retrieved_docs_formatted(retrieved_docs_dict)

# === Evaluate retrieval quality ===
map_score = calculate_map(qrel_dict, retrieved_only_docs)
mrr_score = calculate_mrr(qrel_dict, retrieved_only_docs)
mean_precision = calculate_mean_precision(qrel_dict, retrieved_only_docs)
mean_recall = calculate_mean_recall(qrel_dict, real_relevant, retrieved_only_docs)
avg_p_at_k, p_at_k_dict = calculate_precision_at_k(qrel_dict, retrieved_only_docs, k=10)

# All metrics are already expressed as percentages.
print(f"MAP: {map_score:.2f}%")
print(f"MRR: {mrr_score:.2f}%")
print(f"Mean Precision: {mean_precision:.2f}%")
print(f"Mean Recall: {mean_recall:.2f}%")
print(f"Precision@10: {avg_p_at_k:.2f}%")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


تم بناء وحفظ نموذج BM25


🔍 استرجاع المستندات باستخدام BM25: 100%|██████████| 5000/5000 [1:08:36<00:00,  1.21it/s]


MAP: 70.65%
MRR: 74.75%
Mean Precision: 11.51%
Mean Recall: 85.50%
Precision@10: 11.51%


# BM25 WITH FACTORS (k1 and b parameter sweep)

In [None]:
# تثبيت الحزم (مرة واحدة فقط)
!pip install nltk==3.8.1 contractions scikit-learn tqdm rank_bm25

# استيراد المكتبات
import os, re, string, joblib
import numpy as np
import pandas as pd
from collections import defaultdict
import nltk, contractions
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from tqdm import tqdm

# Download the NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path definitions (adjust to where your files live)
DATASET_NAME = "quora"
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
BM25_MODEL_PATH = f"/content/drive/MyDrive/utils/bm25_model/{DATASET_NAME}_bm25.joblib"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"

# Shared lemmatizer and English stop-word set used by tokenize()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Helper functions for cleaning and lemmatization (same as in the earlier code)
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant for lemmatization.

    Tags starting with J, V, N or R map to adjective, verb, noun or adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Lower-case shorthand/contraction -> expansion, applied as whole-word
# replacements by expand_contractions() after the text has been lower-cased.
contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def expand_contractions(text, contractions_dict):
    """Replace whole-word occurrences of each dictionary key with its value.

    Args:
        text: Input string; matching is case-sensitive.
        contractions_dict: Mapping from contraction to its expansion.

    Returns:
        The text with every matched key replaced. An empty mapping returns the
        text unchanged.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the dictionary lookup with a KeyError.
        return text
    # Longest keys first so that a key which is a prefix of another
    # (e.g. "i" vs "i'm") cannot shadow the longer one.
    keys = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')
    return pattern.sub(lambda match: contractions_dict[match.group()], text)

def processing(text: str) -> str:
    """Normalize raw text before tokenization.

    Lower-cases the text, expands contractions, drops non-ASCII characters,
    then applies a fixed sequence of regex substitutions: remove URLs, remove
    e-mail-like tokens, turn punctuation into spaces, and remove standalone
    numbers of 1-2 digits or 5+ digits. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    result = expand_contractions(text.lower(), contractions_dict)
    result = result.encode("ascii", errors="ignore").decode()
    # Order matters: URLs and e-mails must be removed before punctuation is
    # stripped, otherwise their fragments would survive as words.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def tokenize(text):
    """Tokenize, POS-tag and lemmatize text, dropping stop words and 1-char tokens.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run the full cleaning pipeline (``processing`` then ``tokenize``) on raw text."""
    return tokenize(processing(text))





def light_clean(text):
    """Lightly normalize text without tokenizing or lemmatizing it.

    Lower-cases, removes URLs and e-mail-like tokens, drops non-ASCII
    characters, replaces punctuation with spaces and collapses whitespace.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    cleaned = re.sub(r"http\S+|www\S+|https\S+", "", text.lower())
    cleaned = re.sub(r"\S+@\S+", "", cleaned)
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    cleaned = re.sub(f"[{re.escape(string.punctuation)}]", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()


def save_cleaned_docs(df, dataset_name="antique", path_out=CLEANED_TSV_PATH):
    """
    Clean the documents and store them in a TSV file with the columns:
    doc_id, text, processed_text, dataset_name, light_clean_text

    Args:
        df: DataFrame with ``doc_id`` and ``text`` columns.
        dataset_name: Value written to the ``dataset_name`` column of every row.
        path_out: Destination path of the TSV file.
    """
    def _clean_row(row):
        # Build one output record: deep-cleaned tokens joined by spaces plus
        # the lightly cleaned text, alongside the untouched original.
        raw_text = row["text"]
        return {
            "doc_id": row["doc_id"],
            "text": raw_text,
            "processed_text": " ".join(clean_text(raw_text)),
            "dataset_name": dataset_name,
            "light_clean_text": light_clean(raw_text),
        }

    progress = tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات")
    cleaned_df = pd.DataFrame([_clean_row(row) for _, row in progress])
    cleaned_df.to_csv(path_out, sep="\t", index=False)
    print(f"✅ تم حفظ الملف في: {path_out}")


def load_dataset(path):
    """Load the raw documents TSV into a DataFrame with ``doc_id`` and ``text`` columns.

    The first line of the file is skipped and missing texts become "".
    """
    docs = pd.read_csv(path, sep="\t", names=["doc_id", "text"], header=None, skiprows=1)
    return docs.assign(text=docs["text"].fillna(""))

def load_queries(path):
    """Read a tab-separated queries file, clean each query and cache the tokens.

    Lines with fewer than two tab-separated fields are skipped; fields after
    the first are joined with spaces to form the query text. Cleaned tokens are
    written to ``CLEAN_QUERIES`` as ``query_id<TAB>space-joined tokens``.

    Returns:
        DataFrame with ``query_id``, ``text`` and ``clean_text`` columns.
    """
    with open(path, "r", encoding="utf-8") as src:
        split_lines = [raw.strip().split("\t") for raw in src]
    pairs = [(fields[0], " ".join(fields[1:])) for fields in split_lines if len(fields) >= 2]

    queries = pd.DataFrame(pairs, columns=["query_id", "text"])
    queries["clean_text"] = queries["text"].apply(clean_text)

    with open(CLEAN_QUERIES, "w", encoding="utf-8") as dst:
        for row in queries.to_dict("records"):
            dst.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")
    return queries

def load_queries_tokens_from_tsv(path):
    """Load cached query tokens from a ``query_id<TAB>tokens`` file.

    Blank lines and lines without a second tab-separated field are skipped.
    Only the second field is used; it is split on whitespace.

    Returns:
        Dict mapping query id (str) to its list of tokens.
    """
    tokens_by_query = {}
    with open(path, encoding="utf-8") as src:
        for stripped in map(str.strip, src):
            columns = stripped.split("\t")
            if stripped and len(columns) >= 2:
                tokens_by_query[str(columns[0])] = columns[1].split()
    return tokens_by_query

def build_bm25(corpus, k1, b):
    """Create a ``BM25Okapi`` index over a tokenized corpus with the given k1 and b."""
    bm25_index = BM25Okapi(corpus, k1=k1, b=b)
    return bm25_index

def retrieve_bm25_docs(query_tokens, bm25_data, top_k=10):
    """Score every indexed document against a query with BM25 and return the best ones.

    Args:
        query_tokens: List of already-cleaned query tokens.
        bm25_data: Mapping with a "bm25" entry (object exposing ``get_scores``)
            and a "doc_ids" entry (sequence aligned with the scored corpus).
        top_k: Maximum number of results to return.

    Returns:
        List of ``(doc_id, score)`` pairs ordered from highest to lowest score;
        documents with equal scores keep their corpus order.
    """
    bm25 = bm25_data["bm25"]
    doc_ids = bm25_data["doc_ids"]
    scores = bm25.get_scores(query_tokens)
    # This runs once per query for every (k1, b) combination, so avoid sorting
    # the whole corpus through a Python lambda: a stable argsort on the negated
    # scores gives the same order as sorted(..., reverse=True), ties included.
    ranked_indices = np.argsort(-np.asarray(scores), kind="stable")[:top_k]
    return [(doc_ids[i], scores[i]) for i in ranked_indices]

def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a tab-separated qrels file.

    The first line of the file is treated as a header and replaced by the
    column names ``query_id``, ``doc_id`` and ``relevance``. Relevance values
    that cannot be parsed as numbers are treated as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: ``qrel_dict[qid][doc_id]`` is the
        integer relevance, and ``relevant_set[qid]`` is the set of doc ids whose
        relevance is greater than 0. Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    parsed_relevance = pd.to_numeric(judgments["relevance"], errors="coerce")
    judgments["relevance"] = parsed_relevance.fillna(0).astype(int)

    graded = defaultdict(dict)
    positives = defaultdict(set)
    for _, record in judgments.iterrows():
        query_key, doc_key = str(record["query_id"]), str(record["doc_id"])
        grade = record["relevance"]
        graded[query_key][doc_key] = grade
        if grade > 0:
            positives[query_key].add(doc_key)
    return graded, positives

def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping the ordered doc ids per query."""
    ids_by_query = {}
    for qid, scored_docs in retrieved.items():
        ordered_ids = []
        for doc_id, _score in scored_docs:
            ordered_ids.append(doc_id)
        ids_by_query[qid] = ordered_ids
    return ids_by_query

def calculate_map(qrels, retrieved):
    """Mean Average Precision, as a percentage.

    Only queries with at least one judged-relevant document (relevance > 0)
    are averaged. Each query's precision sum is divided by its total number of
    relevant documents. Returns 0 when no query qualifies.
    """
    def _average_precision(judged, ranked, total_relevant):
        # Sum precision at each rank where a relevant doc appears.
        found, precision_sum = 0, 0
        for rank, doc in enumerate(ranked, start=1):
            if judged.get(doc, 0) > 0:
                found += 1
                precision_sum += found / rank
        return precision_sum / total_relevant

    average_precisions = []
    for qid, judged in qrels.items():
        total_relevant = sum(1 for grade in judged.values() if grade > 0)
        if total_relevant:
            average_precisions.append(
                _average_precision(judged, retrieved.get(qid, []), total_relevant)
            )
    if not average_precisions:
        return 0
    return np.mean(average_precisions) * 100

def calculate_mrr(qrels, retrieved):
    """Mean Reciprocal Rank over every query in ``qrels``, as a percentage.

    A query contributes 1/rank of its first retrieved document judged relevant
    (relevance > 0), or 0 when none of its retrieved documents is relevant.
    Returns 0 for empty ``qrels`` (as ``calculate_map`` does) instead of NaN
    plus a NumPy RuntimeWarning.
    """
    scores = []
    for qid, rel_docs in qrels.items():
        reciprocal = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if rel_docs.get(doc, 0) > 0:
                reciprocal = 1 / rank
                break
        scores.append(reciprocal)
    return np.mean(scores) * 100 if scores else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision over every query in ``qrels``, as a percentage.

    Per-query precision is (# retrieved docs judged relevant) / (# retrieved
    docs); a query with nothing retrieved scores 0. Returns 0 for empty
    ``qrels`` (as ``calculate_map`` does) instead of NaN.
    """
    per_query = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid)
        if not ret_docs:
            per_query.append(0)
            continue
        hits = sum(1 for d in ret_docs if rel_docs.get(d, 0) > 0)
        per_query.append(hits / len(ret_docs))
    return np.mean(per_query) * 100 if per_query else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Mean recall over every query in ``retrieved``, as a percentage.

    Args:
        qrels: Unused; kept so existing call sites stay valid.
        real_relevant: Mapping query id -> set of relevant doc ids.
        retrieved: Mapping query id -> list of retrieved doc ids.

    Per-query recall is |retrieved ∩ relevant| / |relevant|, or 0 when the query
    has no relevant documents. Returns 0 for empty ``retrieved`` instead of NaN.
    """
    recalls = []
    for qid, ret_docs in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recalls.append(len(set(ret_docs) & rel_docs) / len(rel_docs) if rel_docs else 0)
    return np.mean(recalls) * 100 if recalls else 0

def calculate_precision_at_k(qrels, retrieved, k=10):
    """Precision@k per query and its mean, both as percentages.

    Only the first ``k`` retrieved docs of each query are considered and the
    hit count is always divided by ``k``, even if fewer docs were retrieved.

    Returns:
        Tuple ``(mean_precision, per_query)`` where ``per_query`` maps query id
        to its precision@k. The mean is 0 for empty ``retrieved`` instead of NaN.
    """
    precisions = {}
    for qid, ranked in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ranked[:k] if rel_docs.get(d, 0) > 0)
        precisions[qid] = (hits / k) * 100
    mean_precision = np.mean(list(precisions.values())) if precisions else 0
    return mean_precision, precisions

# ==== Full run ====

# 1. Load the documents and clean them (only once; the cleaned TSV is reused)
df = load_dataset(DOCS_PATH)
if not os.path.exists(CLEANED_TSV_PATH):
    # Pass the arguments by keyword: the second positional parameter of
    # save_cleaned_docs is dataset_name, so passing the path positionally
    # wrote the file path into the dataset_name column of every row.
    save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH)
else:
    print("ملف المستندات المنظفة موجود بالفعل.")

# 2. Load the cleaned documents
df_cleaned = pd.read_csv(CLEANED_TSV_PATH, sep="\t")
# Non-string cells (e.g. NaN read back for documents that cleaned to nothing)
# become empty token lists.
corpus = [doc.split() if isinstance(doc, str) else [] for doc in df_cleaned["processed_text"]]
doc_ids = df_cleaned["doc_id"].astype(str).tolist()

# 3. Load and clean the queries
queries_df = load_queries(QUERIES_PATH)
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# 4. Load the qrels for evaluation
qrel_dict, real_relevant = get_qrels(QRELS_PATH)

# 5. Values of k1 and b to test
k1_values = [0.5, 1.0, 1.5, 2.0]
b_values = [0.2, 0.4, 0.6, 0.75, 0.8, 1.0]

print(f"{'k1':<5} {'b':<5} {'MAP':<8} {'MRR':<8} {'P@10':<8} {'MeanPrec':<10} {'MeanRecall'}")

# 6. Try every combination and compute the metrics
# A new BM25 index is built and all queries are re-run for each (k1, b) pair.
for k1 in k1_values:
    for b in b_values:
        bm25 = build_bm25(corpus, k1, b)
        bm25_data = {"bm25": bm25, "doc_ids": doc_ids, "corpus": corpus}

        retrieved_docs_dict = {
            qid: retrieve_bm25_docs(tokens, bm25_data, top_k=10)
            for qid, tokens in queries_tokens.items()
        }

        retrieved_only_docs = get_retrieved_docs_formatted(retrieved_docs_dict)

        map_score = calculate_map(qrel_dict, retrieved_only_docs)
        mrr_score = calculate_mrr(qrel_dict, retrieved_only_docs)
        mean_precision = calculate_mean_precision(qrel_dict, retrieved_only_docs)
        mean_recall = calculate_mean_recall(qrel_dict, real_relevant, retrieved_only_docs)
        avg_p_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_only_docs, k=10)

        print(f"{k1:<5} {b:<5} {map_score:<8.2f} {mrr_score:<8.2f} {avg_p_at_k:<8.2f} {mean_precision:<10.2f} {mean_recall:.2f}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ملف المستندات المنظفة موجود بالفعل.
k1    b     MAP      MRR      P@10     MeanPrec   MeanRecall
0.5   0.2   66.84    70.52    11.14    11.14      82.98
0.5   0.4   69.44    73.32    11.37    11.37      84.42
0.5   0.6   70.97    74.95    11.48    11.48      85.27
0.5   0.75  71.42    75.43    11.56    11.56      85.78
0.5   0.8   71.51    75.51    11.59    11.59      85.90


  score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /


KeyboardInterrupt: 

# HYBRID (TF-IDF + embedding)

In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder
!pip install -U sentence-transformers

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from datetime import datetime
from tqdm import tqdm
import nltk

# Download the NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path setup
DATASET_NAME = "quora"
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"
embedding_file_path = f"/content/drive/MyDrive/embedding_model_joblib_file/{DATASET_NAME}_embedding_beir.joblib"

# Shared lemmatizer and English stop-word set used by tokenize()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Lower-case shorthand/contraction -> expansion, applied as whole-word
# replacements by expand_contractions() after the text has been lower-cased.
contractions_dict = {
    "u": "you", "r": "are", "wanna": "want to",
    "can't": "cannot", "don't": "do not", "didn't": "did not",
    "it's": "it is", "i'm": "i am"
}

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant for lemmatization.

    Tags starting with J, V, N or R map to adjective, verb, noun or adverb
    respectively; every other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def expand_contractions(text, contractions_dict):
    """Replace whole-word occurrences of each dictionary key with its value.

    Args:
        text: Input string; matching is case-sensitive.
        contractions_dict: Mapping from contraction to its expansion.

    Returns:
        The text with every matched key replaced. An empty mapping returns the
        text unchanged.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the dictionary lookup with a KeyError.
        return text
    # Longest keys first so that a key which is a prefix of another
    # (e.g. "i" vs "i'm") cannot shadow the longer one.
    keys = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')
    return pattern.sub(lambda match: contractions_dict[match.group()], text)

def processing(text):
    """Normalize raw text before tokenization.

    Lower-cases the text, expands contractions, drops non-ASCII characters,
    removes URLs and e-mail-like tokens, replaces punctuation with spaces and
    collapses runs of whitespace. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    result = expand_contractions(text.lower(), contractions_dict)
    result = result.encode("ascii", errors="ignore").decode()
    # Order matters: URLs and e-mails are removed before punctuation is stripped.
    for pattern, replacement in (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\s+", " "),
    ):
        result = re.sub(pattern, replacement, result)
    return result.strip()

def tokenize(text):
    """Tokenize, POS-tag and lemmatize text, dropping stop words and 1-char tokens.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run the full cleaning pipeline (``processing`` then ``tokenize``) on raw text."""
    normalized = processing(text)
    tokens = tokenize(normalized)
    return tokens

def load_queries_tokens_from_tsv(path):
    """Load cached query tokens from a ``query_id<TAB>tokens`` file.

    Blank lines and lines without a second tab-separated field are skipped.
    Only the second field is used; it is split on whitespace.

    Returns:
        Dict mapping query id (str) to its list of tokens.
    """
    tokens_by_query = {}
    with open(path, encoding="utf-8") as src:
        for raw_line in src:
            stripped = raw_line.strip()
            columns = stripped.split("\t") if stripped else []
            if len(columns) >= 2:
                tokens_by_query[str(columns[0])] = columns[1].split()
    return tokens_by_query

# Load the query tokens
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# Load the TF-IDF model
# NOTE(review): joblib.load unpickles the file, so only load artifacts you created yourself.
tfidf_data = joblib.load(tfidf_model_path)
vectorizer = tfidf_data["vectorizer"]
X = tfidf_data["vectors"]
tfidf_doc_ids = tfidf_data["doc_ids"]

# Load the embedding model and the document embeddings
embedding_model_path = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_path)
embedding_data = joblib.load(embedding_file_path)
doc_embeddings = embedding_data["embeddings"]
# NOTE(review): hybrid_rank labels its results with tfidf_doc_ids only and
# embedding_doc_ids is never compared against it, so both artifacts are
# assumed to list the documents in the same order — confirm.
embedding_doc_ids = embedding_data["doc_ids"]

def represent_query_tfidf(tokens):
    """Vectorize query tokens with the loaded TF-IDF ``vectorizer`` (one-row result)."""
    query_string = " ".join(tokens)
    return vectorizer.transform([query_string])

def represent_query_embedding(query_text):
    """Encode a single query string with ``embedding_model`` and return its vector."""
    batch = embedding_model.encode([query_text], convert_to_numpy=True)
    return batch[0]

def hybrid_rank(query_tokens, query_text, top_k=10, alpha=0.5):
    """Rank documents by a weighted sum of TF-IDF and embedding cosine similarity.

    Args:
        query_tokens: Cleaned tokens used for the TF-IDF representation.
        query_text: Text passed to the embedding model.
        top_k: Number of results to return.
        alpha: Weight of the TF-IDF similarity; the embedding similarity gets
            ``1 - alpha``.

    Returns:
        List of ``(doc_id, hybrid_score)`` pairs, best first, with ids taken
        from ``tfidf_doc_ids``.

    NOTE(review): assumes the rows of ``X`` and ``doc_embeddings`` refer to the
    same documents in the same order — confirm.
    """
    lexical_scores = cosine_similarity(represent_query_tfidf(query_tokens), X).flatten()
    semantic_scores = cosine_similarity(
        [represent_query_embedding(query_text)], doc_embeddings
    ).flatten()

    blended = alpha * lexical_scores + (1 - alpha) * semantic_scores
    best = np.argsort(-blended)[:top_k]
    return [(tfidf_doc_ids[idx], blended[idx]) for idx in best]

# Run retrieval for every query
retrieved_docs_dict = {}
for qid, tokens in tqdm(queries_tokens.items(), desc="Hybrid Retrieval"):
    # The embedding model receives the cleaned tokens joined back into a string.
    text = " ".join(tokens)
    results = hybrid_rank(tokens, text, top_k=10, alpha=0.5)
    retrieved_docs_dict[qid] = results

# Evaluation (same as before)
def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a tab-separated qrels file.

    The first line of the file is treated as a header and replaced by the
    column names ``query_id``, ``doc_id`` and ``relevance``. Relevance values
    that cannot be parsed as numbers are treated as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: ``qrel_dict[qid][doc_id]`` is the
        integer relevance, and ``relevant_set[qid]`` is the set of doc ids whose
        relevance is greater than 0. Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    parsed_relevance = pd.to_numeric(judgments["relevance"], errors="coerce")
    judgments["relevance"] = parsed_relevance.fillna(0).astype(int)

    graded = defaultdict(dict)
    positives = defaultdict(set)
    for _, record in judgments.iterrows():
        query_key = str(record["query_id"])
        doc_key = str(record["doc_id"])
        grade = record["relevance"]
        graded[query_key][doc_key] = grade
        if grade > 0:
            positives[query_key].add(doc_key)
    return graded, positives

def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping the ordered doc ids per query."""
    ids_by_query = {}
    for qid, scored_docs in retrieved.items():
        ordered_ids = []
        for doc_id, _score in scored_docs:
            ordered_ids.append(doc_id)
        ids_by_query[qid] = ordered_ids
    return ids_by_query

def calculate_map(qrels, retrieved):
    """Mean Average Precision, as a percentage.

    Only queries with at least one judged-relevant document (relevance > 0)
    are averaged. Each query's precision sum is divided by its total number of
    relevant documents. Returns 0 when no query qualifies.
    """
    average_precisions = []
    for qid, judged in qrels.items():
        total_relevant = sum(1 for grade in judged.values() if grade > 0)
        if not total_relevant:
            continue
        found, precision_sum = 0, 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if judged.get(doc, 0) > 0:
                found += 1
                precision_sum += found / rank
        average_precisions.append(precision_sum / total_relevant)
    if not average_precisions:
        return 0
    return np.mean(average_precisions) * 100

def calculate_mrr(qrels, retrieved):
    """Mean Reciprocal Rank over every query in ``qrels``, as a percentage.

    A query contributes 1/rank of its first retrieved document judged relevant
    (relevance > 0), or 0 when none of its retrieved documents is relevant.
    Returns 0 for empty ``qrels`` (as ``calculate_map`` does) instead of NaN
    plus a NumPy RuntimeWarning.
    """
    scores = []
    for qid, rel_docs in qrels.items():
        reciprocal = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if rel_docs.get(doc, 0) > 0:
                reciprocal = 1 / rank
                break
        scores.append(reciprocal)
    return np.mean(scores) * 100 if scores else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision over every query in ``qrels``, as a percentage.

    Per-query precision is (# retrieved docs judged relevant) / (# retrieved
    docs); a query with nothing retrieved scores 0. Returns 0 for empty
    ``qrels`` (as ``calculate_map`` does) instead of NaN.
    """
    per_query = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid)
        if not ret_docs:
            per_query.append(0)
            continue
        hits = sum(1 for d in ret_docs if rel_docs.get(d, 0) > 0)
        per_query.append(hits / len(ret_docs))
    return np.mean(per_query) * 100 if per_query else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Per-query recall and its mean, both as percentages.

    Args:
        qrels: Unused; kept so existing call sites stay valid.
        real_relevant: Mapping query id -> set of relevant doc ids.
        retrieved: Mapping query id -> list of retrieved doc ids.

    Returns:
        Tuple ``(mean_recall, recall_scores)`` where ``recall_scores`` maps each
        query id in ``retrieved`` to |retrieved ∩ relevant| / |relevant| * 100
        (0 when the query has no relevant documents). The mean is 0 for empty
        ``retrieved`` instead of NaN.
    """
    recall_scores = {}
    for qid, ret_docs in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recall = len(set(ret_docs) & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    mean_recall = np.mean(list(recall_scores.values())) if recall_scores else 0
    return mean_recall, recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    """Precision@k per query and its mean, both as percentages.

    Only the first ``k`` retrieved docs of each query are considered and the
    hit count is always divided by ``k``, even if fewer docs were retrieved.

    Returns:
        Tuple ``(mean_precision, per_query)`` where ``per_query`` maps query id
        to its precision@k. The mean is 0 for empty ``retrieved`` instead of NaN.
    """
    precisions = {}
    for qid, ranked in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ranked[:k] if rel_docs.get(d, 0) > 0)
        precisions[qid] = (hits / k) * 100
    mean_precision = np.mean(list(precisions.values())) if precisions else 0
    return mean_precision, precisions

def cal_evaluations(dataset_name="quora"):
    """Print MAP, MRR, mean precision, mean recall and P@10 for the hybrid run.

    Reads the module-level ``retrieved_docs_dict`` and the qrels at the default
    path; ``dataset_name`` is only used in the printed header.
    """
    judgments, relevant_sets = get_qrels()
    ranked_ids = get_retrieved_docs_formatted(retrieved_docs_dict)

    print(f"\n== Evaluation for dataset: {dataset_name} ==")

    map_pct = calculate_map(judgments, ranked_ids)
    print("MAP:", round(map_pct, 2), "%")

    mrr_pct = calculate_mrr(judgments, ranked_ids)
    print("MRR:", round(mrr_pct, 2), "%")

    precision_pct = calculate_mean_precision(judgments, ranked_ids)
    print("Mean Precision:", round(precision_pct, 2), "%")

    recall_pct, _per_query_recall = calculate_mean_recall(judgments, relevant_sets, ranked_ids)
    print("Mean Recall:", round(recall_pct, 2), "%")

    p_at_10_pct, _per_query_precision = calculate_precision_at_k(judgments, ranked_ids, k=10)
    print("Mean Precision@10:", round(p_at_10_pct, 2), "%")

# Run the evaluation
cal_evaluations("quora")



Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
Hybrid Retrieval: 100%|██████████| 5000/5000 [1:23:51<00:00,  1.01s/it]



== Evaluation for dataset: quora ==
MAP: 74.16 %
MRR: 77.68 %
Mean Precision: 12.15 %
Mean Recall: 89.22 %
Mean Precision@10: 12.15 %


# EMBEDDING WITH VECTOR STORE (FAISS)

In [None]:
!pip install nltk==3.8.1 contractions scikit-learn tqdm faiss-cpu
!pip install -U sentence-transformers

import os, re, string, time, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# ======== Download NLTK resources ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== Path setup ========
DATASET_NAME = "quora"
BASE_PATH = "/content/drive/MyDrive/dataset_quora_dev/"
DOCS_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/quora_cleaned_docs_beir"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
# Relative path: the lightly cleaned queries are written to the current working directory.
CLEAN_QUERIES_PATH = "light_cleaned_queries.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_beir.joblib")
FAISS_INDEX_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding.index")
FAISS_META_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_meta.json")
TOP_K = 50
RETRIEVAL_MODE = "faiss"  # choose between "cosine" and "faiss"

# Sentence-embedding model loaded from the local Drive copy.
bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== Text cleaning ========
def light_clean(text):
    """Lightly normalize text without tokenizing or lemmatizing it.

    Lower-cases, removes URLs and e-mail-like tokens, drops non-ASCII
    characters, replaces punctuation with spaces and collapses whitespace.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", text.lower())
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_only = without_emails.encode("ascii", errors="ignore").decode()
    without_punct = re.sub(f"[{re.escape(string.punctuation)}]", " ", ascii_only)
    return re.sub(r"\s+", " ", without_punct).strip()

# ======== Load or generate document embeddings ========
def load_or_generate_doc_embeddings():
    """Return the document embeddings, generating and caching them if needed.

    If ``EMBEDDING_PATH`` exists it is loaded with joblib and returned as-is.
    Otherwise the documents at ``DOCS_TSV_PATH`` are encoded with
    ``bert_model`` (normalized embeddings), the result is dumped to
    ``EMBEDDING_PATH`` and the doc ids are written to ``FAISS_META_PATH``.

    Returns:
        On the generation path, a dict with "embeddings", "doc_ids" (strings)
        and "raw_docs" (doc id -> original text). On the cache path, whatever
        object was stored in ``EMBEDDING_PATH``.
    """
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    # Fall back to cleaning the raw text when the cached column is missing.
    if "light_clean_text" not in df_docs.columns:
        df_docs["light_clean_text"] = df_docs["text"].astype(str).apply(light_clean)

    texts = df_docs["light_clean_text"].tolist()
    vectors = bert_model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    ids = df_docs["doc_id"].astype(str).tolist()
    payload = {
        "embeddings": vectors,
        "doc_ids": ids,
        "raw_docs": dict(zip(ids, df_docs["text"])),
    }

    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump(payload, EMBEDDING_PATH)

    # Save the doc ids as metadata for a FAISS index built later.
    with open(FAISS_META_PATH, "w", encoding="utf-8") as f:
        json.dump(ids, f)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return payload


def build_and_save_faiss_index(embeddings, index_path, meta_path, doc_ids):
    """Build a flat L2 FAISS index over the embeddings and persist it.

    Args:
        embeddings: 2-D array of document vectors (cast to float32 before adding).
        index_path: Where the FAISS index file is written.
        meta_path: Where the doc ids are written as JSON.
        doc_ids: Doc ids in the same order as the rows of ``embeddings``.
    """
    vectors = embeddings.astype("float32")
    # Use whichever index type suits the data; a flat L2 index is used here.
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(vectors)
    faiss.write_index(flat_index, index_path)

    # Save the metadata (doc_ids) so search positions can be mapped back to ids.
    with open(meta_path, "w", encoding="utf-8") as meta_file:
        json.dump(doc_ids, meta_file)

    print(f"✅ تم بناء وحفظ فهرس FAISS في: {index_path}")


# ======== Load the queries ========
def load_and_clean_queries(path):
    """Read the queries TSV, lightly clean each query and cache the result.

    The file is read with its first line as header and every column as string;
    it must provide ``query_id`` and ``text`` columns. The cleaned text is added
    as ``light_clean_text`` and also written to ``CLEAN_QUERIES_PATH`` as
    ``query_id<TAB>light_clean_text`` lines.

    Returns:
        The loaded DataFrame including the ``light_clean_text`` column.
    """
    print(f"📥 تحميل الاستعلامات من: {path}")
    queries = pd.read_csv(path, sep="\t", header=0, dtype=str)
    queries["light_clean_text"] = queries["text"].apply(light_clean)

    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as out:
        for row in queries.to_dict("records"):
            out.write(f"{row['query_id']}\t{row['light_clean_text']}\n")

    return queries

def load_queries_tokens_from_tsv(path):
    """Load cached query tokens from a ``query_id<TAB>tokens`` file.

    Blank lines and lines without a second tab-separated field are skipped.
    Only the second field is used; it is split on whitespace.

    Returns:
        Dict mapping query id (str) to its list of tokens.
    """
    with open(path, encoding="utf-8") as handle:
        rows = (raw.strip().split("\t") for raw in handle if raw.strip())
        return {str(cols[0]): cols[1].split() for cols in rows if len(cols) >= 2}

def encode_queries(queries_tokens_dict, model):
    """Encode every query in one batch and map each query id to its embedding.

    Args:
        queries_tokens_dict: Mapping query id -> list of tokens; the tokens are
            joined with spaces before encoding.
        model: Object exposing ``encode(texts, convert_to_numpy=...,
            normalize_embeddings=...)``.

    Returns:
        Dict mapping query id to the corresponding (normalized) embedding.
    """
    ids, texts = [], []
    for qid, tokens in queries_tokens_dict.items():
        ids.append(qid)
        texts.append(" ".join(tokens))
    vectors = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return dict(zip(ids, vectors))

# ======== Retrieval ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    """Brute-force cosine-similarity retrieval over all document embeddings.

    Returns:
        Dict mapping query id to a list of ``(doc_id, similarity)`` pairs, most
        similar first, at most ``top_k`` long.
    """
    def _rank(query_vector):
        # Similarity of this query to every document, best positions first.
        similarities = cosine_similarity([query_vector], doc_embeddings).flatten()
        best = np.argsort(-similarities)[:top_k]
        return [(doc_ids[pos], float(similarities[pos])) for pos in best]

    return {qid: _rank(vector) for qid, vector in query_embeddings_dict.items()}

def retrieve_with_faiss(query_embeddings_dict, index_path, doc_ids, top_k=10):
    """Retrieve the nearest documents for every query from a saved FAISS index.

    Args:
        query_embeddings_dict: Mapping query id -> embedding vector.
        index_path: Path of the FAISS index file to read.
        doc_ids: Doc ids in the same order as the vectors stored in the index.
        top_k: Number of neighbours requested per query.

    Returns:
        Dict mapping query id to a list of ``(doc_id, score)`` pairs in the
        order returned by ``index.search``. The score is the raw value FAISS
        reports; for the ``IndexFlatL2`` built in this notebook that is a
        distance, so smaller means closer. An empty input returns ``{}``.
    """
    if not query_embeddings_dict:
        # An empty query matrix is 1-D and cannot be passed to index.search.
        return {}

    index = faiss.read_index(index_path)
    query_ids = list(query_embeddings_dict.keys())
    query_embeddings = np.array([query_embeddings_dict[qid] for qid in query_ids]).astype("float32")
    scores, indices = index.search(query_embeddings, top_k)

    results = {}
    for i, qid in enumerate(query_ids):
        # FAISS pads with -1 when it finds fewer than top_k neighbours; without
        # this filter doc_ids[-1] would silently return the last document.
        results[qid] = [
            (doc_ids[idx], float(scores[i][j]))
            for j, idx in enumerate(indices[i])
            if idx >= 0
        ]
    return results

def load_doc_ids_from_meta(path):
    """Read the JSON metadata file at ``path`` and return its parsed content."""
    with open(path, "r", encoding="utf-8") as meta_file:
        doc_ids = json.load(meta_file)
    return doc_ids

# ======== Load QRELS ========
def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from a tab-separated qrels file.

    The first line of the file is treated as a header and replaced by the
    column names ``query_id``, ``doc_id`` and ``relevance``. Relevance values
    that cannot be parsed as numbers are treated as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: ``qrel_dict[qid][doc_id]`` is the
        integer relevance, and ``relevant_set[qid]`` is the set of doc ids whose
        relevance is greater than 0. Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    parsed_relevance = pd.to_numeric(judgments["relevance"], errors="coerce")
    judgments["relevance"] = parsed_relevance.fillna(0).astype(int)

    graded = defaultdict(dict)
    positives = defaultdict(set)
    for _, record in judgments.iterrows():
        query_key = str(record["query_id"])
        doc_key = str(record["doc_id"])
        grade = record["relevance"]
        graded[query_key][doc_key] = grade
        if grade > 0:
            positives[query_key].add(doc_key)
    return graded, positives

# ======== Evaluation ========
def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping the ordered doc ids per query."""
    ids_by_query = {}
    for qid, scored_docs in retrieved.items():
        ordered_ids = []
        for doc_id, _score in scored_docs:
            ordered_ids.append(doc_id)
        ids_by_query[qid] = ordered_ids
    return ids_by_query

def calculate_map(qrels, retrieved):
    """Mean average precision, as a percentage.

    Queries without any judgement greater than 0 are left out of the mean.
    Returns 0 when no query contributes.
    """
    per_query_ap = []
    for qid, judgements in qrels.items():
        positives = {doc for doc, grade in judgements.items() if grade > 0}
        if not positives:
            continue
        found = 0
        precision_total = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc not in positives:
                continue
            found += 1
            precision_total += found / rank
        per_query_ap.append(precision_total / len(positives))
    if not per_query_ap:
        return 0
    return np.mean(per_query_ap) * 100

def calculate_mrr(qrels, retrieved):
    """Mean reciprocal rank over every query in ``qrels``, as a percentage.

    A query whose ranked list contains no document judged greater than 0
    contributes 0. Returns 0 when ``qrels`` is empty, where ``np.mean`` of an
    empty list would give ``nan`` with a runtime warning.
    """
    reciprocal_ranks = []
    for qid, rel_docs in qrels.items():
        rr = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if doc in rel_docs and rel_docs[doc] > 0:
                rr = 1 / rank
                break
        reciprocal_ranks.append(rr)
    return np.mean(reciprocal_ranks) * 100 if reciprocal_ranks else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision of the full ranked lists, as a percentage.

    For each query in ``qrels`` the precision is the share of its retrieved
    documents judged greater than 0; queries with nothing retrieved count as
    0. Returns 0 when ``qrels`` is empty, where ``np.mean`` of an empty list
    would give ``nan``.
    """
    precisions = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid)
        if not ret_docs:
            precisions.append(0)
            continue
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions.append(hits / len(ret_docs))
    return np.mean(precisions) * 100 if precisions else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Per-query and mean recall, both as percentages.

    Args:
        qrels: accepted for signature compatibility; not used.
        real_relevant: mapping of query id -> set of relevant doc ids.
        retrieved: mapping of query id -> list of retrieved doc ids.

    Returns:
        ``(mean_recall, recall_scores)``. The mean is taken over the queries
        present in ``retrieved``; a query with no relevant documents scores 0.
        An empty ``retrieved`` gives ``(0, {})``, where ``np.mean`` of an
        empty list would give ``nan``.
    """
    recall_scores = {}
    for qid, ranked in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recall = len(set(ranked) & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    if not recall_scores:
        return 0, recall_scores
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    """Per-query and mean precision@k, both as percentages.

    Hits among the first ``k`` retrieved documents are divided by ``k`` itself,
    even when fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision, precisions)`` with ``precisions`` keyed by the query
        ids of ``retrieved``. An empty ``retrieved`` gives ``(0, {})``, where
        ``np.mean`` of an empty list would give ``nan``.
    """
    precisions = {}
    for qid, ranked in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ranked[:k] if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    if not precisions:
        return 0, precisions
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(retrieved_docs, dataset_name="quora"):
    """Print MAP, MRR, mean precision, mean recall and precision@10.

    Args:
        retrieved_docs: mapping of query id -> list of ``(doc_id, score)``.
        dataset_name: label shown in the report header.
    """
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    ranked_ids = get_retrieved_docs_formatted(retrieved_docs)
    # Doc ids are stringified so they compare equal to the qrels keys.
    retrieved_docs_str = {qid: list(map(str, docs)) for qid, docs in ranked_ids.items()}

    def report(label, value):
        print(label, round(value, 2), "%")

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    report("MAP:", calculate_map(qrel_dict, retrieved_docs_str))
    report("MRR:", calculate_mrr(qrel_dict, retrieved_docs_str))
    report("Mean Precision:", calculate_mean_precision(qrel_dict, retrieved_docs_str))
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    report("Mean Recall:", mean_recall)
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    report("Mean Precision@10:", mean_precision_at_k)

# ======== التشغيل الكامل ========
if __name__ == "__main__":
    # Obtain the document embeddings and write the FAISS index + id metadata
    # from them. This runs on every execution, whichever retrieval mode is set.
    embedding_data = load_or_generate_doc_embeddings()
    build_and_save_faiss_index(embedding_data["embeddings"], FAISS_INDEX_PATH, FAISS_META_PATH, embedding_data["doc_ids"])
    doc_embeddings = embedding_data["embeddings"]
    doc_ids = embedding_data["doc_ids"]

    # NOTE(review): queries_df is not used below; the call is presumably kept
    # for the file it produces, which the next line reads back — confirm.
    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    # Only the retrieval call itself is timed in either branch.
    if RETRIEVAL_MODE == "cosine":
        print("\n🚀 الاسترجاع باستخدام cosine_similarity...")
        start = time.time()
        retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    elif RETRIEVAL_MODE == "faiss":
        print("\n⚡ الاسترجاع باستخدام FAISS...")
        # The ids are re-read from the metadata file written next to the index.
        doc_ids = load_doc_ids_from_meta(FAISS_META_PATH)
        start = time.time()
        retrieved_docs = retrieve_with_faiss(query_embeddings, FAISS_INDEX_PATH, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    else:
        raise ValueError(f"❌ Retrieval mode '{RETRIEVAL_MODE}' not supported.")

    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام في {round(duration, 2)} ثانية")

    cal_evaluations(retrieved_docs, dataset_name=f"{DATASET_NAME}-{RETRIEVAL_MODE}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/quora_embedding_beir.joblib
✅ تم بناء وحفظ فهرس FAISS في: /content/drive/MyDrive/embedding_model_joblib_file/quora_embedding.index
📥 تحميل الاستعلامات من: /content/drive/MyDrive/dataset_quora_dev/queries.tsv
🧠 تم تمثيل 5000 استعلام

⚡ الاسترجاع باستخدام FAISS...
📥 تم استرجاع نتائج لـ 5000 استعلام في 38.29 ثانية

📊 بدء التقييم...

== Evaluation for dataset: quora-faiss ==
MAP: 84.07 %
MRR: 86.57 %
Mean Precision: 2.93 %
Mean Recall: 98.84 %
Mean Precision@10: 13.22 %


# EMBEDDING WITH COSINE

In [None]:
!pip install nltk==3.8.1 contractions scikit-learn tqdm faiss-cpu
!pip install -U sentence-transformers

import os, re, string, time, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# ======== Download NLTK resources ========
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# NOTE(review): neither name is referenced by the functions defined in this cell.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# ======== Path configuration ========
DATASET_NAME = "quora"
BASE_PATH = "/content/drive/MyDrive/dataset_quora_dev/"
# NOTE(review): no file extension here, while another cell of this notebook
# points at ".../quora_cleaned_docs_beir.tsv" — confirm which file exists.
DOCS_TSV_PATH = "/content/drive/MyDrive/utils/clean_docs/quora_cleaned_docs_beir"
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
BERT_MODEL_PATH = "/content/drive/MyDrive/embedding_models/Bert_model/all-MiniLM-L6-v2"
# Relative path: the cleaned queries land in the current working directory.
CLEAN_QUERIES_PATH = "light_cleaned_queries.tsv"
EMBEDDING_OUTPUT_PATH = "/content/drive/MyDrive/embedding_model_joblib_file"
EMBEDDING_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_beir.joblib")
FAISS_INDEX_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding.index")
FAISS_META_PATH = os.path.join(EMBEDDING_OUTPUT_PATH, f"{DATASET_NAME}_embedding_meta.json")
# Number of documents retrieved per query.
TOP_K = 50
RETRIEVAL_MODE = "cosine"  # choose between "cosine" and "faiss"

# Sentence-embedding model loaded from BERT_MODEL_PATH; used to encode documents and queries.
bert_model = SentenceTransformer(BERT_MODEL_PATH)

# ======== تنظيف نصوص ========
def light_clean(text):
    """Lightly normalise text; non-string input yields an empty string.

    Lower-cases, removes URLs and e-mail-like tokens, drops non-ASCII
    characters, turns punctuation into spaces and collapses whitespace.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text.lower()
    # URLs first, then anything shaped like an e-mail address.
    for pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+"):
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    cleaned = re.sub(punctuation_class, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# ======== تحميل أو توليد تمثيلات المستندات ========
def load_or_generate_doc_embeddings():
    """Return the document embeddings, reusing the cached file when present.

    If ``EMBEDDING_PATH`` exists its content is returned as-is. Otherwise the
    documents are read from ``DOCS_TSV_PATH``, encoded with ``bert_model``, and
    the result is saved to ``EMBEDDING_PATH``; the doc ids are also written to
    ``FAISS_META_PATH``.

    Returns:
        dict with keys ``"embeddings"``, ``"doc_ids"`` (list of str) and
        ``"raw_docs"`` (doc id -> original text) when freshly generated;
        whatever the cache file holds otherwise.
    """
    if os.path.exists(EMBEDDING_PATH):
        print(f"📦 تحميل التمثيلات من: {EMBEDDING_PATH}")
        # NOTE(security): joblib.load unpickles arbitrary objects — only load
        # cache files you created yourself.
        return joblib.load(EMBEDDING_PATH)

    print("🚀 لم يتم العثور على التمثيلات، جاري التوليد...")
    df_docs = pd.read_csv(DOCS_TSV_PATH, sep="\t")
    print(f"🗂️ عدد المستندات المحملة: {len(df_docs)}")

    if "light_clean_text" in df_docs.columns:
        # Empty cells in the TSV are read back as NaN; the encoder needs strings.
        texts = df_docs["light_clean_text"].fillna("").astype(str)
    else:
        # fillna first so missing text does not become the literal word "nan".
        texts = df_docs["text"].fillna("").astype(str).apply(light_clean)

    doc_embeddings = bert_model.encode(
        texts.tolist(),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    doc_ids = df_docs["doc_id"].astype(str).tolist()
    embedding_data = {
        "embeddings": doc_embeddings,
        "doc_ids": doc_ids,
        "raw_docs": dict(zip(doc_ids, df_docs["text"])),
    }

    os.makedirs(EMBEDDING_OUTPUT_PATH, exist_ok=True)
    joblib.dump(embedding_data, EMBEDDING_PATH)

    # Save the doc ids as metadata for the FAISS index built later.
    with open(FAISS_META_PATH, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم حفظ التمثيلات في: {EMBEDDING_PATH}")
    return embedding_data


def build_and_save_faiss_index(embeddings, index_path, meta_path, doc_ids):
    """Build a flat L2 FAISS index over ``embeddings`` and persist it.

    Args:
        embeddings: 2-D array with one row per document.
        index_path: destination file for the FAISS index.
        meta_path: destination JSON file for ``doc_ids``.
        doc_ids: ids aligned with the rows of ``embeddings``.

    Raises:
        ValueError: if the number of ids differs from the number of rows,
            which would silently mislabel search results.
    """
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    if len(doc_ids) != vectors.shape[0]:
        raise ValueError(
            f"doc_ids has {len(doc_ids)} entries but embeddings has {vectors.shape[0]} rows"
        )

    # Make sure both destination directories exist before writing.
    for target in (index_path, meta_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)  # exact search on L2 distance
    index.add(vectors)
    faiss.write_index(index, index_path)

    # Save the metadata (doc_ids) next to the index.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(doc_ids, f)

    print(f"✅ تم بناء وحفظ فهرس FAISS في: {index_path}")


# ======== تحميل الاستعلامات ========
def load_and_clean_queries(path):
    """Read the queries TSV, add a ``light_clean_text`` column and export it.

    Every query is also written to ``CLEAN_QUERIES_PATH`` as one
    ``query_id<TAB>cleaned text`` line. Returns the DataFrame.
    """
    print(f"📥 تحميل الاستعلامات من: {path}")
    queries = pd.read_csv(path, sep="\t", header=0, dtype=str)
    queries["light_clean_text"] = queries["text"].apply(light_clean)

    with open(CLEAN_QUERIES_PATH, "w", encoding="utf-8") as out_file:
        id_text_pairs = zip(queries["query_id"], queries["light_clean_text"])
        out_file.writelines(
            f"{query_id}\t{cleaned}\n" for query_id, cleaned in id_text_pairs
        )

    return queries

def load_queries_tokens_from_tsv(path):
    """Parse ``query_id<TAB>text`` lines into ``{query_id: [tokens]}``.

    Blank lines and lines without a second tab-separated field are skipped;
    fields after the second are ignored.
    """
    tokens_dict = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) >= 2:
                query_id, token_text = fields[0], fields[1]
                tokens_dict[query_id] = token_text.split()
    return tokens_dict

def encode_queries(queries_tokens_dict, model):
    """Encode every query with ``model`` and return ``{query_id: embedding}``.

    Each token list is joined with single spaces before being passed to
    ``model.encode`` with numpy output and normalised embeddings requested.
    """
    joined = {qid: " ".join(tokens) for qid, tokens in queries_tokens_dict.items()}
    vectors = model.encode(
        list(joined.values()),
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return {qid: vector for qid, vector in zip(joined, vectors)}

# ======== استرجاع ========
def retrieve_with_embedding(query_embeddings_dict, doc_embeddings, doc_ids, top_k=10):
    """Rank all documents against each query by cosine similarity.

    Args:
        query_embeddings_dict: mapping of query id -> 1-D embedding vector.
        doc_embeddings: 2-D array-like with one row per document.
        doc_ids: sequence of document ids aligned with the rows of
            ``doc_embeddings``.
        top_k: maximum number of documents returned per query.

    Returns:
        dict mapping query id -> list of ``(doc_id, score)`` tuples ordered by
        descending cosine similarity; ``score`` is a Python float. With no
        documents every query maps to an empty list.
    """
    doc_matrix = np.asarray(doc_embeddings)
    if doc_matrix.size == 0:
        return {qid: [] for qid in query_embeddings_dict}

    # Normalise the document matrix once. Doing it inside the loop repeats an
    # O(n_docs * dim) pass for every single query.
    doc_norms = np.linalg.norm(doc_matrix, axis=1, keepdims=True)
    doc_norms[doc_norms == 0] = 1.0  # all-zero rows keep a similarity of 0
    doc_unit = doc_matrix / doc_norms

    results = {}
    for qid, q_embed in query_embeddings_dict.items():
        q_vec = np.asarray(q_embed).ravel()
        q_norm = np.linalg.norm(q_vec)
        if q_norm:
            q_vec = q_vec / q_norm
        sims = doc_unit @ q_vec
        ranked_idx = np.argsort(-sims)[:top_k]
        results[qid] = [(doc_ids[i], float(sims[i])) for i in ranked_idx]
    return results

def retrieve_with_faiss(query_embeddings_dict, index_path, doc_ids, top_k=10):
    """Search a saved FAISS index for the nearest documents of every query.

    Args:
        query_embeddings_dict: mapping of query id -> 1-D embedding vector.
        index_path: path of the index file read with ``faiss.read_index``.
        doc_ids: sequence of document ids; position ``i`` must be the id of
            the ``i``-th vector stored in the index.
        top_k: number of neighbours requested per query.

    Returns:
        dict mapping query id -> list of ``(doc_id, score)`` tuples in the
        order FAISS returned them. ``score`` is whatever the stored index
        reports (a distance for L2 indexes, so larger is not necessarily
        better).
    """
    index = faiss.read_index(index_path)
    query_ids = list(query_embeddings_dict.keys())
    if not query_ids:
        # An empty batch is not a valid 2-D search input.
        return {}
    query_embeddings = np.ascontiguousarray(
        [query_embeddings_dict[qid] for qid in query_ids], dtype="float32"
    )
    scores, indices = index.search(query_embeddings, top_k)

    results = {}
    for row, qid in enumerate(query_ids):
        # FAISS pads the result with index -1 when it has fewer than top_k
        # neighbours; doc_ids[-1] would silently yield the last document.
        results[qid] = [
            (doc_ids[idx], float(score))
            for idx, score in zip(indices[row], scores[row])
            if idx >= 0
        ]
    return results

def load_doc_ids_from_meta(path):
    """Read the UTF-8 JSON file at ``path`` and return its decoded content."""
    with open(path, mode="r", encoding="utf-8") as meta_file:
        payload = json.load(meta_file)
    return payload

# ======== تحميل QRELS ========
def get_qrels(path=QRELS_PATH):
    """Load relevance judgements from a tab-separated qrels file.

    The first line of the file is consumed as a header and the three columns
    are read as query id, document id and relevance. Relevance values that are
    not numeric are treated as 0.

    Args:
        path: location of the qrels file.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict`` maps query id ->
        ``{doc id: relevance}`` and ``relevant_set`` maps query id -> set of
        doc ids whose relevance is greater than 0. All ids are strings.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    relevance = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate column-wise: iterrows() builds a Series per row, which is slow
    # and can upcast mixed numeric columns before the ids are stringified.
    rows = zip(df["query_id"].tolist(), df["doc_id"].tolist(), relevance.tolist())
    for raw_qid, raw_docid, rel in rows:
        qid, docid = str(raw_qid), str(raw_docid)
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

# ======== التقييم ========
def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping only the doc ids per query."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        ids_only = []
        for doc_id, _score in scored_docs:
            ids_only.append(doc_id)
        formatted[qid] = ids_only
    return formatted

def calculate_map(qrels, retrieved):
    """Mean average precision, as a percentage.

    Queries without any judgement greater than 0 are left out of the mean.
    Returns 0 when no query contributes.
    """
    per_query_ap = []
    for qid, judgements in qrels.items():
        positives = {doc for doc, grade in judgements.items() if grade > 0}
        if not positives:
            continue
        found = 0
        precision_total = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc not in positives:
                continue
            found += 1
            precision_total += found / rank
        per_query_ap.append(precision_total / len(positives))
    if not per_query_ap:
        return 0
    return np.mean(per_query_ap) * 100

def calculate_mrr(qrels, retrieved):
    """Mean reciprocal rank over every query in ``qrels``, as a percentage.

    A query whose ranked list contains no document judged greater than 0
    contributes 0. Returns 0 when ``qrels`` is empty, where ``np.mean`` of an
    empty list would give ``nan`` with a runtime warning.
    """
    reciprocal_ranks = []
    for qid, rel_docs in qrels.items():
        rr = 0
        for rank, doc in enumerate(retrieved.get(qid, []), 1):
            if doc in rel_docs and rel_docs[doc] > 0:
                rr = 1 / rank
                break
        reciprocal_ranks.append(rr)
    return np.mean(reciprocal_ranks) * 100 if reciprocal_ranks else 0

def calculate_mean_precision(qrels, retrieved):
    """Mean precision of the full ranked lists, as a percentage.

    For each query in ``qrels`` the precision is the share of its retrieved
    documents judged greater than 0; queries with nothing retrieved count as
    0. Returns 0 when ``qrels`` is empty, where ``np.mean`` of an empty list
    would give ``nan``.
    """
    precisions = []
    for qid, rel_docs in qrels.items():
        ret_docs = retrieved.get(qid)
        if not ret_docs:
            precisions.append(0)
            continue
        hits = sum(1 for d in ret_docs if d in rel_docs and rel_docs[d] > 0)
        precisions.append(hits / len(ret_docs))
    return np.mean(precisions) * 100 if precisions else 0

def calculate_mean_recall(qrels, real_relevant, retrieved):
    """Per-query and mean recall, both as percentages.

    Args:
        qrels: accepted for signature compatibility; not used.
        real_relevant: mapping of query id -> set of relevant doc ids.
        retrieved: mapping of query id -> list of retrieved doc ids.

    Returns:
        ``(mean_recall, recall_scores)``. The mean is taken over the queries
        present in ``retrieved``; a query with no relevant documents scores 0.
        An empty ``retrieved`` gives ``(0, {})``, where ``np.mean`` of an
        empty list would give ``nan``.
    """
    recall_scores = {}
    for qid, ranked in retrieved.items():
        rel_docs = real_relevant.get(qid, set())
        recall = len(set(ranked) & rel_docs) / len(rel_docs) if rel_docs else 0
        recall_scores[qid] = recall * 100
    if not recall_scores:
        return 0, recall_scores
    return np.mean(list(recall_scores.values())), recall_scores

def calculate_precision_at_k(qrels, retrieved, k=10):
    """Per-query and mean precision@k, both as percentages.

    Hits among the first ``k`` retrieved documents are divided by ``k`` itself,
    even when fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision, precisions)`` with ``precisions`` keyed by the query
        ids of ``retrieved``. An empty ``retrieved`` gives ``(0, {})``, where
        ``np.mean`` of an empty list would give ``nan``.
    """
    precisions = {}
    for qid, ranked in retrieved.items():
        rel_docs = qrels.get(qid, {})
        hits = sum(1 for d in ranked[:k] if d in rel_docs and rel_docs[d] > 0)
        precisions[qid] = (hits / k) * 100
    if not precisions:
        return 0, precisions
    return np.mean(list(precisions.values())), precisions

def cal_evaluations(retrieved_docs, dataset_name="quora"):
    """Print MAP, MRR, mean precision, mean recall and precision@10.

    Args:
        retrieved_docs: mapping of query id -> list of ``(doc_id, score)``.
        dataset_name: label shown in the report header.
    """
    print("\n📊 بدء التقييم...")
    qrel_dict, real_relevant = get_qrels()
    ranked_ids = get_retrieved_docs_formatted(retrieved_docs)
    # Doc ids are stringified so they compare equal to the qrels keys.
    retrieved_docs_str = {qid: list(map(str, docs)) for qid, docs in ranked_ids.items()}

    def report(label, value):
        print(label, round(value, 2), "%")

    print(f"\n== Evaluation for dataset: {dataset_name} ==")
    report("MAP:", calculate_map(qrel_dict, retrieved_docs_str))
    report("MRR:", calculate_mrr(qrel_dict, retrieved_docs_str))
    report("Mean Precision:", calculate_mean_precision(qrel_dict, retrieved_docs_str))
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    report("Mean Recall:", mean_recall)
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    report("Mean Precision@10:", mean_precision_at_k)

# ======== التشغيل الكامل ========
if __name__ == "__main__":
    # Load (or generate) the document embeddings, then rebuild the FAISS index
    # and its id metadata from them. The rebuild runs on every execution, even
    # when RETRIEVAL_MODE is "cosine".
    embedding_data = load_or_generate_doc_embeddings()
    build_and_save_faiss_index(embedding_data["embeddings"], FAISS_INDEX_PATH, FAISS_META_PATH, embedding_data["doc_ids"])
    doc_embeddings = embedding_data["embeddings"]
    doc_ids = embedding_data["doc_ids"]

    # load_and_clean_queries writes CLEAN_QUERIES_PATH, which the next line
    # reads back; queries_df itself is not used below.
    queries_df = load_and_clean_queries(QUERIES_PATH)
    queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES_PATH)
    query_embeddings = encode_queries(queries_tokens, model=bert_model)

    print(f"🧠 تم تمثيل {len(query_embeddings)} استعلام")

    # Only the retrieval call itself is timed in either branch.
    if RETRIEVAL_MODE == "cosine":
        print("\n🚀 الاسترجاع باستخدام cosine_similarity...")
        start = time.time()
        retrieved_docs = retrieve_with_embedding(query_embeddings, doc_embeddings, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    elif RETRIEVAL_MODE == "faiss":
        print("\n⚡ الاسترجاع باستخدام FAISS...")
        # The ids are re-read from the metadata file written next to the index.
        doc_ids = load_doc_ids_from_meta(FAISS_META_PATH)
        start = time.time()
        retrieved_docs = retrieve_with_faiss(query_embeddings, FAISS_INDEX_PATH, doc_ids, top_k=TOP_K)
        duration = time.time() - start
    else:
        raise ValueError(f"❌ Retrieval mode '{RETRIEVAL_MODE}' not supported.")

    print(f"📥 تم استرجاع نتائج لـ {len(retrieved_docs)} استعلام في {round(duration, 2)} ثانية")

    cal_evaluations(retrieved_docs, dataset_name=f"{DATASET_NAME}-{RETRIEVAL_MODE}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


📦 تحميل التمثيلات من: /content/drive/MyDrive/embedding_model_joblib_file/quora_embedding_beir.joblib
✅ تم بناء وحفظ فهرس FAISS في: /content/drive/MyDrive/embedding_model_joblib_file/quora_embedding.index
📥 تحميل الاستعلامات من: /content/drive/MyDrive/dataset_quora_dev/queries.tsv
🧠 تم تمثيل 5000 استعلام

🚀 الاسترجاع باستخدام cosine_similarity...
📥 تم استرجاع نتائج لـ 5000 استعلام في 4256.87 ثانية

📊 بدء التقييم...

== Evaluation for dataset: quora-cosine ==
MAP: 84.06 %
MRR: 86.56 %
Mean Precision: 2.93 %
Mean Recall: 98.84 %
Mean Precision@10: 13.21 %


In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from tqdm import tqdm
import nltk
import contractions

# تحميل موارد NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration
DATASET_NAME = "quora"
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
# JSON inverted index (token -> doc ids) and the pre-tokenised queries file.
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
# Cache file holding the fitted vectorizer, the document matrix and the doc ids.
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared by tokenize() (both) and query_expansion() (stop_words).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb;
    anything else falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN

# دالة توسيع الانقباضات (contractions)
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# دالة تنظيف النصوص مع توسيع الانقباضات
def processing(text: str) -> str:
    """Normalise raw text before tokenisation; non-strings yield ``""``.

    Steps, in order: lower-case, expand contractions, drop non-ASCII
    characters, remove URLs and e-mail-like tokens, replace punctuation with
    spaces, remove standalone numbers of 1-2 digits and of 5 or more digits,
    collapse whitespace.
    """
    if not isinstance(text, str):
        return ""
    normalized = expand_contractions(text.lower())
    normalized = normalized.encode("ascii", errors="ignore").decode()
    # (pattern, replacement) pairs applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def tokenize(text):
    """Tokenise, POS-tag and lemmatise ``text``.

    Stop words and single-character tokens are dropped; the rest are
    lemmatised using the WordNet POS derived from their tag.
    """
    tagged = pos_tag(word_tokenize(text))
    lemmas = []
    for word, tag in tagged:
        if word in stop_words or len(word) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run ``processing`` then ``tokenize`` on ``text`` and return the tokens."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly normalise text; non-string input yields an empty string.

    Lower-cases, removes URLs and e-mail-like tokens, drops non-ASCII
    characters, turns punctuation into spaces and collapses whitespace.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text.lower()
    # URLs first, then anything shaped like an e-mail address.
    for pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+"):
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    cleaned = re.sub(punctuation_class, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# # تحميل البيانات وتخزينها بعد التنظيف
# def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
#     cleaned_data = []
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
#         doc_id = row["doc_id"]
#         raw_text = row["text"]
#         processed_tokens = clean_text(raw_text)
#         processed_text = " ".join(processed_tokens)
#         light_text = light_clean(raw_text)
#         cleaned_data.append({
#             "doc_id": doc_id,
#             "text": raw_text,
#             "processed_text": processed_text,
#             "dataset_name": dataset_name,
#             "light_clean_text": light_text
#         })
#     cleaned_df = pd.DataFrame(cleaned_data)
#     cleaned_df.to_csv(path_out, sep="\t", index=False)

def load_dataset(path):
    """Read a two-column TSV (doc id, text), skipping its first line.

    Missing text values are replaced by empty strings.
    """
    frame = pd.read_csv(path, sep="\t", names=["doc_id", "text"], header=None, skiprows=1)
    return frame.assign(text=frame["text"].fillna(""))

# Raw documents; df["text"] / df["doc_id"] feed the TF-IDF fit when no cached model exists.
df = load_dataset(DOCS_PATH)
# save_cleaned_docs(df)

# # بناء فهرس عكسي من المستندات المنظفة
# def build_inverted_index_from_tsv(tsv_path=CLEANED_TSV_PATH):
#     df = pd.read_csv(tsv_path, sep="\t")
#     index = defaultdict(set)
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🔧 بناء الفهرس"):
#         doc_id = str(row["doc_id"])
#         tokens = str(row["processed_text"]).split()
#         for token in tokens:
#             index[token].add(doc_id)
#     os.makedirs(os.path.dirname(INVERTED_PATH), exist_ok=True)
#     with open(INVERTED_PATH, "w", encoding="utf-8") as f:
#         json.dump({k: list(v) for k, v in index.items()}, f, ensure_ascii=False, indent=2)
#     return index

# inverted_index = build_inverted_index_from_tsv()

# تحميل وتنظيف الاستعلامات
# def load_and_clean_queries(path):
#     data = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             parts = line.strip().split("\t")
#             if len(parts) >= 2:
#                 query_id = parts[0]
#                 text = " ".join(parts[1:])
#                 data.append((query_id, text))
#     df = pd.DataFrame(data, columns=["query_id", "text"])
#     df["clean_text"] = df["text"].apply(clean_text)
#     with open(CLEAN_QUERIES, "w", encoding="utf-8") as f:
#         for _, row in df.iterrows():
#             f.write(f"{row['query_id']}\t{' '.join(row['clean_text'])}\n")
#     return df

# queries_df = load_and_clean_queries(QUERIES_PATH)

# تحميل التوكينات للاستعلامات
def load_queries_tokens_from_tsv(path):
    """Parse ``query_id<TAB>text`` lines into ``{query_id: [tokens]}``.

    Blank lines and lines without a second tab-separated field are skipped;
    fields after the second are ignored.
    """
    tokens_dict = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) >= 2:
                query_id, token_text = fields[0], fields[1]
                tokens_dict[query_id] = token_text.split()
    return tokens_dict

# Pre-tokenised queries (query id -> token list) read from the CLEAN_QUERIES file.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

# دالة استرجاع المستندات عبر الفهرس العكسي
def retrieve_docs(query_tokens, index):
    """Return the ids of every document listed under any query token.

    ``index`` maps token -> iterable of doc ids; tokens absent from it add
    nothing. The result is a list with no duplicates, in set iteration order.
    """
    candidates = set()
    for term in query_tokens:
        postings = index.get(term, [])
        candidates.update(set(postings))
    return list(candidates)

# تحميل الفهرس العكسي
def load_inverted_index(path):
    """Read the UTF-8 JSON file at ``path`` and return its decoded content."""
    with open(path, encoding="utf-8") as index_file:
        inverted = json.load(index_file)
    return inverted

# Loaded once at module level; evaluate_system() looks candidates up in this index.
inverted_index = load_inverted_index(INVERTED_PATH)

# تجهيز نموذج TF-IDF
tqdm.pandas(desc=">> تجهيز النصوص")

if os.path.exists(tfidf_model_path):
    # Reuse the cached vectorizer, document matrix and doc ids.
    # NOTE(security): joblib.load unpickles arbitrary objects — only load
    # cache files you created yourself.
    tfidf_data = joblib.load(tfidf_model_path)
else:
    # Fit on the raw text; cleaning and tokenisation are delegated to the
    # vectorizer through `processing` and `tokenize`.
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    tfidf_data = {
        "vectorizer": vectorizer,
        "vectors": vectorizer.fit_transform(df["text"]),
        "doc_ids": df["doc_id"].astype(str).tolist(),
    }
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump(tfidf_data, tfidf_model_path)

# One in-memory copy serves both branches, so the (large) file is not read a
# second time. `loaded_model` stays bound for any code that refers to it.
loaded_model = tfidf_data
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    """Join ``tokens`` with spaces and transform the text with the module-level ``vectorizer``."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10):
    """Rank candidate documents against a query by cosine similarity.

    Args:
        query_vec: query vector, as accepted by ``cosine_similarity``.
        retrieved_ids: candidate doc ids to rank.
        doc_vectors: document matrix, row-indexable by a list of positions.
        doc_ids: ids aligned with the rows of ``doc_vectors``, or a prebuilt
            ``{doc_id: row}`` dict. Passing the dict avoids rebuilding the
            lookup on every call.
        top_k: maximum number of results.

    Returns:
        list of ``(doc_id, similarity)`` ordered by descending similarity;
        empty when no candidate is known.
    """
    if isinstance(doc_ids, dict):
        id_to_idx = doc_ids
    else:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Keep ids and rows in the same filtered order. Indexing the unfiltered
    # retrieved_ids with subset positions mislabels results whenever a
    # candidate id is missing from the matrix.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    rows = [id_to_idx[doc_id] for doc_id in known_ids]
    sims = cosine_similarity(query_vec, doc_vectors[rows]).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]

# دالة توسعة الاستعلام: تضيف المرادفات عبر WordNet لكل كلمة
from nltk.corpus import wordnet

def query_expansion(tokens):
    """Extend ``tokens`` with WordNet lemma names of every synset of each token.

    Lemma names are lower-cased with underscores turned into spaces; only
    purely alphabetic names that are not stop words are added. Returns a list
    without duplicates.
    """
    expanded_tokens = set(tokens)
    for token in tokens:
        for syn in wordnet.synsets(token):
            lemma_names = (
                lemma.name().replace('_', ' ').lower() for lemma in syn.lemmas()
            )
            expanded_tokens.update(
                name for name in lemma_names
                if name not in stop_words and name.isalpha()
            )
    return list(expanded_tokens)

# دالة تصحيح بسيطة: حذف الكلمات غير الإنجليزية أو غير الأبجدية (مثال توضيحي)
def correct_tokens(tokens):
    """Keep only purely alphabetic tokens longer than one character."""
    return [token for token in tokens if token.isalpha() and len(token) > 1]

# دوال لتجهيز الاستعلامات بأنظمة مختلفة

def process_queries_original(queries_tokens):
    """Baseline system: return the tokenised queries unchanged (same object)."""
    baseline_queries = queries_tokens
    return baseline_queries

def process_queries_with_features(queries_tokens):
    """Enhanced system: filter each query's tokens, then expand them.

    Returns a new dict mapping every query id to
    ``query_expansion(correct_tokens(tokens))``.
    """
    return {
        qid: query_expansion(correct_tokens(tokens))
        for qid, tokens in queries_tokens.items()
    }

# دوال التقييم

def get_qrels(path=QRELS_PATH):
    """Load relevance judgements from a tab-separated qrels file.

    The first line of the file is consumed as a header and the three columns
    are read as query id, document id and relevance. Relevance values that are
    not numeric are treated as 0.

    Args:
        path: location of the qrels file.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict`` maps query id ->
        ``{doc id: relevance}`` and ``relevant_set`` maps query id -> set of
        doc ids whose relevance is greater than 0. All ids are strings.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    relevance = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate column-wise: iterrows() builds a Series per row, which is slow
    # and can upcast mixed numeric columns before the ids are stringified.
    rows = zip(df["query_id"].tolist(), df["doc_id"].tolist(), relevance.tolist())
    for raw_qid, raw_docid, rel in rows:
        qid, docid = str(raw_qid), str(raw_docid)
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    """Drop the scores from ranked results, keeping only the doc ids per query."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        ids_only = []
        for doc_id, _score in scored_docs:
            ids_only.append(doc_id)
        formatted[qid] = ids_only
    return formatted

def calculate_map(qrel, retrieved):
    """Mean average precision, as a percentage.

    Queries without any document judged greater than 0 are skipped; returns 0
    when no query contributes.
    """
    ap_values = []
    for qid, judged in qrel.items():
        relevant_docs = {docid for docid, rel in judged.items() if rel > 0}
        if not relevant_docs:
            continue
        hits = 0
        precision_sum = 0
        for rank, doc_id in enumerate(retrieved.get(qid, []), start=1):
            if doc_id in relevant_docs:
                hits += 1
                precision_sum += hits / rank
        ap_values.append(precision_sum / len(relevant_docs) if hits > 0 else 0)
    return np.mean(ap_values) * 100 if ap_values else 0

def calculate_mrr(qrel, retrieved):
    """Mean reciprocal rank over every query in ``qrel``, as a percentage.

    A query with no relevant document in its ranked list contributes 0;
    returns 0 for an empty ``qrel``.
    """
    rr_list = []
    for qid, judged in qrel.items():
        relevant_docs = {docid for docid, rel in judged.items() if rel > 0}
        ranked = enumerate(retrieved.get(qid, []), start=1)
        first_hit = next((rank for rank, doc_id in ranked if doc_id in relevant_docs), None)
        rr_list.append(1 / first_hit if first_hit is not None else 0)
    return np.mean(rr_list) * 100 if rr_list else 0

def calculate_mean_precision(qrel, retrieved):
    """Mean precision of the full ranked lists, as a percentage.

    Queries with nothing retrieved count as 0; returns 0 for an empty ``qrel``.
    """
    precisions = []
    for qid, judged in qrel.items():
        relevant_docs = {docid for docid, rel in judged.items() if rel > 0}
        retrieved_docs = retrieved.get(qid, [])
        num_retrieved = len(retrieved_docs)
        hits = sum(1 for d in retrieved_docs if d in relevant_docs)
        precisions.append(0 if num_retrieved == 0 else hits / num_retrieved)
    return np.mean(precisions) * 100 if precisions else 0

def calculate_mean_recall(qrel, real_relevant, retrieved):
    """Mean recall (percentage) plus the per-query recalls as fractions.

    Iterates the query ids of ``qrel``; a query with no relevant documents in
    ``real_relevant`` scores 0. Returns ``(0, [])`` for an empty ``qrel``.
    """
    recalls = []
    for qid in qrel:
        relevant_docs = real_relevant.get(qid, set())
        if len(relevant_docs) == 0:
            recalls.append(0)
        else:
            found = sum(1 for d in retrieved.get(qid, []) if d in relevant_docs)
            recalls.append(found / len(relevant_docs))
    if not recalls:
        return 0, []
    return np.mean(recalls) * 100, recalls

def calculate_precision_at_k(qrel, retrieved, k=10):
    """Mean precision@k (percentage) plus the per-query values as fractions.

    Hits among the first ``k`` retrieved documents are divided by ``k``.
    Returns ``(0, [])`` for an empty ``qrel``.
    """
    precisions = []
    for qid, judged in qrel.items():
        relevant_docs = {docid for docid, rel in judged.items() if rel > 0}
        top_docs = retrieved.get(qid, [])[:k]
        hits = sum(1 for d in top_docs if d in relevant_docs)
        precisions.append(hits / k)
    if not precisions:
        return 0, []
    return np.mean(precisions) * 100, precisions

# دالة التقييم العامة للنظام (قبل وبعد)

def evaluate_system(queries_tokens, system_name="Original"):
    """Retrieve, rank and score every query, then print the metrics.

    For each query the candidates come from the inverted index and are ranked
    by TF-IDF cosine similarity (top 10). MAP, MRR, mean precision, mean
    recall and precision@10 are printed under a header naming ``system_name``.
    """
    retrieved_docs_dict = {}
    progress = tqdm(queries_tokens.items(), desc=f"استخراج المستندات للنظام: {system_name}")
    for qid, tokens in progress:
        candidate_ids = retrieve_docs(tokens, inverted_index)
        query_vec = represent_query(tokens)
        retrieved_docs_dict[qid] = rank_docs(query_vec, candidate_ids, X, tfidf_doc_ids, top_k=10)

    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = get_retrieved_docs_formatted(retrieved_docs_dict)

    def report(label, value):
        print(label, round(value, 2), "%")

    print(f"\n== تقييم النظام: {system_name} ==")
    report("MAP:", calculate_map(qrel_dict, retrieved_docs_str))
    report("MRR:", calculate_mrr(qrel_dict, retrieved_docs_str))
    report("Mean Precision:", calculate_mean_precision(qrel_dict, retrieved_docs_str))
    mean_recall, _ = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)
    report("Mean Recall:", mean_recall)
    mean_precision_at_k, _ = calculate_precision_at_k(qrel_dict, retrieved_docs_str)
    report("Mean Precision@10:", mean_precision_at_k)

# === تشغيل المقارنة ===

# Build both query sets from the same tokenised queries: the untouched
# baseline and the filtered + WordNet-expanded variant.
queries_original = process_queries_original(queries_tokens)
queries_with_features = process_queries_with_features(queries_tokens)

# Evaluate each system in turn; each call prints its own metrics block.
evaluate_system(queries_original, "النظام الأصلي (بدون توسعة أو تصحيح)")
evaluate_system(queries_with_features, "النظام مع توسعة الاستعلام وتصحيح التوكنات")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
استخراج المستندات للنظام: النظام الأصلي (بدون توسعة أو تصحيح): 100%|██████████| 5000/5000 [17:35<00:00,  4.74it/s]



== تقييم النظام: النظام الأصلي (بدون توسعة أو تصحيح) ==
MAP: 66.6 %
MRR: 70.66 %
Mean Precision: 11.03 %
Mean Recall: 81.62 %
Mean Precision@10: 10.92 %


استخراج المستندات للنظام: النظام مع توسعة الاستعلام وتصحيح التوكنات: 100%|██████████| 5000/5000 [23:06<00:00,  3.61it/s]



== تقييم النظام: النظام مع توسعة الاستعلام وتصحيح التوكنات ==
MAP: 45.0 %
MRR: 47.94 %
Mean Precision: 8.31 %
Mean Recall: 63.39 %
Mean Precision@10: 8.21 %


In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm textblob
!python -m textblob.download_corpora

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import nltk
import contractions
from textblob import TextBlob

# Download the NLTK resources used by this cell: tokenizer models, stop-word
# list, POS tagger and WordNet.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration (everything lives under the mounted Google Drive).
DATASET_NAME = "quora"
# NOTE(review): the f-prefix below has no placeholders; it is a plain string.
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared by tokenize(): one lemmatizer instance and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

def processing(text: str) -> str:
    """Normalise raw text before tokenisation.

    Non-string input yields "". Otherwise the text is lower-cased, contractions
    are expanded, non-ASCII characters are dropped, then URLs, e-mail-like
    tokens, punctuation, 1-2 digit numbers and 5+ digit numbers are removed and
    whitespace is collapsed.
    """
    if not isinstance(text, str):
        return ""
    cleaned = expand_contractions(text.lower())
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    # Applied strictly in this order; later patterns rely on earlier removals.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def tokenize(text):
    """Tokenise, POS-tag and lemmatise ``text``.

    Stop words and single-character tokens are dropped; each remaining token is
    lemmatised with the WordNet POS derived from its tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run ``processing`` and then ``tokenize`` on ``text``; returns the token list."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly clean ``text`` without tokenising or lemmatising.

    Non-string input yields "". Otherwise: lower-case, remove URLs and
    e-mail-like tokens, drop non-ASCII characters, replace punctuation with
    spaces and collapse whitespace.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_only = without_emails.encode("ascii", errors="ignore").decode()
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    spaced = re.sub(punctuation_class, " ", ascii_only)
    return re.sub(r"\s+", " ", spaced).strip()

# def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
#     cleaned_data = []
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
#         doc_id = row["doc_id"]
#         raw_text = row["text"]
#         processed_tokens = clean_text(raw_text)
#         processed_text = " ".join(processed_tokens)
#         light_text = light_clean(raw_text)
#         cleaned_data.append({
#             "doc_id": doc_id,
#             "text": raw_text,
#             "processed_text": processed_text,
#             "dataset_name": dataset_name,
#             "light_clean_text": light_text
#         })
#     cleaned_df = pd.DataFrame(cleaned_data)
#     cleaned_df.to_csv(path_out, sep="\t", index=False)

def load_dataset(path):
    """Read the documents TSV at ``path`` into a DataFrame.

    The first line of the file is skipped, the columns are named ``doc_id`` and
    ``text``, and missing texts are replaced with "".
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        names=["doc_id", "text"],
        header=None,
        skiprows=1,
    )
    return frame.assign(text=frame["text"].fillna(""))

# Load the raw documents; the cleaned-docs export call below is left disabled.
df = load_dataset(DOCS_PATH)
# save_cleaned_docs(df)

# def build_inverted_index_from_tsv(tsv_path=CLEANED_TSV_PATH):
#     df = pd.read_csv(tsv_path, sep="\t")
#     index = defaultdict(set)
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🔧 بناء الفهرس"):
#         doc_id = str(row["doc_id"])
#         tokens = str(row["processed_text"]).split()
#         for token in tokens:
#             index[token].add(doc_id)
#     os.makedirs(os.path.dirname(INVERTED_PATH), exist_ok=True)
#     with open(INVERTED_PATH, "w", encoding="utf-8") as f:
#         json.dump({k: list(v) for k, v in index.items()}, f, ensure_ascii=False, indent=2)
#     return index

# inverted_index = build_inverted_index_from_tsv()

def load_queries_tokens_from_tsv(path):
    """Read a ``query_id<TAB>tokens`` file into ``{query_id: [token, ...]}``.

    Blank lines and lines with fewer than two tab-separated fields are skipped;
    the second field is split on whitespace and further fields are ignored.
    """
    tokens_dict = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                query_id, token_field = fields[0], fields[1]
                tokens_dict[str(query_id)] = token_field.split()
    return tokens_dict

# Query id -> token list, read from the pre-tokenised queries file.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    """Return the ids of all documents containing at least one query token.

    ``index`` maps a token to its document ids; unknown tokens contribute
    nothing. The result is a de-duplicated list in no particular order.
    """
    matched = set()
    for token in query_tokens:
        postings = index.get(token, [])
        matched.update(postings)
    return list(matched)

def load_inverted_index(path):
    """Load the JSON inverted index stored at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        index = json.load(handle)
    return index

# Token -> document-id postings produced by the indexing step.
inverted_index = load_inverted_index(INVERTED_PATH)

# Load the cached TF-IDF model if it exists, otherwise fit it once and cache it.
# The model file is read at most once; no second joblib.load after this branch.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you created.
# NOTE(review): the cache is not validated against the current corpus or
# preprocessing; delete the file after changing either.
if os.path.exists(tfidf_model_path):
    loaded_model = joblib.load(tfidf_model_path)
else:
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    X = vectorizer.fit_transform(df["text"])
    loaded_model = {
        "vectorizer": vectorizer,
        "vectors": X,
        "doc_ids": df["doc_id"].astype(str).tolist(),
    }
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump(loaded_model, tfidf_model_path)

# Both names stay bound to the same dict for code that refers to either one.
tfidf_data = loaded_model
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    """Join ``tokens`` with spaces and transform the string with the fitted vectorizer."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10, id_to_idx=None):
    """Rank candidate documents by cosine similarity to the query vector.

    Args:
        query_vec: query vector produced by the fitted vectorizer.
        retrieved_ids: candidate document ids to rank.
        doc_vectors: matrix whose row ``i`` is the vector of ``doc_ids[i]``.
        doc_ids: document ids aligned with the rows of ``doc_vectors``.
        top_k: maximum number of results to return.
        id_to_idx: optional precomputed ``{doc_id: row}`` mapping for
            ``doc_ids``. Building it scans every id, so callers that rank many
            queries can build it once and pass it in.

    Returns:
        List of ``(doc_id, similarity)`` pairs, best first; empty when none of
        the candidates is present in ``doc_ids``.
    """
    if id_to_idx is None:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # Similarity positions refer to the *filtered* candidates, so keep that list:
    # indexing retrieved_ids directly mislabels results once any id is dropped.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    indices = [id_to_idx[doc_id] for doc_id in known_ids]
    subset_vectors = doc_vectors[indices]
    sims = cosine_similarity(query_vec, subset_vectors).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]

# === ✅ ميزة تصحيح الاستعلام فقط ===

def correct_query_spelling(tokens):
    """Spell-correct each token independently with TextBlob.

    A token is kept unchanged when the correction comes back empty.
    """
    def _corrected(word):
        suggestion = str(TextBlob(word).correct())
        return suggestion or word

    return [_corrected(word) for word in tokens]

def process_queries_with_correction_only(queries_tokens):
    """Return ``{query_id: spell-corrected tokens}`` for every query in ``queries_tokens``."""
    return {
        qid: correct_query_spelling(tokens)
        for qid, tokens in queries_tokens.items()
    }

# === ✅ التقييم ===

def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from the qrels TSV at ``path``.

    The file's first row is replaced by the column names ``query_id``,
    ``doc_id`` and ``relevance``; non-numeric relevance values become 0.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict[qid][doc_id]`` is the
        relevance and ``relevant_set[qid]`` holds the doc ids with relevance > 0.
        Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    judgments["relevance"] = (
        pd.to_numeric(judgments["relevance"], errors="coerce").fillna(0).astype(int)
    )
    qrel_dict, relevant_set = defaultdict(dict), defaultdict(set)
    for _, record in judgments.iterrows():
        query_id, doc_id = str(record["query_id"]), str(record["doc_id"])
        grade = record["relevance"]
        qrel_dict[query_id][doc_id] = grade
        if grade > 0:
            relevant_set[query_id].add(doc_id)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    """Strip the scores: ``{qid: [(doc_id, score), ...]}`` -> ``{qid: [doc_id, ...]}``."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted

def calculate_map(qrel, retrieved):
    """Mean average precision over the queries in ``qrel``, as a percentage.

    Queries without any relevant document are skipped. Average precision is the
    sum of precision values at each relevant hit divided by the number of
    relevant documents. Returns 0 when no query qualifies.
    """
    ap_values = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        if not relevant:
            continue
        hits = 0
        precision_total = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc in relevant:
                hits += 1
                precision_total += hits / rank
        ap_values.append(precision_total / len(relevant) if hits > 0 else 0)
    if not ap_values:
        return 0
    return np.mean(ap_values) * 100

def calculate_mrr(qrel, retrieved):
    """Mean reciprocal rank over every query in ``qrel``, as a percentage.

    A query with no relevant document in its ranked list contributes 0.
    Returns 0 when ``qrel`` is empty.
    """
    reciprocal_ranks = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        ranked = retrieved.get(qid, [])
        first_hit = next(
            (rank for rank, doc in enumerate(ranked, start=1) if doc in relevant),
            None,
        )
        reciprocal_ranks.append(0 if first_hit is None else 1 / first_hit)
    if not reciprocal_ranks:
        return 0
    return np.mean(reciprocal_ranks) * 100

def calculate_mean_precision(qrel, retrieved):
    """Mean precision over every query in ``qrel``, as a percentage.

    Precision is relevant-retrieved / retrieved; a query with nothing retrieved
    contributes 0. Returns 0 when ``qrel`` is empty.
    """
    per_query = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        ranked = retrieved.get(qid, [])
        if len(ranked) == 0:
            per_query.append(0)
            continue
        hits = len([doc for doc in ranked if doc in relevant])
        per_query.append(hits / len(ranked))
    if not per_query:
        return 0
    return np.mean(per_query) * 100

def calculate_mean_recall(qrel, real_relevant, retrieved):
    """Mean recall over every query in ``qrel``.

    ``real_relevant`` maps a query id to its set of relevant doc ids; a query
    with no relevant documents contributes 0.

    Returns:
        ``(mean_recall_percent, per_query_recalls)``; the mean is 0 when
        ``qrel`` is empty.
    """
    recalls = []
    for qid in qrel:
        relevant = real_relevant.get(qid, set())
        ranked = retrieved.get(qid, [])
        if len(relevant) == 0:
            recalls.append(0)
        else:
            found = len([doc for doc in ranked if doc in relevant])
            recalls.append(found / len(relevant))
    mean_recall = np.mean(recalls) * 100 if recalls else 0
    return mean_recall, recalls

def calculate_precision_at_k(qrel, retrieved, k=10):
    """Mean precision@k over every query in ``qrel``.

    The hit count in the first ``k`` results is always divided by ``k``, even
    when fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision_percent, per_query_precisions)``; the mean is 0 when
        ``qrel`` is empty.
    """
    precisions = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        top_results = retrieved.get(qid, [])[:k]
        hits = len([doc for doc in top_results if doc in relevant])
        precisions.append(hits / k)
    if not precisions:
        return 0, precisions
    return np.mean(precisions) * 100, precisions

# === ✅ Run the system with spelling correction only ===

# Spell-corrected copy of every query's token list.
queries_with_correction = process_queries_with_correction_only(queries_tokens)

def evaluate_system(queries_tokens, system_name="With Spelling Correction Only"):
    """Retrieve and rank documents for every query, then print the metric report.

    Candidates come from the inverted index and are ranked with ``rank_docs``
    (top 10). The ranked ids are scored against the qrels and MAP, MRR, mean
    precision, mean recall and precision@10 are printed as percentages rounded
    to two decimals. Nothing is returned.
    """
    retrieved_docs_dict = {}
    progress = tqdm(queries_tokens.items(), desc=f"استخراج المستندات للنظام: {system_name}")
    for qid, tokens in progress:
        candidates = retrieve_docs(tokens, inverted_index)
        retrieved_docs_dict[qid] = rank_docs(
            represent_query(tokens), candidates, X, tfidf_doc_ids, top_k=10
        )

    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = get_retrieved_docs_formatted(retrieved_docs_dict)

    print(f"\n== تقييم النظام: {system_name} ==")
    map_score = calculate_map(qrel_dict, retrieved_docs_str)
    print("MAP:", round(map_score, 2), "%")
    mrr_score = calculate_mrr(qrel_dict, retrieved_docs_str)
    print("MRR:", round(mrr_score, 2), "%")
    mean_precision = calculate_mean_precision(qrel_dict, retrieved_docs_str)
    print("Mean Precision:", round(mean_precision, 2), "%")
    mean_recall = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)[0]
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k = calculate_precision_at_k(qrel_dict, retrieved_docs_str)[0]
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# Run the evaluation (the label is the system name passed to evaluate_system)
evaluate_system(queries_with_correction, "النظام مع التصحيح فقط")


INFO: pip is looking at multiple versions of textblob to determine which version is compatible with other requirements. This could take a while.
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: textblob
  Attempting uninstall: textblob
    Found existing installation: textblob 0.19.0
    Uninstalling textblob-0.19.0:
      Successfully uninstalled textblob-0.19.0
Successfully installed textblob-0.18.0.post0
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
استخراج المستندات للنظام: النظام مع التصحيح فقط: 100%|██████████| 5000/5000 [18:48<00:00,  4.43it/s]



== تقييم النظام: النظام مع التصحيح فقط ==
MAP: 51.31 %
MRR: 54.71 %
Mean Precision: 8.91 %
Mean Recall: 66.54 %
Mean Precision@10: 8.84 %


In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm
!pip install country_converter --upgrade
!pip install datefinder

# استيراد المكتبات
import os, re, string, json, joblib
import numpy as np, pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from tqdm import tqdm
import nltk
import contractions

# Download the NLTK resources used by this cell: tokenizer models, stop-word
# list, POS tagger and WordNet.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration (everything lives under the mounted Google Drive).
DATASET_NAME = "quora"
# NOTE(review): the f-prefix below has no placeholders; it is a plain string.
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared by tokenize(): one lemmatizer instance and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

def processing(text: str) -> str:
    """Normalise raw text before tokenisation.

    Non-string input yields "". Otherwise the text is lower-cased, contractions
    are expanded, non-ASCII characters are dropped, then URLs, e-mail-like
    tokens, punctuation, 1-2 digit numbers and 5+ digit numbers are removed and
    whitespace is collapsed.
    """
    if not isinstance(text, str):
        return ""
    cleaned = expand_contractions(text.lower())
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    # Applied strictly in this order; later patterns rely on earlier removals.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def tokenize(text):
    """Tokenise, POS-tag and lemmatise ``text``.

    Stop words and single-character tokens are dropped; each remaining token is
    lemmatised with the WordNet POS derived from its tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run ``processing`` and then ``tokenize`` on ``text``; returns the token list."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly clean ``text`` without tokenising or lemmatising.

    Non-string input yields "". Otherwise: lower-case, remove URLs and
    e-mail-like tokens, drop non-ASCII characters, replace punctuation with
    spaces and collapse whitespace.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_only = without_emails.encode("ascii", errors="ignore").decode()
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    spaced = re.sub(punctuation_class, " ", ascii_only)
    return re.sub(r"\s+", " ", spaced).strip()

# def save_cleaned_docs(df, dataset_name=DATASET_NAME, path_out=CLEANED_TSV_PATH):
#     cleaned_data = []
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🧼 تنظيف وتخزين المستندات"):
#         doc_id = row["doc_id"]
#         raw_text = row["text"]
#         processed_tokens = clean_text(raw_text)
#         processed_text = " ".join(processed_tokens)
#         light_text = light_clean(raw_text)
#         cleaned_data.append({
#             "doc_id": doc_id,
#             "text": raw_text,
#             "processed_text": processed_text,
#             "dataset_name": dataset_name,
#             "light_clean_text": light_text
#         })
#     cleaned_df = pd.DataFrame(cleaned_data)
#     cleaned_df.to_csv(path_out, sep="\t", index=False)

def load_dataset(path):
    """Read the documents TSV at ``path`` into a DataFrame.

    The first line of the file is skipped, the columns are named ``doc_id`` and
    ``text``, and missing texts are replaced with "".
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        names=["doc_id", "text"],
        header=None,
        skiprows=1,
    )
    return frame.assign(text=frame["text"].fillna(""))

# Load the raw documents; the cleaned-docs export call below is left disabled.
df = load_dataset(DOCS_PATH)
# save_cleaned_docs(df)

# def build_inverted_index_from_tsv(tsv_path=CLEANED_TSV_PATH):
#     df = pd.read_csv(tsv_path, sep="\t")
#     index = defaultdict(set)
#     for _, row in tqdm(df.iterrows(), total=len(df), desc="🔧 بناء الفهرس"):
#         doc_id = str(row["doc_id"])
#         tokens = str(row["processed_text"]).split()
#         for token in tokens:
#             index[token].add(doc_id)
#     os.makedirs(os.path.dirname(INVERTED_PATH), exist_ok=True)
#     with open(INVERTED_PATH, "w", encoding="utf-8") as f:
#         json.dump({k: list(v) for k, v in index.items()}, f, ensure_ascii=False, indent=2)
#     return index

# inverted_index = build_inverted_index_from_tsv()

def load_queries_tokens_from_tsv(path):
    """Read a ``query_id<TAB>tokens`` file into ``{query_id: [token, ...]}``.

    Blank lines and lines with fewer than two tab-separated fields are skipped;
    the second field is split on whitespace and further fields are ignored.
    """
    tokens_dict = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                query_id, token_field = fields[0], fields[1]
                tokens_dict[str(query_id)] = token_field.split()
    return tokens_dict

# Query id -> token list, read from the pre-tokenised queries file.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    """Return the ids of all documents containing at least one query token.

    ``index`` maps a token to its document ids; unknown tokens contribute
    nothing. The result is a de-duplicated list in no particular order.
    """
    matched = set()
    for token in query_tokens:
        postings = index.get(token, [])
        matched.update(postings)
    return list(matched)

def load_inverted_index(path):
    """Load the JSON inverted index stored at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        index = json.load(handle)
    return index

# Token -> document-id postings produced by the indexing step.
inverted_index = load_inverted_index(INVERTED_PATH)

# Register tqdm's pandas integration with the progress-bar label used in this cell.
tqdm.pandas(desc=">> تجهيز النصوص")

# Load the cached TF-IDF model if it exists, otherwise fit it once and cache it.
# The model file is read at most once; no second joblib.load after this branch.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you created.
# NOTE(review): the cache is not validated against the current corpus or
# preprocessing; delete the file after changing either.
if os.path.exists(tfidf_model_path):
    loaded_model = joblib.load(tfidf_model_path)
else:
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    X = vectorizer.fit_transform(df["text"])
    loaded_model = {
        "vectorizer": vectorizer,
        "vectors": X,
        "doc_ids": df["doc_id"].astype(str).tolist(),
    }
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump(loaded_model, tfidf_model_path)

# Both names stay bound to the same dict for code that refers to either one.
tfidf_data = loaded_model
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    """Join ``tokens`` with spaces and transform the string with the fitted vectorizer."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10, id_to_idx=None):
    """Rank candidate documents by cosine similarity to the query vector.

    Args:
        query_vec: query vector produced by the fitted vectorizer.
        retrieved_ids: candidate document ids to rank.
        doc_vectors: matrix whose row ``i`` is the vector of ``doc_ids[i]``.
        doc_ids: document ids aligned with the rows of ``doc_vectors``.
        top_k: maximum number of results to return.
        id_to_idx: optional precomputed ``{doc_id: row}`` mapping for
            ``doc_ids``. Building it scans every id, so callers that rank many
            queries can build it once and pass it in.

    Returns:
        List of ``(doc_id, similarity)`` pairs, best first; empty when none of
        the candidates is present in ``doc_ids``.
    """
    if id_to_idx is None:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # Similarity positions refer to the *filtered* candidates, so keep that list:
    # indexing retrieved_ids directly mislabels results once any id is dropped.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    indices = [id_to_idx[doc_id] for doc_id in known_ids]
    subset_vectors = doc_vectors[indices]
    sims = cosine_similarity(query_vec, subset_vectors).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]

# === ✅ ميزة توسعة الاستعلام فقط ===

def query_expansion(tokens):
    """Expand ``tokens`` with WordNet lemma names of every synset of each token.

    A lemma name is added (lower-cased, underscores replaced by spaces) only if
    it is purely alphabetic and not a stop word. The result is a de-duplicated
    list in no particular order that always contains the original tokens.
    """
    expanded = set(tokens)
    for token in tokens:
        for synset in wordnet.synsets(token):
            candidate_names = (
                lemma.name().replace('_', ' ').lower() for lemma in synset.lemmas()
            )
            expanded.update(
                name
                for name in candidate_names
                if name.isalpha() and name not in stop_words
            )
    return list(expanded)

def process_queries_with_expansion_only(queries_tokens):
    """Return ``{query_id: expanded tokens}`` for every query in ``queries_tokens``."""
    return {
        qid: query_expansion(tokens)
        for qid, tokens in queries_tokens.items()
    }

# === ✅ التقييم ===

def get_qrels(path=QRELS_PATH):
    """Load relevance judgments from the qrels TSV at ``path``.

    The file's first row is replaced by the column names ``query_id``,
    ``doc_id`` and ``relevance``; non-numeric relevance values become 0.

    Returns:
        ``(qrel_dict, relevant_set)`` where ``qrel_dict[qid][doc_id]`` is the
        relevance and ``relevant_set[qid]`` holds the doc ids with relevance > 0.
        Ids are strings.
    """
    judgments = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    judgments["relevance"] = (
        pd.to_numeric(judgments["relevance"], errors="coerce").fillna(0).astype(int)
    )
    qrel_dict, relevant_set = defaultdict(dict), defaultdict(set)
    for _, record in judgments.iterrows():
        query_id, doc_id = str(record["query_id"]), str(record["doc_id"])
        grade = record["relevance"]
        qrel_dict[query_id][doc_id] = grade
        if grade > 0:
            relevant_set[query_id].add(doc_id)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    """Strip the scores: ``{qid: [(doc_id, score), ...]}`` -> ``{qid: [doc_id, ...]}``."""
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted

def calculate_map(qrel, retrieved):
    """Mean average precision over the queries in ``qrel``, as a percentage.

    Queries without any relevant document are skipped. Average precision is the
    sum of precision values at each relevant hit divided by the number of
    relevant documents. Returns 0 when no query qualifies.
    """
    ap_values = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        if not relevant:
            continue
        hits = 0
        precision_total = 0
        for rank, doc in enumerate(retrieved.get(qid, []), start=1):
            if doc in relevant:
                hits += 1
                precision_total += hits / rank
        ap_values.append(precision_total / len(relevant) if hits > 0 else 0)
    return np.mean(ap_values) * 100 if ap_values else 0

def calculate_mrr(qrel, retrieved):
    """Mean reciprocal rank over every query in ``qrel``, as a percentage.

    A query with no relevant document in its ranked list contributes 0.
    Returns 0 when ``qrel`` is empty.
    """
    reciprocal_ranks = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        ranked = retrieved.get(qid, [])
        first_hit = next(
            (rank for rank, doc in enumerate(ranked, start=1) if doc in relevant),
            None,
        )
        reciprocal_ranks.append(0 if first_hit is None else 1 / first_hit)
    return np.mean(reciprocal_ranks) * 100 if reciprocal_ranks else 0

def calculate_mean_precision(qrel, retrieved):
    """Mean precision over every query in ``qrel``, as a percentage.

    Precision is relevant-retrieved / retrieved; a query with nothing retrieved
    contributes 0. Returns 0 when ``qrel`` is empty.
    """
    per_query = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        ranked = retrieved.get(qid, [])
        hits = len([doc for doc in ranked if doc in relevant])
        per_query.append(hits / len(ranked) if len(ranked) != 0 else 0)
    return np.mean(per_query) * 100 if per_query else 0

def calculate_mean_recall(qrel, real_relevant, retrieved):
    """Mean recall over every query in ``qrel``.

    ``real_relevant`` maps a query id to its set of relevant doc ids; a query
    with no relevant documents contributes 0.

    Returns:
        ``(mean_recall_percent, per_query_recalls)``; ``(0, [])`` when ``qrel``
        is empty.
    """
    recalls = []
    for qid in qrel:
        relevant = real_relevant.get(qid, set())
        ranked = retrieved.get(qid, [])
        if len(relevant) == 0:
            recalls.append(0)
        else:
            found = len([doc for doc in ranked if doc in relevant])
            recalls.append(found / len(relevant))
    if not recalls:
        return 0, []
    return np.mean(recalls) * 100, recalls

def calculate_precision_at_k(qrel, retrieved, k=10):
    """Mean precision@k over every query in ``qrel``.

    The hit count in the first ``k`` results is always divided by ``k``, even
    when fewer than ``k`` documents were retrieved.

    Returns:
        ``(mean_precision_percent, per_query_precisions)``; ``(0, [])`` when
        ``qrel`` is empty.
    """
    precisions = []
    for qid, judgments in qrel.items():
        relevant = {doc for doc, grade in judgments.items() if grade > 0}
        top_results = retrieved.get(qid, [])[:k]
        hits = len([doc for doc in top_results if doc in relevant])
        precisions.append(hits / k)
    if not precisions:
        return 0, []
    return np.mean(precisions) * 100, precisions

# === ✅ Run the system with query expansion only ===

# WordNet-expanded copy of every query's token list.
queries_with_expansion = process_queries_with_expansion_only(queries_tokens)

def evaluate_system(queries_tokens, system_name="With Expansion Only"):
    """Retrieve and rank documents for every query, then print the metric report.

    Candidates come from the inverted index and are ranked with ``rank_docs``
    (top 10). The ranked ids are scored against the qrels and MAP, MRR, mean
    precision, mean recall and precision@10 are printed as percentages rounded
    to two decimals. Nothing is returned.
    """
    retrieved_docs_dict = {}
    progress = tqdm(queries_tokens.items(), desc=f"استخراج المستندات للنظام: {system_name}")
    for qid, tokens in progress:
        candidates = retrieve_docs(tokens, inverted_index)
        retrieved_docs_dict[qid] = rank_docs(
            represent_query(tokens), candidates, X, tfidf_doc_ids, top_k=10
        )

    qrel_dict, real_relevant = get_qrels()
    retrieved_docs_str = get_retrieved_docs_formatted(retrieved_docs_dict)

    print(f"\n== تقييم النظام: {system_name} ==")
    map_score = calculate_map(qrel_dict, retrieved_docs_str)
    print("MAP:", round(map_score, 2), "%")
    mrr_score = calculate_mrr(qrel_dict, retrieved_docs_str)
    print("MRR:", round(mrr_score, 2), "%")
    mean_precision = calculate_mean_precision(qrel_dict, retrieved_docs_str)
    print("Mean Precision:", round(mean_precision, 2), "%")
    mean_recall = calculate_mean_recall(qrel_dict, real_relevant, retrieved_docs_str)[0]
    print("Mean Recall:", round(mean_recall, 2), "%")
    mean_precision_at_k = calculate_precision_at_k(qrel_dict, retrieved_docs_str)[0]
    print("Mean Precision@10:", round(mean_precision_at_k, 2), "%")

# Run the evaluation (the label is the system name passed to evaluate_system)
evaluate_system(queries_with_expansion, "النظام مع توسعة الاستعلام فقط")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
استخراج المستندات للنظام: النظام مع توسعة الاستعلام فقط: 100%|██████████| 5000/5000 [26:46<00:00,  3.11it/s]



== تقييم النظام: النظام مع توسعة الاستعلام فقط ==
MAP: 46.03 %
MRR: 49.04 %
Mean Precision: 8.47 %
Mean Recall: 64.52 %
Mean Precision@10: 8.37 %


In [None]:
# تثبيت الحزم
!pip install nltk==3.8.1 contractions scikit-learn tqdm fuzzywuzzy[speedup]
!pip install country_converter --upgrade
!pip install datefinder

# استيراد المكتبات
import os, re, string, json, joblib
from collections import defaultdict, Counter
import numpy as np, pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import nltk
import contractions
from fuzzywuzzy import process, fuzz

# Download the NLTK resources used by this cell: tokenizer models, stop-word
# list, POS tagger and WordNet.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Path configuration (everything lives under the mounted Google Drive).
DATASET_NAME = "quora"
# NOTE(review): the f-prefix below has no placeholders; it is a plain string.
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared by tokenize(): one lemmatizer instance and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

def processing(text: str) -> str:
    """Normalise raw text before tokenisation.

    Non-string input yields "". Otherwise the text is lower-cased, contractions
    are expanded, non-ASCII characters are dropped, then URLs, e-mail-like
    tokens, punctuation, 1-2 digit numbers and 5+ digit numbers are removed.

    The result is whitespace-normalised (runs collapsed, ends stripped) so this
    definition returns the same string as the other ``processing`` definitions
    in this notebook; without it the removals above leave stray runs of spaces.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = expand_contractions(text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\b\d{1,2}\b", "", text)
    text = re.sub(r"\b\d{5,}\b", "", text)
    # Collapse the gaps left by the removals above and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    """Tokenise, POS-tag and lemmatise ``text``.

    Stop words and single-character tokens are dropped; each remaining token is
    lemmatised with the WordNet POS derived from its tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run ``processing`` and then ``tokenize`` on ``text``; returns the token list."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightly clean ``text`` without tokenising or lemmatising.

    Non-string input yields "". Otherwise: lower-case, remove URLs and
    e-mail-like tokens, drop non-ASCII characters, replace punctuation with
    spaces and collapse whitespace.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    without_emails = re.sub(r"\S+@\S+", "", without_urls)
    ascii_only = without_emails.encode("ascii", errors="ignore").decode()
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    spaced = re.sub(punctuation_class, " ", ascii_only)
    return re.sub(r"\s+", " ", spaced).strip()

def load_dataset(path):
    """Read the documents TSV at ``path`` into a DataFrame.

    The first line of the file is skipped, the columns are named ``doc_id`` and
    ``text``, and missing texts are replaced with "".
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        names=["doc_id", "text"],
        header=None,
        skiprows=1,
    )
    return frame.assign(text=frame["text"].fillna(""))

# Raw documents (doc_id, text); used to fit the TF-IDF model when no cache exists.
df = load_dataset(DOCS_PATH)

def load_queries_tokens_from_tsv(path):
    """Read a ``query_id<TAB>tokens`` file into ``{query_id: [token, ...]}``.

    Blank lines and lines with fewer than two tab-separated fields are skipped;
    the second field is split on whitespace and further fields are ignored.
    """
    tokens_dict = {}
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                query_id, token_field = fields[0], fields[1]
                tokens_dict[str(query_id)] = token_field.split()
    return tokens_dict

# Query id -> token list, read from the pre-tokenised queries file.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    """Return the ids of all documents containing at least one query token.

    ``index`` maps a token to its document ids; unknown tokens contribute
    nothing. The result is a de-duplicated list in no particular order.
    """
    matched = set()
    for token in query_tokens:
        postings = index.get(token, [])
        matched.update(postings)
    return list(matched)

def load_inverted_index(path):
    """Load the JSON inverted index stored at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        index = json.load(handle)
    return index

# Token -> document-id postings produced by the indexing step.
inverted_index = load_inverted_index(INVERTED_PATH)

# Register tqdm's pandas integration with the progress-bar label used in this cell.
tqdm.pandas(desc=">> تجهيز النصوص")

# Load the cached TF-IDF model if it exists, otherwise fit it once and cache it.
# The model file is read at most once; no second joblib.load after this branch.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you created.
# NOTE(review): the cache is not validated against the current corpus or
# preprocessing; delete the file after changing either.
if os.path.exists(tfidf_model_path):
    loaded_model = joblib.load(tfidf_model_path)
else:
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    X = vectorizer.fit_transform(df["text"])
    loaded_model = {
        "vectorizer": vectorizer,
        "vectors": X,
        "doc_ids": df["doc_id"].astype(str).tolist(),
    }
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    joblib.dump(loaded_model, tfidf_model_path)

# Both names stay bound to the same dict for code that refers to either one.
tfidf_data = loaded_model
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    """Join ``tokens`` with spaces and transform the string with the fitted vectorizer."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10, id_to_idx=None):
    """Rank candidate documents by cosine similarity to the query vector.

    Args:
        query_vec: Query representation accepted by `cosine_similarity`.
        retrieved_ids: Candidate document ids to score.
        doc_vectors: Row-indexable matrix of document vectors.
        doc_ids: Document ids aligned with the rows of `doc_vectors`.
        top_k: Maximum number of results to return.
        id_to_idx: Optional precomputed ``{doc_id: row}`` mapping for
            `doc_ids`. Pass it when ranking many queries so the mapping is
            not rebuilt on every call.

    Returns:
        List of ``(doc_id, similarity)`` pairs, best first; empty when none
        of the candidates is present in `doc_ids`.
    """
    if id_to_idx is None:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # Keep ids and rows in lockstep: the similarity positions refer to the
    # filtered candidates, so indexing the unfiltered `retrieved_ids` would
    # mislabel every result after the first unknown id.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    rows = [id_to_idx[doc_id] for doc_id in known_ids]
    sims = cosine_similarity(query_vec, doc_vectors[rows]).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]

# ===== Integrate QueryRefiner with query expansion =====

# Read the preprocessed documents
processed_docs_df = pd.read_csv(CLEANED_TSV_PATH, sep="\t")

# Collect every whitespace-separated term from the processed_text column.
# Each cell goes through str() first, so a missing value contributes its
# string form as a term.
all_processed_terms = [
    term
    for doc in processed_docs_df["processed_text"]
    for term in str(doc).split()
]

# Term -> occurrence count, and the set of distinct terms.
term_frequencies = Counter(all_processed_terms)
processed_terms_set = set(term_frequencies)

class QueryRefiner:
    """Spell-correct and expand queries against the corpus vocabulary.

    Correction uses fuzzy string matching against the known terms, and
    expansion adds WordNet synonyms that also occur in the vocabulary.
    """

    def __init__(self, processed_terms):
        """Build the vocabulary structures from an iterable of corpus terms.

        Args:
            processed_terms: Iterable of terms (duplicates allowed); used for
                both the frequency counter and the distinct-term set.
        """
        self.term_frequencies = Counter(processed_terms)
        self.processed_terms = set(processed_terms)
        self.stop_words = set(stopwords.words("english"))

    def reduce_repeated_letters(self, word):
        """Collapse any character repeated three or more times into one."""
        return re.sub(r'(.)\1{2,}', r'\1', word)

    def suggest_correction(self, query):
        """Return a corrected version of `query`, or None when nothing changes.

        Stop words and punctuation-only tokens are kept as typed. Other words
        are lowercased, de-elongated and replaced by the closest vocabulary
        term when its fuzzy ratio exceeds 85.
        """
        words = query.split()
        corrected = []
        for word in words:
            lw = self.reduce_repeated_letters(word.lower())
            if lw in self.stop_words or all(c in string.punctuation for c in lw):
                corrected.append(word)
                continue
            if lw in self.processed_terms:
                # Exact hit: skip the scan over the whole vocabulary.
                # NOTE(review): assumes no other term ties at score 100 - confirm.
                corrected.append(lw)
                continue
            best_match = process.extractOne(lw, self.processed_terms, scorer=fuzz.ratio)
            if best_match and best_match[1] > 85:
                corrected.append(best_match[0])
            else:
                corrected.append(word)
        corrected_query = ' '.join(corrected)
        if corrected_query.lower() != query.lower():
            return corrected_query
        return None

    def get_synonyms(self, word):
        """Return WordNet synonyms of `word` that exist in the vocabulary.

        Only alphabetic lemma names longer than two characters and different
        from the word itself are kept, without duplicates, in the order in
        which WordNet yields them.
        """
        target = word.lower()
        synonyms = {}
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                name = lemma.name().lower()
                if (
                    name != target
                    and name in self.processed_terms
                    and len(name) > 2
                    and name.isalpha()
                ):
                    synonyms.setdefault(name, None)
        return list(synonyms)

    def expand_query(self, query):
        """Return the query's known words plus their synonyms, space-joined.

        Words are corrected individually; only words found in the vocabulary
        are kept and expanded. Terms are emitted in first-seen order so the
        result is reproducible between runs (joining a set was not). The
        original query is returned when no word is known.
        """
        expanded_terms = {}
        for word in query.split():
            corrected_word = self.suggest_correction(word)
            base_word = corrected_word if corrected_word is not None else word
            base_word = base_word.lower()
            if base_word in self.processed_terms:
                expanded_terms.setdefault(base_word, None)
                for synonym in self.get_synonyms(base_word):
                    expanded_terms.setdefault(synonym, None)
        if not expanded_terms:
            return query
        return ' '.join(expanded_terms)

# Create the shared QueryRefiner instance from the corpus terms
query_refiner = QueryRefiner(all_processed_terms)

# تعديل دالة توسعة الاستعلام لتستخدم QueryRefiner
def process_queries_with_expansion_refiner(queries_tokens):
    """Expand every tokenised query with the module-level `query_refiner`.

    Each token list is joined into a string, passed to
    `query_refiner.expand_query`, and the result is split back into tokens.
    Returns a new dict keyed by the same query ids.
    """
    return {
        qid: query_refiner.expand_query(" ".join(tokens)).split()
        for qid, tokens in queries_tokens.items()
    }

# === تقييم النظام مع توسعة الاستعلام بواسطة QueryRefiner ===

def get_qrels(path=QRELS_PATH):
    """Load relevance judgements from a tab-separated qrels file.

    The file's first line is treated as a header and replaced by the column
    names `query_id`, `doc_id` and `relevance`. Relevance values that cannot
    be parsed as numbers count as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: `qrel_dict` maps query id to
        ``{doc_id: relevance}``, and `relevant_set` maps query id to the set
        of doc ids whose relevance is above 0. Ids are strings.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate the columns directly: iterrows() builds a Series per row and can
    # upcast integer ids to float when another column is float.
    rows = zip(
        df["query_id"].astype(str),
        df["doc_id"].astype(str),
        df["relevance"].tolist(),
    )
    for qid, docid, rel in rows:
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    """Strip the scores, keeping the ordered document ids for each query.

    `retrieved` maps a query id to a list of ``(doc_id, score)`` pairs.
    """
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted

def calculate_map(qrel, retrieved):
    """Mean average precision over the judged queries, as a percentage.

    Queries with no relevant document are skipped. For the others, the
    precision at each relevant hit is summed and divided by the number of
    relevant documents. Returns 0 when no query qualifies.
    """
    ap_values = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        if not relevant:
            continue
        hits = 0
        precision_total = 0
        for rank, doc_id in enumerate(retrieved.get(qid, []), start=1):
            if doc_id not in relevant:
                continue
            hits += 1
            precision_total += hits / rank
        ap_values.append(precision_total / len(relevant) if hits > 0 else 0)
    if not ap_values:
        return 0
    return np.mean(ap_values) * 100

def calculate_mrr(qrel, retrieved):
    """Mean reciprocal rank over all judged queries, as a percentage.

    A query scores 1/rank of its first relevant result, or 0 when none of
    its results is relevant. Returns 0 for an empty `qrel`.
    """
    reciprocal_ranks = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        ranking = retrieved.get(qid, [])
        first_hit = next(
            (1 / rank for rank, doc_id in enumerate(ranking, start=1) if doc_id in relevant),
            0,
        )
        reciprocal_ranks.append(first_hit)
    if not reciprocal_ranks:
        return 0
    return np.mean(reciprocal_ranks) * 100

def calculate_mean_precision(qrel, retrieved):
    """Mean precision over all judged queries, as a percentage.

    Precision is the share of a query's returned documents that are
    relevant; a query with no returned documents scores 0. Returns 0 for an
    empty `qrel`.
    """
    per_query = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        ranking = retrieved.get(qid, [])
        if len(ranking) == 0:
            per_query.append(0)
            continue
        hits = sum(1 for doc_id in ranking if doc_id in relevant)
        per_query.append(hits / len(ranking))
    if not per_query:
        return 0
    return np.mean(per_query) * 100

def calculate_mean_recall(qrel, real_relevant, retrieved):
    """Mean recall over all judged queries.

    `real_relevant` maps a query id to its set of relevant doc ids. A query
    with no relevant documents scores 0.

    Returns:
        Tuple ``(mean recall as a percentage, per-query recall list)``, or
        ``(0, [])`` for an empty `qrel`.
    """
    per_query = []
    for qid in qrel:
        relevant = real_relevant.get(qid, set())
        if len(relevant) == 0:
            per_query.append(0)
            continue
        ranking = retrieved.get(qid, [])
        found = sum(1 for doc_id in ranking if doc_id in relevant)
        per_query.append(found / len(relevant))
    if not per_query:
        return 0, []
    return np.mean(per_query) * 100, per_query

def calculate_precision_at_k(qrel, retrieved, k=10):
    """Mean precision@k over all judged queries.

    Only the first `k` results of each query are considered, and the hit
    count is always divided by `k`, even when fewer were returned.

    Returns:
        Tuple ``(mean precision@k as a percentage, per-query values)``, or
        ``(0, [])`` for an empty `qrel`.
    """
    per_query = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        top_results = retrieved.get(qid, [])[:k]
        hits = sum(1 for doc_id in top_results if doc_id in relevant)
        per_query.append(hits / k)
    if not per_query:
        return 0, []
    return np.mean(per_query) * 100, per_query

# Expand all queries with QueryRefiner before running the system
queries_with_expansion = process_queries_with_expansion_refiner(queries_tokens)

def evaluate_system(queries_tokens, system_name="With QueryRefiner Expansion"):
    """Retrieve and rank documents for every query, then print the metrics.

    For each query the candidates come from the inverted index and are
    ranked against the TF-IDF matrix (top 10 kept). MAP, MRR, mean
    precision, mean recall and mean precision@10 are printed as percentages
    rounded to two decimals. Nothing is returned.
    """
    rankings = {}
    progress = tqdm(queries_tokens.items(), desc=f"استخراج المستندات للنظام: {system_name}")
    for qid, tokens in progress:
        candidates = retrieve_docs(tokens, inverted_index)
        rankings[qid] = rank_docs(represent_query(tokens), candidates, X, tfidf_doc_ids, top_k=10)

    qrels, relevant_by_query = get_qrels()
    ranked_ids = get_retrieved_docs_formatted(rankings)

    print(f"\n== تقييم النظام: {system_name} ==")
    map_score = calculate_map(qrels, ranked_ids)
    print("MAP:", round(map_score, 2), "%")
    mrr_score = calculate_mrr(qrels, ranked_ids)
    print("MRR:", round(mrr_score, 2), "%")
    precision_score = calculate_mean_precision(qrels, ranked_ids)
    print("Mean Precision:", round(precision_score, 2), "%")
    recall_score, _per_query_recall = calculate_mean_recall(qrels, relevant_by_query, ranked_ids)
    print("Mean Recall:", round(recall_score, 2), "%")
    precision_at_10, _per_query_precision = calculate_precision_at_k(qrels, ranked_ids)
    print("Mean Precision@10:", round(precision_at_10, 2), "%")

# Evaluate the system using the QueryRefiner-expanded queries
evaluate_system(queries_with_expansion, "النظام مع توسعة الاستعلام بواسطة QueryRefiner")


Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting anyascii (from textsearch>=0.0.21->con

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
استخراج المستندات للنظام: النظام مع توسعة الاستعلام بواسطة QueryRefiner: 100%|██████████| 5000/5000 [26:49<00:00,  3.11it/s]



== تقييم النظام: النظام مع توسعة الاستعلام بواسطة QueryRefiner ==
MAP: 47.63 %
MRR: 50.68 %
Mean Precision: 8.65 %
Mean Recall: 65.93 %
Mean Precision@10: 8.55 %


In [None]:
# ===== تثبيت الحزم المطلوبة =====
!pip install nltk==3.8.1 contractions scikit-learn tqdm fuzzywuzzy[speedup] symspellpy marisa-trie
!pip install country_converter --upgrade
!pip install datefinder

# ===== استيراد المكتبات =====
import os, re, string, json, joblib
from collections import defaultdict, Counter
import numpy as np, pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import nltk
import contractions
from fuzzywuzzy import process, fuzz
from symspellpy import SymSpell
import marisa_trie
from functools import lru_cache

# ===== Download NLTK resources =====
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# ===== Path configuration =====
DATASET_NAME = "quora"
# NOTE(review): this f-string has no placeholders; a plain string would do.
BASE_PATH = f"/content/drive/MyDrive/dataset_quora_dev/"
DOCS_PATH = os.path.join(BASE_PATH, "docs.tsv")
QUERIES_PATH = os.path.join(BASE_PATH, "queries.tsv")
QRELS_PATH = os.path.join(BASE_PATH, "qrels.tsv")
INVERTED_PATH = f"/content/drive/MyDrive/utils/inverted_index/{DATASET_NAME}_index_beir.json"
CLEAN_QUERIES = f"/content/drive/MyDrive/utils/queries_tokens/{DATASET_NAME}_queries_tokens_beir.tsv"
CLEANED_TSV_PATH = f"/content/drive/MyDrive/utils/clean_docs/{DATASET_NAME}_cleaned_docs_beir.tsv"
tfidf_model_path = f"/content/drive/MyDrive/tfidf_models/{DATASET_NAME}_tfidf_beir.joblib"

# Shared lemmatizer and English stop-word set used by the tokenizer.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN

def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

def processing(text: str) -> str:
    """Normalise raw text before tokenisation.

    Non-string input yields an empty string. Otherwise the text is
    lowercased, contractions are expanded, non-ASCII characters are dropped,
    then URLs, e-mail-like tokens, punctuation (replaced by spaces), 1-2
    digit numbers and numbers of 5+ digits are removed, in that order.
    Whitespace is not collapsed.
    """
    if not isinstance(text, str):
        return ""
    cleaned = expand_contractions(text.lower())
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    # Applied sequentially; the order matters (URLs go before punctuation).
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"\S+@\S+", ""),
        (f"[{re.escape(string.punctuation)}]", " "),
        (r"\b\d{1,2}\b", ""),
        (r"\b\d{5,}\b", ""),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize(text):
    """Tokenise, POS-tag and lemmatise `text`.

    Tagging runs on the full token sequence; stop words and single-character
    tokens are dropped afterwards. Returns the list of lemmas.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        if token in stop_words or len(token) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def clean_text(text):
    """Run `processing` on the text, then `tokenize` the result, and return the tokens."""
    return tokenize(processing(text))

def light_clean(text):
    """Lightweight normalisation without contraction handling or lemmatising.

    Non-string input yields an empty string. Otherwise the text is
    lowercased, URLs and e-mail-like tokens are removed, non-ASCII
    characters are dropped, punctuation becomes spaces, and whitespace runs
    are collapsed and trimmed.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text.lower()
    for pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+"):
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = cleaned.encode("ascii", errors="ignore").decode()
    cleaned = re.sub(f"[{re.escape(string.punctuation)}]", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

def load_dataset(path):
    """Read the documents TSV at `path` into a two-column DataFrame.

    The first line of the file is skipped and the columns are named
    `doc_id` and `text`. Missing texts are replaced by empty strings.
    """
    column_names = ["doc_id", "text"]
    docs = pd.read_csv(path, sep="\t", names=column_names, header=None, skiprows=1)
    return docs.assign(text=docs["text"].fillna(""))

# Load the raw documents (columns: doc_id, text) from DOCS_PATH.
df = load_dataset(DOCS_PATH)

def load_queries_tokens_from_tsv(path, header_key="query_id"):
    """Load pre-tokenised queries from a two-column, tab-separated file.

    Each useful line holds a query id, a tab, and space-separated tokens.
    Blank lines and lines without a second column are ignored.

    Args:
        path: Location of the TSV file (read as UTF-8).
        header_key: First-column value that identifies a header row. When the
            first usable line starts with this value it is skipped, so the
            column titles are not stored as a query. Pass ``None`` to keep
            every line.

    Returns:
        dict mapping each query id (str) to its list of token strings.
    """
    tokens_dict = {}
    first_record = True
    with open(path, encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            parts = stripped.split("\t")
            if len(parts) < 2:
                continue
            if first_record:
                first_record = False
                # Without this check the header became a query named "query_id".
                if header_key is not None and parts[0] == header_key:
                    continue
            tokens_dict[str(parts[0])] = parts[1].split()
    return tokens_dict

# Load the pre-tokenised queries (query id -> list of tokens) from CLEAN_QUERIES.
queries_tokens = load_queries_tokens_from_tsv(CLEAN_QUERIES)

def retrieve_docs(query_tokens, index):
    """Return the ids of all documents that contain at least one query token.

    `index` maps a token to an iterable of document ids; tokens missing from
    the index contribute nothing. The result is a list with no duplicates.
    """
    candidates = set()
    for term in query_tokens:
        postings = index.get(term, [])
        candidates.update(set(postings))
    return list(candidates)

def load_inverted_index(path):
    """Decode the UTF-8 JSON file at `path` and return the resulting object."""
    with open(path, encoding="utf-8") as index_file:
        raw_json = index_file.read()
    return json.loads(raw_json)

# Load the prebuilt inverted index (token -> document ids) from disk.
inverted_index = load_inverted_index(INVERTED_PATH)

tqdm.pandas(desc=">> تجهيز النصوص")

# Reuse the cached TF-IDF model when present; otherwise fit it once and cache it.
# The model is deserialised at most once: the previous version always called
# joblib.load a second time right after loading (or dumping) the same file.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you created.
if os.path.exists(tfidf_model_path):
    loaded_model = joblib.load(tfidf_model_path)
else:
    vectorizer = TfidfVectorizer(
        preprocessor=processing,
        tokenizer=tokenize,
        lowercase=False,
        token_pattern=None
    )
    X = vectorizer.fit_transform(df["text"])
    tfidf_doc_ids = df["doc_id"].astype(str).tolist()
    os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
    loaded_model = {"vectorizer": vectorizer, "vectors": X, "doc_ids": tfidf_doc_ids}
    joblib.dump(loaded_model, tfidf_model_path)

# Both historical names are kept so other cells that read them still work.
tfidf_data = loaded_model
vectorizer = loaded_model["vectorizer"]
X = loaded_model["vectors"]
tfidf_doc_ids = loaded_model["doc_ids"]

def represent_query(tokens):
    """Join the tokens with single spaces and transform them with the module-level `vectorizer`."""
    query_text = " ".join(tokens)
    return vectorizer.transform([query_text])

def rank_docs(query_vec, retrieved_ids, doc_vectors, doc_ids, top_k=10, id_to_idx=None):
    """Rank candidate documents by cosine similarity to the query vector.

    Args:
        query_vec: Query representation accepted by `cosine_similarity`.
        retrieved_ids: Candidate document ids to score.
        doc_vectors: Row-indexable matrix of document vectors.
        doc_ids: Document ids aligned with the rows of `doc_vectors`.
        top_k: Maximum number of results to return.
        id_to_idx: Optional precomputed ``{doc_id: row}`` mapping for
            `doc_ids`. Pass it when ranking many queries so the mapping is
            not rebuilt on every call.

    Returns:
        List of ``(doc_id, similarity)`` pairs, best first; empty when none
        of the candidates is present in `doc_ids`.
    """
    if id_to_idx is None:
        id_to_idx = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # Keep ids and rows in lockstep: the similarity positions refer to the
    # filtered candidates, so indexing the unfiltered `retrieved_ids` would
    # mislabel every result after the first unknown id.
    known_ids = [doc_id for doc_id in retrieved_ids if doc_id in id_to_idx]
    if not known_ids:
        return []
    rows = [id_to_idx[doc_id] for doc_id in known_ids]
    sims = cosine_similarity(query_vec, doc_vectors[rows]).flatten()
    ranked_idx = np.argsort(-sims)[:top_k]
    return [(known_ids[i], sims[i]) for i in ranked_idx]

# ===== Integrate QueryRefiner with query expansion =====

# Read the preprocessed documents
processed_docs_df = pd.read_csv(CLEANED_TSV_PATH, sep="\t")

# Collect every whitespace-separated term from the processed_text column.
# Each cell goes through str() first, so a missing value contributes its
# string form as a term.
all_processed_terms = [
    term
    for doc in processed_docs_df["processed_text"]
    for term in str(doc).split()
]

# Term -> occurrence count, and the set of distinct terms.
term_frequencies = Counter(all_processed_terms)
processed_terms_set = set(term_frequencies)

# ===== كلاس QueryRefiner الجديد =====
class QueryRefiner:
    """Spell-correct queries with SymSpell and expand them with WordNet synonyms."""

    def __init__(self, processed_terms):
        """Build the vocabulary structures and load the SymSpell dictionary.

        Args:
            processed_terms: Iterable of corpus terms (duplicates allowed).

        Raises:
            FileNotFoundError: If the SymSpell frequency dictionary cannot be
                loaded from its fixed Drive location.
        """
        self.processed_terms = set(processed_terms)
        self.term_frequencies = Counter(processed_terms)
        self.stop_words = set(stopwords.words("english"))

        # Configure SymSpell once per instance
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = os.path.join("/content/drive/MyDrive/utils", "symspell_data", "frequency_dictionary_en_82_765.txt")
        if not self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
            raise FileNotFoundError("فشل تحميل القاموس")

        # Extra dictionary entries with hand-picked counts
        self.sym_spell.create_dictionary_entry("change", 5000)
        self.sym_spell.create_dictionary_entry("environment", 5000)
        self.sym_spell.create_dictionary_entry("surroundings", 3000)

        # Trie over the vocabulary, intended for fast prefix completion
        self.trie = marisa_trie.Trie(self.processed_terms)

    def reduce_repeated_letters(self, word):
        """Collapse any character repeated three or more times into one."""
        return re.sub(r'(.)\1{2,}', r'\1', word)

    def correct_spelling(self, query: str) -> str:
        """Lowercase and de-elongate `query`, then return SymSpell's best compound correction.

        Falls back to the normalised query when SymSpell has no suggestion.
        """
        query = self.reduce_repeated_letters(query.lower())
        suggestions = self.sym_spell.lookup_compound(query, max_edit_distance=2)
        return suggestions[0].term if suggestions else query

    def suggest_correction(self, query: str) -> "str | None":
        """Return the corrected query, or None when it matches the input ignoring case."""
        corrected = self.correct_spelling(query)
        return corrected if corrected.lower() != query.lower() else None

    @staticmethod
    @lru_cache(maxsize=10000)
    def _lookup_synonyms(word, max_synonyms):
        """Cached WordNet lookup keyed by (word, max_synonyms) only.

        Keeping `self` out of the cache key means the cache does not pin
        instances in memory. The result is a tuple, so cached values cannot
        be mutated by callers.
        """
        found = []
        seen = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                name = lemma.name().lower()
                # isalpha() already rejects multi-word lemmas (they contain "_").
                if name != word and len(name) > 2 and name.isalpha() and name not in seen:
                    seen.add(name)
                    found.append(name)
                    if len(found) >= max_synonyms:
                        return tuple(found)
        return tuple(found)

    def get_synonyms(self, word, max_synonyms=10):
        """Return up to `max_synonyms` distinct WordNet synonyms of `word`.

        Only alphabetic lemma names longer than two characters and different
        from the lowercased word are kept, in the order WordNet yields them.
        A fresh list is returned on every call.
        """
        return list(self._lookup_synonyms(word.lower(), max_synonyms))

    def expand_query(self, query):
        """Correct the query, drop stop words and append synonyms of each word.

        Terms are emitted once each, in first-seen order, so the expanded
        query is reproducible between runs (joining a set was not). The
        original query is returned when nothing remains after stop-word
        removal.
        """
        corrected = self.correct_spelling(query)
        expanded_terms = {}

        for word in corrected.lower().split():
            if word in self.stop_words:
                continue
            expanded_terms.setdefault(word, None)
            for synonym in self.get_synonyms(word):
                expanded_terms.setdefault(synonym, None)

        return " ".join(expanded_terms) if expanded_terms else query

# Create the shared QueryRefiner instance from the corpus terms
query_refiner = QueryRefiner(all_processed_terms)

def process_queries_with_expansion_refiner(queries_tokens):
    """Expand every tokenised query with the module-level `query_refiner`.

    Each token list is joined into a string, passed to
    `query_refiner.expand_query`, and the result is split back into tokens.
    Returns a new dict keyed by the same query ids.
    """
    return {
        qid: query_refiner.expand_query(" ".join(tokens)).split()
        for qid, tokens in queries_tokens.items()
    }

# ===== Example: expand all queries =====
expanded_queries_tokens = process_queries_with_expansion_refiner(queries_tokens)

# ===== Retrieve and rank documents for the expanded queries =====
for qid, tokens in list(expanded_queries_tokens.items())[:3]:  # e.g. the first 3 queries
    retrieved = retrieve_docs(tokens, inverted_index)
    query_vec = represent_query(tokens)
    ranked_docs = rank_docs(query_vec, retrieved, X, tfidf_doc_ids, top_k=5)
    print(f"Query ID: {qid}")
    print(f"Expanded Query Tokens: {tokens}")
    print("Top Documents:", ranked_docs)
    print("="*50)


# ===== دوال التقييم =====

def get_qrels(path=QRELS_PATH):
    """Load relevance judgements from a tab-separated qrels file.

    The file's first line is treated as a header and replaced by the column
    names `query_id`, `doc_id` and `relevance`. Relevance values that cannot
    be parsed as numbers count as 0.

    Returns:
        Tuple ``(qrel_dict, relevant_set)``: `qrel_dict` maps query id to
        ``{doc_id: relevance}``, and `relevant_set` maps query id to the set
        of doc ids whose relevance is above 0. Ids are strings.
    """
    df = pd.read_csv(path, sep="\t", names=["query_id", "doc_id", "relevance"], header=0)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)
    qrel_dict = defaultdict(dict)
    relevant_set = defaultdict(set)
    # Iterate the columns directly: iterrows() builds a Series per row and can
    # upcast integer ids to float when another column is float.
    rows = zip(
        df["query_id"].astype(str),
        df["doc_id"].astype(str),
        df["relevance"].tolist(),
    )
    for qid, docid, rel in rows:
        qrel_dict[qid][docid] = rel
        if rel > 0:
            relevant_set[qid].add(docid)
    return qrel_dict, relevant_set

def get_retrieved_docs_formatted(retrieved):
    """Strip the scores, keeping the ordered document ids for each query.

    `retrieved` maps a query id to a list of ``(doc_id, score)`` pairs.
    """
    formatted = {}
    for qid, scored_docs in retrieved.items():
        formatted[qid] = [doc_id for doc_id, _score in scored_docs]
    return formatted

def calculate_map(qrel, retrieved):
    """Mean average precision over the judged queries, as a percentage.

    Queries with no relevant document are skipped. For the others, the
    precision at each relevant hit is summed and divided by the number of
    relevant documents. Returns 0 when no query qualifies.
    """
    ap_values = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        if not relevant:
            continue
        hits = 0
        precision_total = 0
        for rank, doc_id in enumerate(retrieved.get(qid, []), start=1):
            if doc_id not in relevant:
                continue
            hits += 1
            precision_total += hits / rank
        ap_values.append(precision_total / len(relevant) if hits > 0 else 0)
    if not ap_values:
        return 0
    return np.mean(ap_values) * 100

def calculate_mrr(qrel, retrieved):
    """Mean reciprocal rank over all judged queries, as a percentage.

    A query scores 1/rank of its first relevant result, or 0 when none of
    its results is relevant. Returns 0 for an empty `qrel`.
    """
    reciprocal_ranks = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        ranking = retrieved.get(qid, [])
        first_hit = next(
            (1 / rank for rank, doc_id in enumerate(ranking, start=1) if doc_id in relevant),
            0,
        )
        reciprocal_ranks.append(first_hit)
    if not reciprocal_ranks:
        return 0
    return np.mean(reciprocal_ranks) * 100

def calculate_mean_precision(qrel, retrieved):
    """Mean precision over all judged queries, as a percentage.

    Precision is the share of a query's returned documents that are
    relevant; a query with no returned documents scores 0. Returns 0 for an
    empty `qrel`.
    """
    per_query = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        ranking = retrieved.get(qid, [])
        if len(ranking) == 0:
            per_query.append(0)
            continue
        hits = sum(1 for doc_id in ranking if doc_id in relevant)
        per_query.append(hits / len(ranking))
    if not per_query:
        return 0
    return np.mean(per_query) * 100

def calculate_mean_recall(qrel, real_relevant, retrieved):
    """Mean recall over all judged queries.

    `real_relevant` maps a query id to its set of relevant doc ids. A query
    with no relevant documents scores 0.

    Returns:
        Tuple ``(mean recall as a percentage, per-query recall list)``, or
        ``(0, [])`` for an empty `qrel`.
    """
    per_query = []
    for qid in qrel:
        relevant = real_relevant.get(qid, set())
        if len(relevant) == 0:
            per_query.append(0)
            continue
        ranking = retrieved.get(qid, [])
        found = sum(1 for doc_id in ranking if doc_id in relevant)
        per_query.append(found / len(relevant))
    if not per_query:
        return 0, []
    return np.mean(per_query) * 100, per_query

def calculate_precision_at_k(qrel, retrieved, k=10):
    """Mean precision@k over all judged queries.

    Only the first `k` results of each query are considered, and the hit
    count is always divided by `k`, even when fewer were returned.

    Returns:
        Tuple ``(mean precision@k as a percentage, per-query values)``, or
        ``(0, [])`` for an empty `qrel`.
    """
    per_query = []
    for qid, judgements in qrel.items():
        relevant = {doc for doc, grade in judgements.items() if grade > 0}
        top_results = retrieved.get(qid, [])[:k]
        hits = sum(1 for doc_id in top_results if doc_id in relevant)
        per_query.append(hits / k)
    if not per_query:
        return 0, []
    return np.mean(per_query) * 100, per_query

# ===== دالة تقييم النظام مع توسعة الاستعلام =====

def evaluate_system(queries_tokens, system_name="With QueryRefiner Expansion"):
    """Retrieve and rank documents for every query, then print the metrics.

    For each query the candidates come from the inverted index and are
    ranked against the TF-IDF matrix (top 10 kept). MAP, MRR, mean
    precision, mean recall and mean precision@10 are printed as percentages
    rounded to two decimals. Nothing is returned.
    """
    rankings = {}
    progress = tqdm(queries_tokens.items(), desc=f"استخراج المستندات للنظام: {system_name}")
    for qid, tokens in progress:
        candidates = retrieve_docs(tokens, inverted_index)
        rankings[qid] = rank_docs(represent_query(tokens), candidates, X, tfidf_doc_ids, top_k=10)

    qrels, relevant_by_query = get_qrels()
    ranked_ids = get_retrieved_docs_formatted(rankings)

    print(f"\n== تقييم النظام: {system_name} ==")
    map_score = calculate_map(qrels, ranked_ids)
    print("MAP:", round(map_score, 2), "%")
    mrr_score = calculate_mrr(qrels, ranked_ids)
    print("MRR:", round(mrr_score, 2), "%")
    precision_score = calculate_mean_precision(qrels, ranked_ids)
    print("Mean Precision:", round(precision_score, 2), "%")
    recall_score, _per_query_recall = calculate_mean_recall(qrels, relevant_by_query, ranked_ids)
    print("Mean Recall:", round(recall_score, 2), "%")
    precision_at_10, _per_query_precision = calculate_precision_at_k(qrels, ranked_ids)
    print("Mean Precision@10:", round(precision_at_10, 2), "%")

# ===== Run the evaluation on the expanded queries =====
evaluate_system(expanded_queries_tokens, "النظام مع توسعة الاستعلام بواسطة QueryRefiner")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Query ID: query_id
Expanded Query Tokens: ['schoolbook', 'text', 'textbook']
Top Documents: [('152620', np.float64(0.6575043527286554)), ('499049', np.float64(0.626028443010714)), ('12220', np.float64(0.6076405510167578)), ('10830', np.float64(0.6032707625876371)), ('499048', np.float64(0.5939698364225973))]
Query ID: 318
Expanded Query Tokens: ['flavor', 'aspect', 'moderator', 'expression', 'look', 'face', 'flavour', 'feel', 'quota', 'feeling', 'tone', 'looking', 'spirit']
Top Documents: [('78199', np.float64(0.372801521466937)), ('374620', np.float64(0.3650840664733963)), ('374619', np.float64(0.3650840664733963)), ('130782', np.float64(0.36361559301751706)), ('235521', np.float64(0.358664252208077))]
Query ID: 378
Expanded Query Tokens: ['sprightliness', 'spirit', 'choose', 'animation', 'biography', 'affair', 'defy', 'lifetime', 'refuse', 'deny', 'prefer', 'matter', 'life', 'living', 'select', 'liveliness', 'scraps', 'opt', 'reject', 'garbage', 'take', 'unlike', 'dissimilar', 'lifes

استخراج المستندات للنظام: النظام مع توسعة الاستعلام بواسطة QueryRefiner: 100%|██████████| 5000/5000 [22:13<00:00,  3.75it/s]



== تقييم النظام: النظام مع توسعة الاستعلام بواسطة QueryRefiner ==
MAP: 40.28 %
MRR: 43.12 %
Mean Precision: 7.56 %
Mean Recall: 58.08 %
Mean Precision@10: 7.53 %
