In [51]:
from google.colab import drive
drive.mount('/content/drive')

# 设置项目路径（请确保路径与您自己的目录一致）
import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stopwords.words('english') and word not in string.punctuation]
    return tokens

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
!pip install sentence-transformers
!pip install faiss-cpu
!pip install rank-bm25

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
import os

def build_index(corpus):
    """
    corpus: 字典 {doc_id: document_text}
    返回：BM25 索引、SBERT 模型、FAISS GPU IVF 索引、文档向量、文档ID列表、文档文本列表
    """
    # 提取文档ID和文本
    doc_ids = list(corpus.keys())
    documents = [corpus[doc_id] for doc_id in doc_ids]

    # BM25 部分（同之前）
    tokenized_corpus = [preprocess(doc) for doc in tqdm(documents, desc="Preprocessing documents")]
    bm25 = BM25Okapi(tokenized_corpus)

    model = SentenceTransformer('all-mpnet-base-v2')
    doc_embeddings = model.encode(documents, convert_to_numpy=True, show_progress_bar=True)

    embeddings_dir = os.path.join(PROJECT_PATH, 'embeddings')
    # os.makedirs(embeddings_dir, exist_ok=True)  # 如果目录不存在则创建
    np.save(os.path.join(embeddings_dir, 'doc_embeddings.npy'), doc_embeddings)

    # 使用 IVF 索引优化 FAISS 检索
    d = doc_embeddings.shape[1]
    cpu_index = faiss.IndexFlatIP(d)  # 使用内积的 CPU 索引
    cpu_index.add(doc_embeddings)

    return bm25, model, cpu_index, doc_embeddings, doc_ids


def normalize_scores(scores):
    # 简单归一化到 [0, 1]
    min_val = np.min(scores)
    max_val = np.max(scores)
    return (scores - min_val) / (max_val - min_val + 1e-8)



In [65]:

def hybrid_retrieval(query, bm25, model, gpu_index, doc_embeddings, doc_ids, top_k=10, alpha=0.5):
    """
    采用 BM25 先召回候选集，再在候选集上使用 SBERT 进行精排。
    """
    # BM25 得分
    query_tokens = preprocess(query)
    bm25_scores = bm25.get_scores(query_tokens)
    # 选择 BM25 得分最高的候选集，如 top 500
    candidate_indices = np.argsort(bm25_scores)[::-1][:500]

    # 将候选集的向量提取出来
    candidate_embeddings = doc_embeddings[candidate_indices]

    # 对查询进行 SBERT 编码
    query_embedding = model.encode([query], convert_to_numpy=True)
    # 在候选集上计算内积相似度
    # 这里可以直接用 FAISS GPU 索引进行搜索，但更合适的是计算候选集相似度：
    sbert_scores = np.dot(candidate_embeddings, query_embedding[0])

    # BM25 scores for candidates
    bm25_candidates = bm25_scores[candidate_indices]
    # 归一化
    bm25_norm = normalize_scores(bm25_candidates)
    sbert_norm = normalize_scores(sbert_scores)

    # 线性加权融合
    final_scores_candidates = alpha * bm25_norm + (1 - alpha) * sbert_norm
    # 排序候选集
    ranked_candidate_indices = np.argsort(final_scores_candidates)[::-1][:top_k]
    final_doc_indices = candidate_indices[ranked_candidate_indices]

    ranked_doc_ids = [doc_ids[i] for i in final_doc_indices]
    ranked_scores = final_scores_candidates[ranked_candidate_indices]

    return ranked_doc_ids, ranked_scores

In [50]:
!pip install datasets
!pip install pytrec_eval

from datasets import load_dataset
import pytrec_eval
import random

def load_msmarco_hf():
    # 加载 MSMARCO 数据集，假设使用的是验证集（validation split）
    dataset = load_dataset("ms_marco", "v1.1")
    dev_data = dataset["validation"]
    dev_data = dataset["validation"].shuffle(seed=42).select(range(1000))

    queries = {}
    corpus = {}
    qrels = {}

    # 遍历每个样本，每个样本代表一个查询及其候选 passages
    for example in dev_data:
        # 获取 query_id 和 query 文本
        qid = str(example["query_id"])
        query_text = example["query"]
        queries[qid] = query_text

        # 获取 passages 中的信息
        passages_info = example["passages"]
        passage_texts = passages_info.get("passage_text", [])
        is_selecteds = passages_info.get("is_selected", [])

        # 对每个 passage，生成一个唯一的 doc id，并加入 corpus
        for i, (text, is_sel) in enumerate(zip(passage_texts, is_selecteds)):
            # 生成唯一的文档ID，例如 "queryid_i"
            doc_id = f"{qid}_{i}"
            corpus[doc_id] = text
            # 如果该 passage 被标记为相关，则加入 qrels
            if is_sel == 1:
                if qid not in qrels:
                    qrels[qid] = {}
                qrels[qid][doc_id] = 1

    return corpus, queries, qrels

corpus, queries, qrels = load_msmarco_hf()
print(f"Loaded {len(corpus)} documents, {len(queries)} queries, {len(qrels)} qrels.")



validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

Loaded 8232 documents, 1000 queries, 971 qrels.


In [55]:
doc_ids = list(corpus.keys())
documents = [corpus[doc_id] for doc_id in doc_ids]

# 定义一个简单的预处理函数（根据需要可以自行扩展）
def preprocess(text):
    return text.lower().split()


In [58]:
bm25, model, faiss_index, doc_embeddings, doc_ids = build_index(corpus)
run = {}
for qid, query in queries.items():
    ranked_doc_ids, scores = hybrid_retrieval(query, bm25, model, faiss_index, doc_embeddings, doc_ids, top_k=50, alpha=0.5)
    run[str(qid)] = {str(docid): float(score) for docid, score in zip(ranked_doc_ids, scores)}

Preprocessing documents: 100%|██████████| 8232/8232 [00:00<00:00, 113497.07it/s]


Batches:   0%|          | 0/258 [00:00<?, ?it/s]

In [64]:
# 5. 使用 pytrec_eval 进行评估
# pytrec_eval 要求 qrels 和 run 的格式均为：{query_id: {doc_id: relevance or score}}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'recip_rank', 'ndcg', 'map', 'recall'})
results = evaluator.evaluate(run)

# 计算各指标的平均值
avg_metrics = {}
for metric in ['recip_rank', 'ndcg', 'map', 'recall']:
    avg_metrics[metric] = sum(d.get(metric, 0) for d in results.values()) / len(results)

print("评估结果：")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

评估结果：
recip_rank: 0.4716
ndcg: 0.5809
map: 0.4650
recall: 0.0000
