In [1]:
import heapq
import json
import logging
import os
from pathlib import Path

from beir.datasets.data_loader import GenericDataLoader
from rank_bm25 import BM25Okapi


  from tqdm.autonotebook import tqdm


In [2]:
# Project layout is resolved relative to the current working directory:
# the project root is taken to be the directory one level above it.
project_root = Path.cwd().parent

# Input dataset folder and the folder that receives generated artifacts.
data_path = project_root.joinpath("datasets", "scifact")
output_dir = project_root.joinpath("outputs")

# Create the output folder on first run; an existing folder is left alone.
output_dir.mkdir(exist_ok=True)

In [4]:
print("Loading dataset (SciFact)...") # load test split
corpus, queries, qrels = GenericDataLoader(
    data_folder=str(data_path)
).load(split="test")

# Freeze the document order once: position i in `tokenized_corpus`
# corresponds to `corpus_ids[i]`.
corpus_ids = [*corpus.keys()]

# Each document is indexed as "<title> <text>", lower-cased and split on
# whitespace; a missing title or text falls back to the empty string.
tokenized_corpus = [
    " ".join((doc.get("title", ""), doc.get("text", ""))).lower().split()
    for doc in (corpus[doc_id] for doc_id in corpus_ids)
]
bm25 = BM25Okapi(tokenized_corpus)

Loading dataset (SciFact)...


100%|██████████| 5183/5183 [00:00<00:00, 292081.98it/s]


In [5]:
# Number of highest-scoring documents kept for every query.
TOP_K = 100

print(f"Retrieving for {len(queries)} queries...")

# Maps query_id -> {doc_id: score} for the TOP_K best-scoring documents.
results = {}

for query_id, query_text in queries.items():
    # Lower-case + whitespace split, matching how the corpus was tokenized.
    tokenized_query = query_text.lower().split()

    # One score per indexed document, aligned position-by-position with
    # `corpus_ids` (they are zipped together below).
    doc_scores = bm25.get_scores(tokenized_query)

    # nlargest selects the TOP_K best pairs without sorting the whole corpus.
    # It is documented as equivalent to sorted(..., reverse=True)[:TOP_K],
    # so ordering (including ties) matches a full descending sort.
    top_hits = heapq.nlargest(
        TOP_K,
        zip(corpus_ids, doc_scores),
        key=lambda pair: pair[1],
    )

    # Cast to built-in float before JSON serialization
    # (scores are presumably numpy floats; confirm against rank_bm25).
    results[query_id] = {doc_id: float(score) for doc_id, score in top_hits}

output_file = output_dir / "sparse_results.json"
# Explicit encoding keeps the written file independent of the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f)

print(f"Successfully saved sparse results to {output_file}")

Retrieving for 300 queries...
Successfully saved sparse results to /home/frank_shan/dev/python/sparse-dense-ir-scifact/outputs/sparse_results.json
