In [1]:
import os
import csv
from collections import defaultdict
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import Document



In [6]:
from pathlib import Path

# Input files: everything is read from one dataset directory.
BASE_DIR = Path("./dataset/processed/msmarco-amharic-news_dataset/")
COLLECTION_PATH = BASE_DIR.joinpath("collection.tsv")
QUERIES_PATH = BASE_DIR.joinpath("queries_train.tsv")
QRELS_PATH = BASE_DIR.joinpath("qrels_train.txt")

# Output files: the rankings file lives under OUTPUT_DIR, which is created
# here (including missing parents) so later cells can write to it directly.
OUTPUT_DIR = Path("outputs")
BM25_RANKINGS_PATH = OUTPUT_DIR.joinpath("bm25_rankings.tsv")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


In [7]:
### Step 1: Load Collection (doc_id → text)
# Reads COLLECTION_PATH as tab-separated rows and builds `corpus`, a dict that
# maps the first field (doc_id, kept as a string) to the second field (text).
# Extra fields beyond the first two are ignored.
# NOTE(review): csv's default quoting is in effect, so a field that starts with
# a double quote is parsed as a quoted field. If the file was written without
# CSV quoting, pass quoting=csv.QUOTE_NONE — confirm against the producer.
corpus = {}
skipped_doc_rows = 0

# newline="" lets the csv module handle line endings itself, as its docs require.
with open(COLLECTION_PATH, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    for row in reader:
        if not row:
            continue  # blank line: nothing to load, nothing to report
        if len(row) < 2:
            skipped_doc_rows += 1  # malformed row: counted so the drop is visible
            continue
        doc_id, doc_text = row[:2]
        corpus[doc_id] = doc_text  # a repeated doc_id overwrites the earlier text

if skipped_doc_rows:
    print(f"Skipped {skipped_doc_rows} rows with fewer than 2 fields in {COLLECTION_PATH}")
print(f"Loaded {len(corpus)} documents.")

Loaded 49780 documents.


In [8]:
### Step 2: Load Queries (qid → text)
# Reads QUERIES_PATH as tab-separated rows and builds `queries`, a dict that
# maps the first field (qid, kept as a string) to the second field (query text).
# Extra fields beyond the first two are ignored.
# NOTE(review): csv's default quoting is in effect, so a field that starts with
# a double quote is parsed as a quoted field. If the file was written without
# CSV quoting, pass quoting=csv.QUOTE_NONE — confirm against the producer.
queries = {}
skipped_query_rows = 0

# newline="" lets the csv module handle line endings itself, as its docs require.
with open(QUERIES_PATH, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    for row in reader:
        if not row:
            continue  # blank line: nothing to load, nothing to report
        if len(row) < 2:
            skipped_query_rows += 1  # malformed row: counted so the drop is visible
            continue
        qid, query_text = row[:2]
        queries[qid] = query_text  # a repeated qid overwrites the earlier text

if skipped_query_rows:
    print(f"Skipped {skipped_query_rows} rows with fewer than 2 fields in {QUERIES_PATH}")
print(f" Loaded {len(queries)} queries.")

 Loaded 39729 queries.


In [9]:
### Step 3: Convert Collection to Documents (for BM25)
# One Document per corpus entry, in corpus order; the corpus key is passed
# through as the Document's doc_id.
documents = [
    Document(text=body, doc_id=key)
    for key, body in corpus.items()
]

### Step 4: Build BM25 Retriever
# The retriever is built over every document and configured to return the
# 100 highest-scoring matches per query.
# NOTE(review): no tokenizer/stemmer/language arguments are passed, so the
# library defaults apply — confirm they are appropriate for this corpus.
bm25_retriever = BM25Retriever.from_defaults(nodes=documents, similarity_top_k=100)

print("BM25 Retriever initialized.")

BM25 Retriever initialized.


In [11]:
### Step 5: Retrieve Top 100 & Save Results in MSMARCO Format (No Header)
# Writes one tab-separated row per (query, retrieved document), no header row.
# Columns, in order: qid, doc_id, rank, score
#
# newline="" follows the csv module's guidance for files handed to csv.writer;
# without it, Windows text mode turns the writer's line ending into "\r\r\n".
# lineterminator="\n" replaces csv's default "\r\n" so that rows end with a
# plain newline and no carriage return trails the score column.
with open(BM25_RANKINGS_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")

    for qid, query in queries.items():
        results = bm25_retriever.retrieve(query)

        # The slice caps output at 100 rows per query regardless of how many
        # results come back; rank is 1-based and follows the returned order.
        for rank, node in enumerate(results[:100], start=1):
            writer.writerow([qid, node.node_id, rank, node.score])

print(f"Saved top 100 BM25 retrieval results to {BM25_RANKINGS_PATH}")

Saved top 100 BM25 retrieval results to outputs/bm25_rankings.tsv
