In [None]:
!pip install datasets rank_bm25 scikit-learn tqdm

In [None]:
import os
import numpy as np
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Step 1: Download the dataset
DATASET = "BeIR/quora"
# Load the "corpus" and "queries" configs in turn, keeping the same-named split of each.
corpus, queries = (
    load_dataset(DATASET, config)[config] for config in ("corpus", "queries")
)

In [None]:
# Exploratory: try to load (and display) a "qrels" config of the same dataset.
# NOTE(review): BeIR/quora may only expose "corpus" and "queries" configs, with
# relevance judgments living in a separate "-qrels" repository — confirm.
load_dataset(DATASET, "qrels")

In [None]:
# Step 3: Tokenize the corpus
# Each document's "text" field is split on single spaces; no lower-casing or
# punctuation handling is applied.
tokenized_corpus = [record["text"].split(" ") for record in corpus]

# Step 4: Initialize BM25 over the tokenized documents
bm25 = BM25Okapi(tokenized_corpus)

# Step 5: Function to retrieve top N documents for a query
def retrieve_top_n(query, n=5):
    """Return the corpus positions of the ``n`` highest-scoring documents.

    The query string is split on single spaces, scored against every document
    by the module-level ``bm25`` index, and the document positions are
    returned in descending score order.

    Args:
        query: Raw query text.
        n: Maximum number of positions to return.

    Returns:
        NumPy array of at most ``n`` integer positions into the corpus.
    """
    doc_scores = bm25.get_scores(query.split(" "))
    ranking = np.argsort(doc_scores)[::-1]
    return ranking[:n]

In [None]:
# Download the ground-truth relevance judgments (qrels) for the BeIR quora DATASET.
# The original cell ended in a dangling `ground =`, which is a SyntaxError.
# NOTE(review): assumes the qrels live in the companion "<DATASET>-qrels"
# repository with a "test" split — confirm against the dataset card.
ground = load_dataset(f"{DATASET}-qrels", split="test")

In [None]:
# Sanity check: top-ranked corpus positions for the first query, plus the corpus size.
# Computed here directly so the cell does not depend on a variable that is only
# bound by a later cell (which fails on Restart & Run All).
retrieve_top_n(queries[0]["text"]), len(corpus)

In [None]:
# Step 6: Run BM25 retrieval for every query and keep the top documents.
# (No quality metric is computed here; this only collects and dumps results.)
results = []

for query in tqdm(queries):
    query = query["text"]
    top_docs_indices = retrieve_top_n(query)
    # Cast each index to a plain int before using it to index `corpus`.
    top_docs = [corpus[int(idx)] for idx in top_docs_indices]
    results.append((query, top_docs))

# Print some example results. Slicing (instead of range(5)) cannot raise
# IndexError when fewer than five queries were processed.
for query, top_docs in results[:5]:
    print(f"Query: {query}")
    print("Top documents:")
    for doc in top_docs:
        print(f"- {doc}")
    print()

# Optional: Save results to a file.
# An explicit UTF-8 encoding keeps non-ASCII question text from failing on
# platforms whose default encoding is not UTF-8.
output_file = "bm25_quora_results.txt"
with open(output_file, 'w', encoding="utf-8") as f:
    for query, top_docs in results:
        f.write(f"Query: {query}\n")
        f.write("Top documents:\n")
        for doc in top_docs:
            f.write(f"- {doc}\n")
        f.write("\n")
print(f"Results saved to {output_file}")


In [None]:
!pip install tantivy

In [None]:
import os
import json
from tqdm import tqdm
import tantivy
from typing import Iterable, List
import shutil


# Dataset directory name used to build the data/<DATASET>/... paths in this cell.
# Taken from the DATASET environment variable, falling back to "quora".
# NOTE(review): this rebinds the DATASET name that the first cell set to
# "BeIR/quora"; re-running earlier cells afterwards will see this value instead.
DATASET = os.getenv("DATASET", "quora")

def read_file(file_name: str) -> Iterable[tuple]:
    """Stream ``(_id, text)`` pairs from a JSON Lines file.

    Args:
        file_name: Path to a file holding one JSON object per line, each with
            ``"_id"`` and ``"text"`` keys.

    Yields:
        A ``(row["_id"], row["text"])`` tuple for every non-blank line.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks ``"_id"`` or ``"text"``.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(file_name, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            row = json.loads(line)
            yield row["_id"], row["text"]


def main():
    """Build an on-disk tantivy index from ``data/<DATASET>/corpus.jsonl``.

    Any existing index directory at ``data/<DATASET>/bm25.tantivy`` is removed
    and recreated, then every ``(_id, text)`` row yielded by ``read_file`` is
    added as a document with fields ``doc_id`` and ``body``.

    Raises:
        FileNotFoundError: If the corpus file does not exist. This is checked
            before the old index is deleted, so a missing input cannot wipe a
            previously built index.
    """
    file_name = f"data/{DATASET}/corpus.jsonl"  # DATASET collection
    file_out = f"data/{DATASET}/bm25.tantivy"  # output directory

    if not os.path.exists(file_name):
        raise FileNotFoundError(f"Corpus file not found: {file_name}")

    # Start from an empty index directory: remove any previous one recursively.
    if os.path.exists(file_out):
        shutil.rmtree(file_out)
    os.makedirs(file_out, exist_ok=True)

    # Declaring our schema: "body" uses the "en_stem" tokenizer, "doc_id" the
    # default one; both fields are stored so they can be read back from hits.
    schema_builder = tantivy.SchemaBuilder()
    schema_builder.add_text_field("body", stored=True, tokenizer_name="en_stem")
    schema_builder.add_text_field("doc_id", stored=True)
    schema = schema_builder.build()

    # Creating our index in the directory `file_out`.
    index = tantivy.Index(schema, path=file_out)

    writer = index.writer()

    for doc_id, doc_text in tqdm(read_file(file_name)):
        doc = tantivy.Document(
            doc_id=doc_id,
            body=doc_text
        )
        writer.add_document(doc)

    writer.commit()
    print("indexed")

# Build the index when this cell (or the code as a script) is executed directly.
if __name__ == '__main__':
    main()

In [None]:
import tantivy
import re
import json
import os

# Dataset directory name under data/. Read from the DATASET environment
# variable (default "quora") so the evaluation targets the same dataset as the
# index-building cell, which resolves DATASET the same way.
DATASET = os.getenv("DATASET", "quora")


def load_queries(dataset=None):
    """Load the queries that have at least one relevant document in the test qrels.

    Reads ``data/<dataset>/queries.jsonl`` (one JSON object per line, each with
    an ``"_id"`` key) and ``data/<dataset>/qrels/test.tsv`` (a header line
    followed by tab-separated ``query_id``, ``doc_id``, ``score`` rows).

    Args:
        dataset: Name of the dataset directory under ``data/``. Defaults to
            the module-level ``DATASET``.

    Returns:
        Dict mapping query id to the query's JSON row extended with a
        ``"doc_ids"`` list of the document ids whose score is greater than 0.
        Queries without any such document are left out.

    Raises:
        KeyError: If the qrels file references a query id that is absent from
            the queries file.
    """
    if dataset is None:
        dataset = DATASET

    queries = {}

    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open(f"data/{dataset}/queries.jsonl", "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines
            row = json.loads(line)
            queries[row["_id"]] = {**row, "doc_ids": []}

    with open(f"data/{dataset}/qrels/test.tsv", "r", encoding="utf-8") as file:
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(file, None)
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            query_id, doc_id, score = line.split("\t")
            if int(score) > 0:
                queries[query_id]["doc_ids"].append(doc_id)

    # Keep only the queries that ended up with at least one relevant document.
    return {
        query_id: query
        for query_id, query in queries.items()
        if query["doc_ids"]
    }


def sanitize_query_for_tantivy(query):
    r"""Replace query-syntax characters in ``query`` with spaces.

    Every occurrence of one of ``+ - ! ( ) { } [ ] ^ " ~ * ? : \ <`` is
    substituted by a single space; the characters are removed rather than
    backslash-escaped. Presumably these are characters with special meaning to
    the tantivy query parser — verify against the parser's grammar.

    Args:
        query: Raw query text.

    Returns:
        The query with each matched character replaced by one space; all
        other characters are left untouched.
    """
    special_characters = r'([+\-!(){}\[\]^"~*?:\\<])'
    return re.sub(special_characters, ' ', query)


def main():
    """Evaluate BM25 retrieval from the tantivy index against the test qrels.

    Opens the index at ``data/<DATASET>/bm25.tantivy/``, runs every query
    returned by ``load_queries`` (up to ``number_of_queries``) with a result
    limit of ``limit``, and prints:

    * total relevant documents found out of all relevant documents,
    * the pooled precision ``hits / (num_queries * limit)``,
    * "Average precision": the mean over queries of ``query_hits / limit``
      (i.e. mean precision at ``limit``, not rank-aware average precision),
    * "Average recall": the mean over queries of
      ``query_hits / number of relevant documents``.

    When there are no queries to evaluate, a message is printed and the
    function returns instead of dividing by zero.
    """
    # The schema mirrors the one used when the index was built.
    schema_builder = tantivy.SchemaBuilder()
    schema_builder.add_text_field("body", stored=True, tokenizer_name="en_stem")
    schema_builder.add_text_field("doc_id", stored=True)
    schema = schema_builder.build()
    index = tantivy.Index(schema, path=f"data/{DATASET}/bm25.tantivy/")

    searcher = index.searcher()

    def search_bm25(query, limit):
        """Return the stored documents of the top ``limit`` hits for ``query``."""
        parsed_query = index.parse_query(sanitize_query_for_tantivy(query), ['body'])
        top_hits = searcher.search(parsed_query, limit).hits
        return [
            searcher.doc(doc_address)
            for (_score, doc_address) in top_hits
        ]

    n = 0  # relevant documents seen across all evaluated queries
    hits = 0  # relevant documents that appeared in the results
    limit = 10
    number_of_queries = 100_000

    queries = load_queries()
    if not queries:
        # Without this guard the summary below would raise ZeroDivisionError.
        print("No queries with relevant documents found; nothing to evaluate.")
        return

    num_queries = 0

    recalls = []
    precisions = []

    for idx, query in enumerate(queries.values()):
        if idx >= number_of_queries:
            break

        num_queries += 1

        result = search_bm25(query["text"], limit)

        # A set gives constant-time membership checks for the loop below.
        found_ids = {hit["doc_id"][0] for hit in result}

        query_hits = 0
        for doc_id in query["doc_ids"]:
            n += 1
            if doc_id in found_ids:
                hits += 1
                query_hits += 1

        # load_queries only returns queries with a non-empty "doc_ids" list,
        # so this denominator is never zero.
        recalls.append(
            query_hits / len(query["doc_ids"])
        )
        precisions.append(
            query_hits / limit
        )

        print(f"Processing query: {query}, hits: {query_hits}")

    print(f"Total hits: {hits} out of {n}, which is {hits/n}")

    print(f"Precision: {hits/(num_queries * limit)}")

    average_precision = sum(precisions) / len(precisions)

    print(f"Average precision: {average_precision}")

    average_recall = sum(recalls) / len(recalls)

    print(f"Average recall: {average_recall}")


# Run the evaluation when this cell (or the code as a script) is executed directly.
if __name__ == "__main__":
    main()