In [2]:
import bm25s
import Stemmer  # optional: for stemming

# Build a small demo corpus: one sentence per document
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

# Optional: an English stemmer (the Stemmer module comes from PyStemmer)
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus with English stopwords removed and the stemmer applied;
# by default only token ids are kept (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Instantiate the BM25 model, then index the tokenized corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Tokenize the query with the same stemmer that was used for the corpus
query = "does the fish purr like a cat?"
query_tokens = bm25s.tokenize(query, stemmer=stemmer)

# Retrieve the top-2 hits as a (documents, scores) pair. Because `corpus` is
# passed, `results` holds the documents themselves; both arrays have shape
# (n_queries, k)
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=2)

# A single query was issued, so walk the first row of each array in lockstep
for i, (doc, score) in enumerate(zip(results[0], scores[0])):
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")

# Directory that receives the saved index
index_dir = "animal_index_bm25"

# Save only the index arrays...
retriever.save(index_dir)

# ...or save the corpus together with the model
retriever.save(index_dir, corpus=corpus)

# Later, load everything back when it is needed
import bm25s
reloaded_retriever = bm25s.BM25.load(index_dir, load_corpus=True)
# pass load_corpus=False if the corpus is not needed

Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Rank 1 (score: 1.59): a cat is a feline and likes to purr
Rank 2 (score: 0.48): a fish is a creature that lives in water and swims


Finding newlines for mmindex:   0%|          | 0.00/256 [00:00<?, ?B/s]

In [3]:
# Several queries can be tokenized and retrieved in a single batch
queries = ["What is a cat?", "is the bird a dog?"]

# A custom stopword list, used in place of the default one
stopwords = ["a", "the"]


# Any callable that takes a list of words and returns a list of words
# can serve as the stemmer
def stemmer_fn(lst):
    """Identity "stemmer": return a new list with every word unchanged."""
    return [word for word in lst]


# Tokenize the queries with the custom stopwords and stemmer
query_token_ids = bm25s.tokenize(queries, stopwords=stopwords, stemmer=stemmer_fn)

# Ask for token strings rather than token ids with return_ids=False
query_token_strs = bm25s.tokenize(queries, return_ids=False)

# Retrieval may map hits onto a different corpus, e.g. titles instead of full docs
titles = ["About Cat", "About Dog", "About Bird", "About Fish"]

# return_as="documents" yields only the documents, without the scores
results = retriever.retrieve(query_token_ids, corpus=titles, k=2, return_as="documents")

# `results` is a numpy array of shape (n_queries, k); show the hits of the first query
first_query_hits = results[0]
for i in range(len(first_query_hits)):
    print(f"Rank {i+1}: {results[0, i]}")

Split strings:   0%|          | 0/2 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/2 [00:00<?, ?it/s]

Split strings:   0%|          | 0/2 [00:00<?, ?it/s]

Reconstructing token strings:   0%|          | 0/2 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/2 [00:00<?, ?it/s]

Rank 1: About Cat
Rank 2: About Dog


In [None]:
# Start from an existing BM25 index
# ...

# Suppose the corpus is large
corpus = [
    "a very long document that is very long and has many words",
    "another long document that is long and has many words",
    # ...
]

# Directory that holds the saved index
big_index_dir = "bm25s_very_big_index"

# Write the BM25 index, together with the corpus, to disk
retriever.save(big_index_dir, corpus=corpus)

# Reload the index as a memory-mapped file: this is memory efficient and
# reduces the overhead of loading the full index into memory
retriever = bm25s.BM25.load(big_index_dir, mmap=True)

In [4]:
"""
Sometimes, you might want to have a corpus consisting of dicts rather than pure text.

Dicts, and any other JSON-serializable objects, are supported by bm25s. This example shows you how to pass a list of dicts.

Note: If the elements in your corpus are not JSON-serializable, the corpus will not be properly saved. In those cases, you
should avoid passing the corpus to `BM25(corpus=...)`.
"""
import bm25s

# Create your corpus here: each item is a dict holding the text plus its metadata

corpus_json = [
    {"text": "a cat is a feline and likes to purr", "metadata": {"source": "internet"}},
    {"text": "a dog is the human's best friend and loves to play", "metadata": {"source": "encyclopedia"}},
    {"text": "a bird is a beautiful animal that can fly", "metadata": {"source": "cnn"}},
    {"text": "a fish is a creature that lives in water and swims", "metadata": {"source": "i made it up"}},
]
# Only the "text" field of each dict is tokenized and indexed
corpus_text = [doc["text"] for doc in corpus_json]


# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

# Create the BM25 retriever and attach your corpus_json to it
retriever = bm25s.BM25(corpus=corpus_json)
# Now, index the corpus_tokens (the corpus_json is not used yet)
retriever.index(corpus_tokens)

# Query the corpus
query = "does the fish purr like a cat?"
query_tokens = bm25s.tokenize(query)

# Get top-k results as a tuple of (docs, scores). Note that each result is
# the corpus_json item at the index of the matching tokenized document
# (you are responsible for making sure each element in corpus_json
# corresponds to the element at the same position in your tokenized corpus)
results, scores = retriever.retrieve(query_tokens, k=2)

# A single query was issued, so only row 0 of each (n_queries, k) array is read
for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")

# You can save the arrays to a directory...
# Note that this will fail if your corpus passed to `BM25(corpus...)` is not serializable
retriever.save("animal_index_bm25")

# ...and load them when you need them (bm25s is already imported at the top of this cell)
reloaded_retriever = bm25s.BM25.load("animal_index_bm25", load_corpus=True)
# set load_corpus=False if you don't need the corpus

Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Rank 1 (score: 1.06): {'text': 'a cat is a feline and likes to purr', 'metadata': {'source': 'internet'}}
Rank 2 (score: 0.48): {'text': 'a fish is a creature that lives in waiter and swims', 'metadata': {'source': 'i made it up'}}


Finding newlines for mmindex:   0%|          | 0.00/364 [00:00<?, ?B/s]

In [5]:
"""
# Example: Indexing Natural Questions

This example shows how to build a BM25S index of the Natural Questions (NQ) dataset.

To run this example, you need to install the following dependencies:

```bash
pip install beir bm25s PyStemmer
```

Then, run with:

```bash
python examples/index_nq.py
```
"""
import beir.util
from beir.datasets.data_loader import GenericDataLoader
import Stemmer  # from PyStemmer

import bm25s
from bm25s.utils.beir import BASE_URL


def main(save_dir="datasets", index_dir="bm25s_indices/nq", dataset="nq"):
    """Download a BEIR dataset, index its corpus with BM25S and save the index.

    Args:
        save_dir: Directory the dataset archive is downloaded and unzipped into.
        index_dir: Directory the finished index (with its corpus records) is saved to.
        dataset: Dataset name substituted into ``BASE_URL`` to build the download URL.
    """
    dataset_url = BASE_URL.format(dataset)
    data_path = beir.util.download_and_unzip(dataset_url, save_dir)

    # load() returns three values; only the corpus (first one) is needed here
    loader = GenericDataLoader(data_folder=data_path)
    corpus, _queries, _qrels = loader.load(split="test")

    # Build, in one pass, the records stored next to the index and the
    # "title text" strings that are actually tokenized
    corpus_records = []
    corpus_lst = []
    for doc_id, doc in corpus.items():
        title, text = doc["title"], doc["text"]
        corpus_records.append({'id': doc_id, 'title': title, 'text': text})
        corpus_lst.append(title + " " + text)

    english_stemmer = Stemmer.Stemmer("english")
    corpus_tokenized = bm25s.tokenize(corpus_lst, stemmer=english_stemmer)

    # Attach the records to the retriever so they are saved along with the index
    retriever = bm25s.BM25(corpus=corpus_records)
    retriever.index(corpus_tokenized)
    retriever.save(index_dir)

# Entry point: run the indexing pipeline with its default arguments
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'beir'