In [11]:
import importlib
import Utils
# Re-execute the Utils module so that edits made to Utils.py take effect
# without restarting the kernel.
importlib.reload(Utils)  # This will print "Utils.py is loaded" again

import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

import bm25s
import Stemmer  # optional: for stemming

Utils.py is loaded
resource module not available on Windows


In [2]:
# Persistent Chroma client stored at path "db", using the library's default
# settings, tenant and database.
DB_PATH = "db"
client = chromadb.PersistentClient(
    path=DB_PATH, settings=Settings(), tenant=DEFAULT_TENANT, database=DEFAULT_DATABASE
)

In [3]:
import chromadb.utils.embedding_functions as embedding_functions

# Sentence embeddings are produced by a SentenceTransformer model through
# Chroma's embedding-function wrapper:
# https://docs.trychroma.com/docs/embeddings/embedding-functions
#
# Model choice: "intfloat/multilingual-e5-large-instruct" was noted as "3" on
# 3/7/2025 (presumably its position on the MTEB leaderboard linked below --
# TODO confirm), with the caveat that it only handles 514 max tokens.
# https://huggingface.co/spaces/mteb/leaderboard
# https://huggingface.co/intfloat/multilingual-e5-large-instruct
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large-instruct"

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

In [4]:
# Fetch the collection by name, creating it if it does not exist yet, and
# attach the SentenceTransformer embedding function to it.
collection_name = "Pinkbike_articles"
collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=sentence_transformer_ef,
)

In [10]:
# Read the collection (no filters passed) and keep only the 'documents' entry;
# this list is the corpus that the BM25 index is built from.
corpus = collection.get()['documents']

In [12]:
# optional: create a stemmer
# (the same stemmer object is reused later when tokenizing the query)
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
# Stopwords are set to "en" and tokens are passed through the stemmer above.
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus
# (no arguments are passed, so the library's default BM25 settings apply)
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

Split strings:   0%|          | 0/55 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/55 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/55 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/55 [00:00<?, ?it/s]

In [21]:
# Run a single free-text query against the BM25 index; the query is tokenized
# with the same stemmer that was used for the corpus.
query = "2025 predictions for mountain biking"
query_tokens = bm25s.tokenize(query, stemmer=stemmer)

# retrieve() returns a pair (doc ids, scores), each an array shaped
# (n_queries, k). Passing `corpus=corpus` would return documents instead of ids.
results_bm25s, scores = retriever.retrieve(query_tokens, k=5)

# Only one query was submitted, so row 0 holds all of its hits, best first.
for i, (doc, score) in enumerate(zip(results_bm25s[0], scores[0])):
    print(f"Rank {i+1} (score: {score:.2f}): {doc}: {corpus[doc][:50]}")

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Rank 1 (score: 2.11): 24: Dario DiGiulio's 2025 Predictions: Behold, the vis
Rank 2 (score: 1.11): 4: Crankworx Announces Details for 2025 Cairns Festiv
Rank 3 (score: 1.11): 16: Reframing MTB Announces 2025 Bristol Conference Da
Rank 4 (score: 1.08): 51: Crankworx Cairns Announces New Events for 2025: PR
Rank 5 (score: 1.04): 11: Malverns Classic Returns in 2025 With New Sponsors


In [22]:
# Display the raw retrieval result: corpus indices of the hits, one row per query.
results_bm25s

array([[24,  4, 16, 51, 11]], dtype=int32)

In [24]:
# 24 is the top-ranked corpus index returned by the BM25 query.
# NOTE(review): presumably this hashes the document text into the id used for
# the record in the collection -- confirm against Utils.generate_sha256_id.
Utils.generate_sha256_id(corpus[24])


'8a5f5655ccd86e15bd8d63675203425dd5332601ad5ea3aa5ed05043eb00644d'

In [23]:
# https://gist.github.com/srcecde/eec6c5dda268f9a58473e1c14735c7bb

from collections import defaultdict
def reciprocal_rank_fusion(*list_of_list_ranks_system, K=60):
    """
    Merge several ranked result lists into one ranking via Reciprocal Rank Fusion.

    Each item earns 1 / (rank + K) from every list it appears in, where rank
    is its 1-based position in that list; contributions are summed per item.

    Args:
        *list_of_list_ranks_system: Any number of iterables of hashable items,
            each ordered from best to worst.
        K (int): Constant added to every rank in the RRF formula (default is 60).

    Returns:
        A 2-tuple ``(scored, ordered)``. ``scored`` is a list of
        ``(item, fused_score)`` pairs sorted by descending score; ``ordered``
        is the list of items alone, in that same order. Items with equal
        scores stay in the order they were first encountered.
    """
    fused_scores = defaultdict(float)

    # Walk each ranking in turn; positions are counted from 1.
    for ranking in list_of_list_ranks_system:
        position = 0
        for candidate in ranking:
            position += 1
            fused_scores[candidate] += 1 / (position + K)

    # Highest fused score first; the sort is stable, so ties keep insertion order.
    scored = sorted(
        fused_scores.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    ordered = [candidate for candidate, _ in scored]
    return scored, ordered


# Example ranked lists from different sources
# (each list is ordered best-first; a document may appear in more than one list)
ir_system_a = ['Document1', 'Document3', 'Document5', 'Document7']
ir_system_b = ['Document2', 'Document1', 'Document4']
ir_system_c = ['Document5', 'Document3', 'Document2']

# Combine the lists using RRF
# The result is a tuple: (list of (document, score) pairs, list of documents),
# both sorted by descending fused score.
combined_list = reciprocal_rank_fusion(ir_system_a, ir_system_b, ir_system_c)
print(combined_list)

([('Document1', 0.03252247488101534), ('Document5', 0.032266458495966696), ('Document2', 0.032266458495966696), ('Document3', 0.03225806451612903), ('Document4', 0.015873015873015872), ('Document7', 0.015625)], ['Document1', 'Document5', 'Document2', 'Document3', 'Document4', 'Document7'])


In [None]:
# Same query text as the BM25 search cell; this cell has no execution count yet.
query = "2025 predictions for mountain biking"
