In [None]:
import bm25s
import Stemmer  # optional: for stemming

In [None]:
# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)


In [None]:
corpus_tokens

In [None]:
# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)


In [None]:
corpus_tokens

In [None]:
retriever.

In [None]:
# Query the corpus
query = "Does the FISH purr like a cat?"
query_tokens = bm25s.tokenize(query, stemmer=stemmer)


In [None]:
query_tokens

In [None]:
# Get top-k results as a tuple of (doc ids, scores). Both are arrays of shape (n_queries, k)
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=2)

for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")

In [None]:
# You can save the arrays to a directory...
retriever.save("animal_index_bm25")

In [None]:
# You can save the corpus along with the model
retriever.save("animal_index_bm25_model", corpus=corpus)



In [None]:
# ...and load them when you need them
import bm25s
reloaded_retriever = bm25s.BM25.load("animal_index_bm25", load_corpus=True)
# set load_corpus=False if you don't need the corpus

In [None]:
# Query the corpus
query2 = "Why is my bird so noisy?"
query2_tokens = bm25s.tokenize(query2, stemmer=stemmer)

In [None]:
# Get top-k results as a tuple of (doc ids, scores). Both are arrays of shape (n_queries, k)
results2, scores2 = reloaded_retriever.retrieve(query2_tokens, corpus=corpus, k=2)

for i in range(results2.shape[1]):
    doc, score = results2[0, i], scores2[0, i]
    print(f"Rank {i+1} (score: {score:.2f}): {doc}")

-----

In [1]:
# load
import bm25s
import Stemmer 
stemmer = Stemmer.Stemmer("english")

In [7]:
# 1. Index the vault 

# specify the vault name to separate the indexes
vault_name = "testvault"

# get vault documents via API from user as a list  [vault_name, vault_documents (dict of file_name and file_content)]
# convert dict element vault_documents to two lists: file_names and file_contents

# this is temporary to read from local directory
import os
def load_files_from_directory(directory):
    file_names = []
    file_contents = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    file_names.append(file)
                    file_contents.append(f.read())
    return file_names, file_contents

file_names, file_contents = load_files_from_directory('./data/sample')

In [8]:
# remove special characters
import re

character_patterns = {
    "bold": r"\*\*(.*?)\*\*|__(.*?)__",  # Removes ** and __ while keeping the text
    "italic": r"\*(.*?)\*|_(.*?)_",  # Removes * and _ while keeping the text
    "inline_code": r"`(.*?)`",  # Removes ` while keeping the inline code text
    "links": r"$begin:math:display$(.*?)$end:math:display$$begin:math:text$.*?$end:math:text$",  # Removes the link notation, keeping the link text only
    "images": r"!$begin:math:display$.*?$end:math:display$$begin:math:text$.*?$end:math:text$",  # Removes the entire image markdown, as it typically doesn't have useful visible text
    "headings": r"^#+\s*(.*?)$",  # Removes the # characters while keeping the heading text
    "blockquotes": r"^>\s*(.*?)$",  # Removes the > character while keeping the quoted text
    "code_blocks": r"```(?:.|\n)*?```",  # Removes fenced code block notation while keeping the code
    "list_items": r"^[-*]\s+",  # Removes list markers (- or *) while keeping the list item text
    "extra_newlines": r"\n{2,}",  # Collapses multiple newlines into one
    "unicode escapes": r"(\\u[0-9a-fA-F]{4})+", 
    "outbound links": r"https?://\S+",
    "image linkes": r"!\[\[.*?\]\]|\!\[.*?\]\(.*?\)"
}

for key, value in character_patterns.items():
    file_contents = [re.sub(value, '', file) for file in file_contents]


In [10]:
# Tokenize the corpus and only keep the ids
corpus_tokens = bm25s.tokenize(file_contents, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Save model and file names 
retriever.save(f"./vault_indexes/{vault_name}_index/", corpus=file_names)

# send message to user that indexing is complete 

Split strings:   0%|          | 0/375 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/375 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/375 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/375 [00:00<?, ?it/s]

Finding newlines for mmindex:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

In [14]:
# 2. Query the vault index

# get query via API from user as a list [vault_name (string), query (string)]

# this is temporary 
vault_name = "testvault"
query = "software development"

# tokenize query
query_tokens = bm25s.tokenize(query, stemmer=stemmer)

# load the model 
retriever = bm25s.BM25.load(f"./vault_indexes/{vault_name}_index/", load_corpus=True)

# Get top-k results as a tuple of (doc ids, scores). Both are arrays of shape (n_queries, k)
results, scores = retriever.retrieve(query_tokens, k=10)

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Stem Tokens:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# return object with file names and scores in order
final_results = []

for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    final_results.append({'doc':doc['text'], 'score':round(float(score), 2)})

In [16]:
final_results

[{'doc': 'LLM Observability & Evaluation.md', 'score': 2.65},
 {'doc': 'Beyond the Imitation Game (BIG-Bench).md', 'score': 2.33},
 {'doc': 'Agile + Scrum.md', 'score': 2.32},
 {'doc': 'Some High-Tech Career Counseling Tips.md', 'score': 1.91},
 {'doc': 'SuperGLUE.md', 'score': 1.88},
 {'doc': 'CUNY DATA 607 Week 7 Assignment.md', 'score': 1.85},
 {'doc': 'Challenges in Evaluating AI systems.md', 'score': 1.51},
 {'doc': 'AI Index Report 2024 â€“ Artificial Intelligence Index.md',
  'score': 1.43},
 {'doc': 'Assumptions.md', 'score': 1.39},
 {'doc': 'W3 - Growth Models.md', 'score': 1.39}]