# Implementing BM25 and Lexical Search

In [None]:
!uv pip install llama-index-embeddings-huggingface

In [None]:
# Sample docs
from llama_index.core import Document

# Sample corpus: one Document per (title, text) pair, in the order listed.
# The title is stored under the "title" metadata key.
documents = [
    Document(text=text, metadata={"title": title})
    for title, text in [
        ("Machine Learning Basics",
         "Machine learning is a branch of artificial intelligence focused on building systems that learn from data."),
        ("Transformer Architecture",
         "Transformers are neural network models that use self-attention mechanisms to process sequential data."),
        ("Neural Network Code",
         "Python code for neural networks typically uses libraries like TensorFlow or PyTorch."),
        ("Backpropagation Algorithm",
         "The backpropagation algorithm calculates gradients by applying the chain rule backwards through the network."),
        ("BM25 Algorithm",
         "BM25 is a ranking function used in information retrieval systems based on term frequency."),
        ("AI Concepts",
         "Artificial intelligence concepts include reasoning, learning, and adaptation in complex environments."),
        ("Deep Learning Introduction",
         "Deep learning is a subset of machine learning that uses multi-layered neural networks to extract complex patterns."),
        ("CNN Architecture",
         "Convolutional Neural Networks (CNNs) are specialized neural architectures designed for image processing and computer vision tasks."),
        ("NLP Fundamentals",
         "Natural Language Processing (NLP) uses computational techniques to analyze and understand human language text and speech."),
        ("Reinforcement Learning",
         "Reinforcement learning is a training method based on rewarding desired behaviors and punishing undesired ones."),
        ("Vector Database Systems",
         "Vector databases store high-dimensional vectors for efficient similarity search and retrieval."),
        ("BERT Model",
         "The BERT language model uses bidirectional training to understand context from both directions in text."),
        ("Hybrid Retrieval",
         "Hybrid retrieval systems combine multiple search techniques like BM25 and vector search for improved results."),
    ]
]

In [13]:
from llama_index.core.schema import QueryBundle

# Helper function
def test_bm25_retrieval(retriever, queries):
    """Test BM25 retriever with a list of queries."""
    for query in queries:
        print(f"\n{'='*80}\nQuery: {query}\n{'='*80}")
        query_bundle = QueryBundle(query_str=query)
        results = retriever.retrieve(query_bundle)

        print(f"Found {len(results)} relevant documents\n")
        for i, result in enumerate(results):
            print(f"Result {i+1} (Score: {result.score:.8f}):")
            print(f"  {result.node.get_content()[:200]}...\n")

In [16]:
# Import necessary modules
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.node_parser import SentenceSplitter

print(f"Loaded {len(documents)} documents")

# SentenceSplitter's default chunk_overlap is 200 tokens, i.e. the same as this chunk_size.
# With overlap == chunk size, any document that needs more than one chunk would have almost
# the whole previous chunk repeated in the next one. Disable overlap explicitly, as the
# BM25-vs-vector comparison cell does. (Each sample document fits in a single chunk.)
splitter = SentenceSplitter(chunk_size=200, chunk_overlap=0)
nodes = splitter.get_nodes_from_documents(documents)

# Create BM25 Retriever - no embeddings model needed here!
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=3
)

# Run test queries (natural-language questions)
test_queries = [
    "What is machine learning?",
    "How do transformers work?",
    "Explain the limitations of BM25",
    "Python code examples for neural networks"
]
test_bm25_retrieval(bm25_retriever, test_queries)

# Compare with specific technical terms
technical_queries = [
    "dropout regularization technique",
    "backpropagation algorithm",
    "cross-entropy loss function",
    "BERT pre-training objective"
]
print("\n\nTesting with technical queries:")
test_bm25_retrieval(bm25_retriever, technical_queries)


Loaded 13 documents

Query: What is machine learning?
Found 3 relevant documents

Result 1 (Score: 1.72518969):
  Machine learning is a branch of artificial intelligence focused on building systems that learn from data....

Result 2 (Score: 1.36624670):
  Deep learning is a subset of machine learning that uses multi-layered neural networks to extract complex patterns....

Result 3 (Score: 0.65637398):
  Reinforcement learning is a training method based on rewarding desired behaviors and punishing undesired ones....


Query: How do transformers work?
Found 3 relevant documents

Result 1 (Score: 1.29171598):
  Transformers are neural network models that use self-attention mechanisms to process sequential data....

Result 2 (Score: 0.00000000):
  The BERT language model uses bidirectional training to understand context from both directions in text....

Result 3 (Score: 0.00000000):
  Vector databases store high-dimensional vectors for efficient similarity search and retrieval....


Query:

In [17]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.schema import QueryBundle
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Report how many sample documents (defined in an earlier cell) are available.
print(f"Created {len(documents)} sample documents")

# Create retrievers
def create_retrievers(
    documents,
    similarity_top_k=5,
    embed_model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=2000,
):
    """Build a BM25 retriever and a vector retriever over the same nodes.

    Args:
        documents: Documents to parse into nodes and index.
        similarity_top_k: Number of hits each retriever returns (default 5).
        embed_model_name: HuggingFace model name used to embed nodes for the
            vector retriever.
        chunk_size: Chunk size passed to ``SentenceSplitter``. The default of
            2000 is deliberately large so that each short sample document
            stays a single node.

    Returns:
        dict with keys ``"BM25"`` and ``"Vector"`` mapping to the two retrievers.
    """
    # Parse into nodes; no overlap, so a document that does get split is not duplicated.
    parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=0)
    nodes = parser.get_nodes_from_documents(documents)

    # Lexical retriever: built directly from the nodes, no embedding model involved.
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes, similarity_top_k=similarity_top_k)

    # Vector retriever: same nodes, embedded with the requested HuggingFace model.
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    vector_index = VectorStoreIndex(
        nodes,
        embed_model=embed_model
    )
    vector_retriever = vector_index.as_retriever(
        similarity_top_k=similarity_top_k)

    return {"BM25": bm25_retriever, "Vector": vector_retriever}

# Compare retrievers
def compare_retrievers(retrievers, queries):
    """Run every query through every retriever and collect the hit titles.

    Args:
        retrievers: Mapping of display name -> retriever exposing ``retrieve``.
        queries: Query strings to run against each retriever.

    Returns:
        dict mapping each retriever name to a list with one entry per query,
        each entry being ``{"query": <str>, "titles": [<title or None>, ...]}``
        where titles come from each hit's ``node.metadata`` "title" key.
    """
    collected = {}

    for label in retrievers:
        engine = retrievers[label]
        per_query = []

        for text in queries:
            print(f"Running {label} retriever on: {text}")
            hits = engine.retrieve(QueryBundle(query_str=text))

            # Keep only the titles: easier to read than raw node contents.
            hit_titles = []
            for hit in hits:
                hit_titles.append(hit.node.metadata.get("title"))

            per_query.append({"query": text, "titles": hit_titles})

        collected[label] = per_query

    return collected

# Display the results function
def display_results(results):
    """Print, per query, what each method found plus the BM25/Vector overlap.

    Args:
        results: Mapping of method name -> list of
            ``{"query": str, "titles": list}`` entries, one per query, in the
            same query order for every method. Must contain the keys
            ``"BM25"`` and ``"Vector"``.

    Returns:
        None. Everything is printed.

    Overlap and "unique to" lists are printed in rank order (BM25 order for
    the overlap and BM25-only lists, Vector order for the Vector-only list),
    so the output is identical from run to run. Missing titles (``None``) are
    shown as "(untitled)".
    """
    def as_label(title):
        # A missing title would make str.join raise TypeError.
        return "(untitled)" if title is None else str(title)

    def ordered_unique(titles):
        # dict.fromkeys drops duplicates while keeping first-seen (rank) order.
        return list(dict.fromkeys(as_label(t) for t in titles))

    def joined_or_none(titles):
        return ", ".join(titles) if titles else "None"

    for query_idx, entry in enumerate(results["BM25"]):
        print(f"\n\nQuery: {entry['query']}")
        print("-" * 50)

        for method in results:
            titles = results[method][query_idx]["titles"]
            print(f"{method} found: {', '.join(as_label(t) for t in titles)}")

        # Calculate overlap; sets are used only for membership tests,
        # never iterated, so hash ordering cannot leak into the output.
        bm25_titles = ordered_unique(results["BM25"][query_idx]["titles"])
        vector_titles = ordered_unique(results["Vector"][query_idx]["titles"])
        bm25_set = set(bm25_titles)
        vector_set = set(vector_titles)

        overlap = [t for t in bm25_titles if t in vector_set]
        only_bm25 = [t for t in bm25_titles if t not in vector_set]
        only_vector = [t for t in vector_titles if t not in bm25_set]

        print(f"Overlap: {len(overlap)} documents ({joined_or_none(overlap)})")
        print(f"Unique to BM25: {joined_or_none(only_bm25)}")
        print(f"Unique to Vector: {joined_or_none(only_vector)}")


# Queries used to compare the two retrievers, from a natural-language question
# to short keyword-style phrases.
queries = [
    "What is machine learning?",
    "transformer architecture",
    "python neural network code",
    "backpropagation algorithm",
    "information retrieval",
]

# Build both retrievers, run every query through each, then print the comparison.
retrievers = create_retrievers(documents)
results = compare_retrievers(retrievers=retrievers, queries=queries)
display_results(results=results)

Created 13 sample documents
Running BM25 retriever on: What is machine learning?
Running BM25 retriever on: transformer architecture
Running BM25 retriever on: python neural network code
Running BM25 retriever on: backpropagation algorithm
Running BM25 retriever on: information retrieval
Running Vector retriever on: What is machine learning?
Running Vector retriever on: transformer architecture
Running Vector retriever on: python neural network code
Running Vector retriever on: backpropagation algorithm
Running Vector retriever on: information retrieval


Query: What is machine learning?
--------------------------------------------------
BM25 found: Machine Learning Basics, Deep Learning Introduction, Reinforcement Learning, AI Concepts, Hybrid Retrieval
Vector found: Machine Learning Basics, Deep Learning Introduction, Reinforcement Learning, AI Concepts, NLP Fundamentals
Overlap: 4 documents (Deep Learning Introduction, Machine Learning Basics, AI Concepts, Reinforcement Learning)
Un

# Key observations:

1. BM25 performs better on keyword-heavy and technical queries
2. Vector search performs better on semantic queries
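3. The overlap between results varies by query and can be surprisingly small, although for broad queries the two methods largely agree (4 of the top 5 for "What is machine learning?")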
4. This suggests that combining both methods could yield better results