In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers.ensemble import EnsembleRetriever
from langchain_chroma import Chroma
from langchain_core.documents import Document
from dotenv import load_dotenv

load_dotenv()


True

In [9]:
%pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
# --------------------------------------------------------------------
# SETUP: Create our sample company data
# --------------------------------------------------------------------
# `chunks` is the toy corpus used by every retriever in this notebook:
# 18 short hand-written sentences, one string per chunk.

chunks = [
    # Company / product statements. Some share entities ("Tesla",
    # "Cybertruck", "Starship", "Microsoft") and one repeats "Tesla" on purpose.
    "Microsoft acquired GitHub for 7.5 billion dollars in 2018.",
    "Tesla Cybertruck production ramp begins in 2024.",
    "Google is a large technology company with global operations.",
    "Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing...",
    "SpaceX develops Starship rockets for Mars missions.",
    # Describes an acquisition without naming either company.
    "The tech giant acquired the code repository platform for software development.",
    "NVIDIA designs Starship architecture for their new GPUs.",
    "Tesla Tesla financial quarterly results improved significantly.",
    "Cybertruck reservations exceeded company expectations.",
    "Microsoft is a large technology company with global operations.",
    # Pairs that share a keyword but use it in different senses
    # (Apple/apple, Python/python, Java/Java, Orange/Orange).
    "Apple announced new iPhone features for developers.",
    "The apple orchard harvest was excellent this year.",
    "Python programming language is widely used in AI.",
    "The python snake can grow up to 20 feet long.",
    "Java coffee beans are imported from Indonesia.",
    "Java programming requires understanding of object-oriented concepts.",
    "Orange juice sales increased during winter months.",
    "Orange County reported new housing developments."
]


In [11]:
# Wrap every raw string in a LangChain Document. The "source" metadata is the
# chunk's zero-based position in `chunks` (chunk_0, chunk_1, ...).
documents = []
for idx, text in enumerate(chunks):
    documents.append(
        Document(page_content=text, metadata={"source": f"chunk_{idx}"})
    )

# Echo the corpus using one-based numbering, followed by a separator rule.
print("Sample Data:")
for number, text in enumerate(chunks, start=1):
    print(f"Chunk {number}: {text}")
print("\n" + "=" * 80)

Sample Data:
Chunk 1: Microsoft acquired GitHub for 7.5 billion dollars in 2018.
Chunk 2: Tesla Cybertruck production ramp begins in 2024.
Chunk 3: Google is a large technology company with global operations.
Chunk 4: Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing...
Chunk 5: SpaceX develops Starship rockets for Mars missions.
Chunk 6: The tech giant acquired the code repository platform for software development.
Chunk 7: NVIDIA designs Starship architecture for their new GPUs.
Chunk 8: Tesla Tesla financial quarterly results improved significantly.
Chunk 9: Cybertruck reservations exceeded company expectations.
Chunk 10: Microsoft is a large technology company with global operations.
Chunk 11: Apple announced new iPhone features for developers.
Chunk 12: The apple orchard harvest was excellent this year.
Chunk 13: Python programming language is widely used in AI.
Chunk 14: The python snake can grow up to 20 feet

# Vector Retriever

In [12]:
print("Setting up the Vector Retriever..")

# Sentence-transformers model used to embed both the chunks and the queries.
embedding_model = HuggingFaceEmbeddings(
    model = "sentence-transformers/all-MiniLM-L6-v2"
)

# Build the Chroma collection with cosine distance.
# Deterministic ids keep this cell idempotent: without them every run is given
# fresh random ids, so re-running the cell on a live kernel can append
# duplicate copies of each chunk to the same in-memory collection and skew the
# top-k results. With fixed ids a re-run overwrites the existing entries.
vectorstore = Chroma.from_documents(
    documents = documents,
    embedding = embedding_model,
    ids = [f"chunk_{i}" for i in range(len(documents))],
    collection_metadata={"hnsw:space": "cosine"}
)

Setting up the Vector Retriever..


In [14]:
# Expose the vector store as a retriever that returns the top 2 matches.
vector_retriever = vectorstore.as_retriever(search_kwargs=dict(k=2))

# Smoke test: the query shares no company name with any chunk.
test_query = "space exploration company"
print(f"Testing '{test_query}'")

test_docs = vector_retriever.invoke(test_query)
for hit in test_docs:
    print(f"Found : {hit.page_content}")

Testing 'space exploration company'
Found : SpaceX develops Starship rockets for Mars missions.
Found : Google is a large technology company with global operations.


# BM25 Retriever

In [15]:
# The algorithm is called BM25 (not "BM2.5"); the status message is corrected.
print("Setting up BM25 Retriever")

# Keyword retriever over the same documents. `k` (number of results returned)
# is passed at construction time instead of mutating the retriever afterwards.
bm25_retriever = BM25Retriever.from_documents(documents, k=3)

Setting up BM2.5 Retriever


In [16]:
# Smoke test for the keyword retriever: a single exact term.
test_query = "Tesla"
print(f"Testing '{test_query}'")

test_docs = bm25_retriever.invoke(test_query)
for hit in test_docs:
    print(f"Found: {hit.page_content}")

Testing 'Tesla'
Found: Tesla Tesla financial quarterly results improved significantly.
Found: Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing...
Found: Tesla Cybertruck production ramp begins in 2024.


# Hybrid Retriever (Combination)

In [19]:
print("setting up Hybrid Retriever")

# Combine the two retrievers into one ensemble. Both contribute with equal
# weight (0.5 each); the weights are listed in the same order as `retrievers`.
hybrid_retriever = EnsembleRetriever(
    retrievers = [vector_retriever,bm25_retriever],
    weights = [0.5,0.5]
)
print("Setup Complete")

setting up Hybrid Retriver
Setup Complete


In [20]:
# Query 1: semantic wording mixed with an exact figure.
#
# What this demo is meant to show:
#   - the vector retriever is expected to handle the "purchase cost" phrasing
#   - BM25 is expected to match the literal "7.5 billion" tokens
#   - the hybrid retriever merges both ranked lists
test_query = "purchase cost 7.5 billion"
retrieved_chunks = hybrid_retriever.invoke(test_query)

# Print the merged results as a numbered list, then a blank line.
for rank, hit in enumerate(retrieved_chunks, start=1):
    print(f"{rank}. {hit.page_content}")
print()

print("Query 1 shows how hybrid finds exact financial info using both semantic understanding and keyword matching")

1. Microsoft acquired GitHub for 7.5 billion dollars in 2018.
2. Microsoft is a large technology company with global operations.
3. Orange County reported new housing developments.
4. Orange juice sales increased during winter months.

Query 1 shows how hybrid finds exact financial info using both semantic understanding and keyword matching


In [22]:

# Query 2: a broad concept plus a specific product name.
#
# What this demo is meant to show:
#   - the vector retriever is expected to handle "electric vehicle manufacturing"
#   - BM25 is expected to match the exact token "Cybertruck"
#   - the hybrid retriever merges both ranked lists
#
# `test_query` and `retrieved_chunks` are read again by the generation cell
# further down, so their names must stay as they are.
test_query = "electric vehicle manufacturing Cybertruck"
retrieved_chunks = hybrid_retriever.invoke(test_query)

# Print the merged results as a numbered list, then a blank line.
for rank, hit in enumerate(retrieved_chunks, start=1):
    print(f"{rank}. {hit.page_content}")
print()

print("Query 2 demonstrates combining product-specific terms with broader concepts")

1. Tesla Cybertruck production ramp begins in 2024.
2. Cybertruck reservations exceeded company expectations.
3. Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing...

Query 2 demonstrates combining product-specific terms with broader concepts


In [24]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
import os


# Render the retrieved chunks as a bulleted list. This is built outside the
# f-string so the newline separator needs no chr(10) workaround.
context_block = "\n".join(f"- {doc.page_content}" for doc in retrieved_chunks)

# Prompt: the question, the retrieved context, and an instruction to answer
# only from that context.
combined_input = f"""Based on the following documents, please answer this question: {test_query}

Documents:
{context_block}

Please provide a clear, helpful answer using only the information from these documents. If you can't find the answer in the documents, say "I don't have enough information to answer that question based on the provided documents."
"""

# Fail fast with a clear message when the key is missing. Passing api_key=None
# would otherwise surface later as a less specific client error, or the client
# may fall back to another credential from the environment.
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your environment or .env file."
    )

# Create a ChatOpenAI model pointed at the OpenRouter endpoint
model = ChatOpenAI(
    api_key=api_key,
    base_url="https://openrouter.ai/api/v1",
    model="openai/gpt-4o-mini",
    temperature=0.3
)

# Define the messages for the model
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content=combined_input),
]

# Invoke the model with the combined input
result = model.invoke(messages)

# Display only the text content of the response
print("\n--- Generated Response ---")
print("Content only:")
print(result.content)


--- Generated Response ---
Content only:
The Tesla Cybertruck production ramp is set to begin in 2024, and the reservations for the Cybertruck have exceeded the company's expectations. Tesla continues to lead in electric vehicles and has reported strong quarterly results, along with announcements regarding new manufacturing developments.
