# Vector Store Retriever 

In [17]:
# Import libraries

from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv

In [18]:
load_dotenv()

True

In [5]:
# Step 1: Your Source Documents

documents = [
    Document(page_content = "Langchain helps developers build LLM applications easily"),
    Document(page_content = "Chroma is a vector database optimized for LLM-based search"),
    Document(page_content = 'Embeddings convert text into high-dimensional vectors'),
    Document(page_content = 'GenAI provides powerful embedding models.'),
]

In [13]:
# Step 2 : Intialize embedding model

embedding_model = GoogleGenerativeAIEmbeddings(model = 'models/gemini-embedding-001')

In [14]:
# Step 3 : Create Chroma vector store in memory

vectorstore = Chroma.from_documents(
    documents = documents,
    embedding = embedding_model,
    collection_name = 'my_collection'
)

In [16]:
# Step 4: Convert vectorstore into a retriever

retriever = vectorstore.as_retriever(search_kwargs={"k":2})

In [17]:
query  =  "What is chroma used for"
results = retriever.invoke(query)

In [18]:
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1}---")
    print(doc.page_content)


--- Result 1---
Chroma is a vector database optimized for LLM-based search

--- Result 2---
GenAI provides powerful embedding models.


### we get different different search strategy implimentation too here

## Maximum Marginal Relevance (MMR)

1. "**How can we pick results that are not only relevant to the query but also different from each other**"
2. MMR is an information retieval algorithm designed to reduce redundancy in the retrieved results while maintaining high relevance to the query.

### Why MMR Retriever

In regular similarity search you may get documents that are 
1. All very similar to each other.
2. Repeating the same info
3. Lacking deliver perspectives

MMR Review avoids that by
1.  Picking the most relevant document first
2. Then picking the **most relevant and least similar** to already docs
3. and so on

This helps especially in RAG pipelines where
1. You want your context window to contain diverse but still relevant information
2. Especially useful when documents are **semantically overlapping**.


## Practical in Python (MMR)

In [5]:
from langchain_core.documents import Document

In [6]:
# sample documents

docs = [
    Document(page_content = "Langchain makes it easy to work with LLMs."),
    Document(page_content = "Langchain is used to buld LLM based applications"),
    Document(page_content = "Chroma is used to store and search document embeddings"),
    Document(page_content = 'Embeddings are vector represetation of text.'),
    Document(page_content = 'Embeddings are vector representations of text.'),
    Document(page_content = "MMR helps you get diverse results when doing similarity search."),
    Document(page_content = "Langchain supports Chroma, FAISS, Pinecone, and more.")

]

In [7]:
from langchain_community.vectorstores import FAISS

In [19]:
# Initialize GoogleGenAI embeddings

embedding_model = GoogleGenerativeAIEmbeddings(
    model = 'models/gemini-embedding-001'
)

In [20]:
# Step 2 -> Create the FAISS store from documents

vectorstore = FAISS.from_documents(
    documents = docs,
    embedding= embedding_model
)

In [25]:
# Step 3 -> Enable MMR in the retriever
retriever = vectorstore.as_retriever(
    search_type = 'mmr',
    search_kwargs = {'k' : 3, 'lambda_mult': 1} # k = top results, lambda_mult = relevance-diversity balance
) # less the lamba_mult, more the diversity

In [23]:
query = "What is langchain ?"
results = retriever.invoke(query)

In [24]:
for i, doc in enumerate(results):
    print(f"\n --- Result{i+1} ---")
    print(doc.page_content)


 --- Result1 ---
Langchain is used to buld LLM based applications

 --- Result2 ---
Langchain makes it easy to work with LLMs.

 --- Result3 ---
Langchain supports Chroma, FAISS, Pinecone, and more.


# 3. Multi-Query Retriever

#### Sometimes a single query might not capture all the ways information in phrased in your documents

For Example:

Query :
"How can i stay health"

Could mean 

. What should i eat 