<a href="https://colab.research.google.com/github/harkirat-singh2/GenAI/blob/master/Langchain_retrievers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wikipedia Retriever


In [None]:
!pip install -U langchain langchain-huggingface sentence-transformers langchain_community




In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers embedding model, loaded through the LangChain
# Hugging Face integration.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from langchain_community.retrievers import WikipediaRetriever



In [None]:
!pip install -U langchain langchain-community wikipedia





In [None]:
# Retriever backed by English-language Wikipedia, configured with
# top_k_results=2.
retriever = WikipediaRetriever(
    lang="en",
    top_k_results=2,
)

In [None]:
# Free-text query sent to the Wikipedia retriever; `doc` receives the
# retriever's results (iterated by the cell that prints them).
query = ' The geopolitical history of India and Pakistan From the point of view of Chinese'
doc = retriever.invoke(
    query
)

In [None]:
# Print a preview of each retrieved article.
#
# The loop variable is intentionally NOT named `doc`: the result list itself is
# called `doc`, and rebinding that name inside the loop would leave it pointing
# at a single Document, so re-running this cell would fail.
PREVIEW_CHARS = 500  # maximum number of characters shown per result

for i, wiki_doc in enumerate(doc):
  print(f"\n --- Result {i+1} --- ")
  # Slice the content so the output is actually truncated before the " ... ".
  print(f"Content:\n{wiki_doc.page_content[:PREVIEW_CHARS]} ... ")


 --- Result 1 --- 
Content:
The India–Pakistan war of 1965, also known as the second India–Pakistan war, was an armed conflict between Pakistan and India that took place from August 1965 to September 1965.
The conflict began following Pakistan's unsuccessful Operation Gibraltar, which was designed to infiltrate forces into Jammu and Kashmir to precipitate an insurgency against Indian rule. The seventeen day war caused thousands of casualties on both sides and witnessed the largest engagement of armoured vehicles and the largest tank battle since World War II. Hostilities between the two countries ended after a ceasefire was declared through UNSC Resolution 211 following a diplomatic intervention by the Soviet Union and the United States, and the subsequent issuance of the Tashkent Declaration. Much of the war was fought by the countries' land forces in Kashmir and along the border between India and Pakistan. This war saw the largest amassing of troops in Kashmir since the Partition of

# Vector Store Retriever


In [None]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

In [None]:
# Step 1: source documents — one Document per sentence, in the order listed.
documents = [
    Document(page_content=text)
    for text in (
        "LangChain helps developers build LLM applications easily.",
        "Chroma is a vector database optimized for LLM-based search.",
        "Embeddings convert text into high-dimensional vectors.",
        "OpenAI provides powerful embedding models.",
    )
]

In [None]:
# Embedding model used to vectorize the documents for the vector store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
!pip install chromadb
from typing import Collection
from langchain_community.vectorstores import chroma

vector_store = Chroma.from_documents(documents=documents, embedding = embedding_model, collection_name="my_collection")




In [None]:
# Wrap the vector store in the retriever interface, passing k=2 as the
# search setting.
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=2),
)

In [None]:
# Run a sample query through the retriever and list the returned documents.
query = "What is Chroma used for?"
results = retriever.invoke(query)

for rank, hit in enumerate(results, start=1):
  print(f"\n --- Result {rank} --- ")
  print(hit.page_content)


 --- Result 1 --- 
Chroma is a vector database optimized for LLM-based search.

 --- Result 2 --- 
LangChain helps developers build LLM applications easily.


# MMR (Maximal Marginal Relevance) Retriever

In [None]:
# Sample corpus for the MMR demo. Two of the sentences describe LangChain in
# very similar terms; the others cover neighbouring topics.
docs = [
    Document(page_content=sentence)
    for sentence in (
        "LangChain makes it easy to work with LLMs.",
        "LangChain is used to build LLM based applications.",
        "Chroma is used to store and search document embeddings.",
        "Embeddings are vector representations of text.",
        "MMR helps ou get diverse results when doing similarity search.",
        "LangChain supports Chroma, FAISS, Pinecone, and more.",
    )
]

In [None]:
from langchain_community.vectorstores import FAISS
!pip install faiss-cpu


# Initialize OpenAI embeddings
embedding_model = HuggingFaceEmbeddings()

# Step 2: Create the FAISS vector store from documents
vectorstore = FAISS. from_documents(
documents=docs,
embedding=embedding_model

)

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [None]:
# Build a retriever that uses MMR search instead of the default search type.
#   k           -> number of results returned
#   lambda_mult -> relevance-diversity balance
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=3, lambda_mult=0.5),
    search_type="mmr",
)

In [None]:
# Query the MMR retriever and print every returned document in rank order.
query = "What is langchain?"
results = retriever.invoke(query)

for rank, hit in enumerate(results, start=1):
  print(f"\n --- Result {rank} --- ")
  print(hit.page_content)


 --- Result 1 --- 
LangChain is used to build LLM based applications.

 --- Result 2 --- 
Embeddings are vector representations of text.

 --- Result 3 --- 
LangChain supports Chroma, FAISS, Pinecone, and more.


# MultiQuery Retriever

In [None]:
!pip install -U transformers accelerate sentence-transformers faiss-cpu \
langchain langchain-community langchain-classic langchain-huggingface



Collecting transformers
  Using cached transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-huggingface
  Using cached langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is still looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
  Downloading langchain_huggingface

In [None]:
# ===============================
# IMPORTS
# ===============================
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers import MultiQueryRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
import torch

# ===============================
# STEP 1: Create Sample Documents
# ===============================
# Each pair is (source tag, text). The H* texts are health-themed; the I* texts
# come from other topics. The tag is stored under metadata["source"].
all_docs = [
    Document(page_content=text, metadata={"source": source})
    for source, text in (
        ("H1", "Regular walking boosts heart health and can reduce symptoms of depression."),
        ("H2", "Consuming leafy greens and fruits helps detox the body and improve longevity."),
        ("H3", "Deep sleep is crucial for cellular repair and emotional regulation."),
        ("H4", "Mindfulness and controlled breathing lower cortisol and improve mental clarity."),
        ("H5", "Drinking sufficient water throughout the day helps maintain metabolism and energy."),
        ("I1", "The solar energy system in modern homes helps balance electricity demand."),
        ("I2", "Python balances readability with power, making it a popular system design language."),
        ("I3", "Photosynthesis enables plants to produce energy by converting sunlight."),
        ("I4", "The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement."),
        ("I5", "Black holes bend spacetime and store immense gravitational energy."),
    )
]

# ===============================
# STEP 2: Create Embeddings
# ===============================
# Embeddings use the library's default model (no model_name is given).
embedding_model = HuggingFaceEmbeddings()

# Index the sample documents in FAISS.
vectorstore = FAISS.from_documents(documents=all_docs, embedding=embedding_model)

# Base retriever that the MultiQuery retriever will wrap; search setting k=2.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=2),
)

# ===============================
# STEP 3: Load HuggingFace LLM
# ===============================
# Local transformers pipeline running google/flan-t5-base. It is placed on
# device 0 when CUDA is available and on -1 otherwise.
# NOTE(review): the install log shows transformers 5.x being resolved — confirm
# that the "text2text-generation" task is still offered by that release.
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    device=0 if torch.cuda.is_available() else -1,
    max_new_tokens=150,
)

# Wrap the pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# ===============================
# STEP 4: MultiQuery Retriever
# ===============================
# Combine the LLM with the base FAISS retriever into a MultiQueryRetriever.
multi_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=retriever,
)

# ===============================
# STEP 5: Ask Question
# ===============================
question = "How can I improve my energy naturally?"

# Run the question through the MultiQuery retriever.
retrieved_docs = multi_retriever.invoke(question)

# List the text of every retrieved document, one per line.
print("🔎 Retrieved Documents:\n")
for doc in retrieved_docs:
    print("-", doc.page_content)

# ===============================
# STEP 6: Generate Final Answer
# ===============================
# Concatenate the retrieved texts, one per line, to form the prompt context.
context = "\n".join(doc.page_content for doc in retrieved_docs)

# Prompt with two placeholders: {context} and {question}.
prompt = ChatPromptTemplate.from_template(
    """Answer the question based only on the context below.

Context:
{context}

Question:
{question}
"""
)

# prompt -> LLM -> plain string output.
chain = prompt | llm | StrOutputParser()

final_answer = chain.invoke(dict(context=context, question=question))

print("\n💡 Final Answer:\n")
print(final_answer)


