In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
# pip install langchain langchain_core langchain_openai langchain_community faiss-cpu

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document



In [14]:
naruto_docs = [
    Document(
        page_content="Naruto dreams of becoming Hokage.\nHe has an unyielding spirit.",
        metadata={"anime": "Naruto", "character": "Naruto Uzumaki"}
    ),
    Document(
        page_content="Sasuke seeks revenge against his brother.\nHe is a skilled Sharingan user.",
        metadata={"anime": "Naruto", "character": "Sasuke Uchiha"}
    ),
    Document(
        page_content="Sakura is determined to prove her strength.\nShe becomes a powerful medical ninja.",
        metadata={"anime": "Naruto", "character": "Sakura Haruno"}
    ),
    Document(
        page_content="Kakashi is known as the Copy Ninja.\nHe mentors Team 7 with wisdom.",
        metadata={"anime": "Naruto", "character": "Kakashi Hatake"}
    ),
    Document(
        page_content="Itachi is a prodigy of the Uchiha clan.\nHe hides his true intentions.",
        metadata={"anime": "Naruto", "character": "Itachi Uchiha"}
    ),
    Document(
        page_content="Hinata is gentle and kind.\nShe admires Naruto deeply.",
        metadata={"anime": "Naruto", "character": "Hinata Hyuga"}
    ),
    Document(
        page_content="Neji is a genius of the Hyuga clan.\nHe believes in destiny but changes his view.",
        metadata={"anime": "Naruto", "character": "Neji Hyuga"}
    ),
    Document(
        page_content="Shikamaru is a brilliant strategist.\nHe often calls things troublesome.",
        metadata={"anime": "Naruto", "character": "Shikamaru Nara"}
    ),
    Document(
        page_content="Choji loves food and values friendship.\nHe uses expansion jutsu in battle.",
        metadata={"anime": "Naruto", "character": "Choji Akimichi"}
    ),
    Document(
        page_content="Ino is confident and competitive.\nShe uses mind transfer techniques.",
        metadata={"anime": "Naruto", "character": "Ino Yamanaka"}
    ),
    Document(
        page_content="Rock Lee cannot use ninjutsu.\nHe masters taijutsu through hard work.",
        metadata={"anime": "Naruto", "character": "Rock Lee"}
    ),
    Document(
        page_content="Gaara was once consumed by hatred.\nHe becomes the Kazekage of his village.",
        metadata={"anime": "Naruto", "character": "Gaara"}
    ),
    Document(
        page_content="Jiraiya is one of the Legendary Sannin.\nHe trains Naruto and writes novels.",
        metadata={"anime": "Naruto", "character": "Jiraiya"}
    ),
    Document(
        page_content="Tsunade is the Fifth Hokage.\nShe excels in medical ninjutsu and strength.",
        metadata={"anime": "Naruto", "character": "Tsunade"}
    ),
    Document(
        page_content="Orochimaru seeks immortality.\nHe performs forbidden experiments.",
        metadata={"anime": "Naruto", "character": "Orochimaru"}
    ),
    Document(
        page_content="Minato is the Fourth Hokage.\nHe is Naruto's father and a sealing master.",
        metadata={"anime": "Naruto", "character": "Minato Namikaze"}
    ),
    Document(
        page_content="Kushina is strong-willed and caring.\nShe is Naruto's mother and a former jinchuriki.",
        metadata={"anime": "Naruto", "character": "Kushina Uzumaki"}
    ),
    Document(
        page_content="Madara is a legendary Uchiha.\nHe aims to cast the Infinite Tsukuyomi.",
        metadata={"anime": "Naruto", "character": "Madara Uchiha"}
    ),
    Document(
        page_content="Obito was once kind-hearted.\nHe falls into darkness after tragedy.",
        metadata={"anime": "Naruto", "character": "Obito Uchiha"}
    ),
    Document(
        page_content="Nagato controls the Six Paths of Pain.\nHe seeks peace through painful means.",
        metadata={"anime": "Naruto", "character": "Nagato (Pain)"}
    ),
]

In [38]:
api_key='your_api_key'

In [39]:
embedding_model = OpenAIEmbeddings(api_key=api_key)

# FAISS Vector store

In [40]:
vector_store = FAISS.from_documents(
     documents =  naruto_docs,
    embedding=embedding_model
)

# 1. MMR Retriever

In [83]:
         #         ┌───────────────────────────┐
         #         │        User Query         │
         #         └─────────────┬─────────────┘
         #                       │
         #                       ▼
         #         ┌───────────────────────────┐
         #         │   Retriever (Vector DB)   │
         #         │  Fetch top-K candidate    │
         #         │  documents (e.g., 50)     │
         #         └─────────────┬─────────────┘
         #                       │
         #                       ▼
         # ┌─────────────────────────────────────────┐
         # │     MMR Scoring Process (Diversity)     │
         # │-----------------------------------------│
         # │  Step 1: Select doc most relevant       │
         # │  Step 2: Re-rank remaining docs based   │
         # │          on balance of:                 │
         # │          - Relevance to query           │
         # │          - Novelty vs already selected  │
         # │  Step 3: Iteratively select next doc    │
         # └───────────────────────┬─────────────────┘
         #                         │
         #                         ▼
         #         ┌───────────────────────────┐
         #         │   Final Set of Documents  │
         #         │  (Diverse + Relevant)     │
         #         └───────────────────────────┘


# 1.1 executing retriever

In [101]:
retriever = vector_store.as_retriever(
    search_type='mmr',
    search_kwargs={"k":5, "lambda_mult":1} # k = no. of outputs, lambda_mult = relevance - diversity balance [0, 1] 0: most diversed, 1 least diversed
)

In [102]:
query="Who is Naruto?"

results = retriever.invoke(query)

for i, doc in enumerate(results):
    print(f"---------- Result : {i+1} ----------")
    print(doc.page_content)

---------- Result : 1 ----------
Jiraiya is one of the Legendary Sannin.
He trains Naruto and writes novels.
---------- Result : 2 ----------
Minato is the Fourth Hokage.
He is Naruto's father and a sealing master.
---------- Result : 3 ----------
Naruto dreams of becoming Hokage.
He has an unyielding spirit.
---------- Result : 4 ----------
Naruto dreams of becoming Hokage.
He has an unyielding spirit.
---------- Result : 5 ----------
Kushina is strong-willed and caring.
She is Naruto's mother and a former jinchuriki.


In [None]:
# output

# ---------- Result : 1 ----------
# Jiraiya is one of the Legendary Sannin.
# He trains Naruto and writes novels.
# ---------- Result : 2 ----------
# Ino is confident and competitive.
# She uses mind transfer techniques.
# ---------- Result : 3 ----------
# Nagato controls the Six Paths of Pain.
# He seeks peace through painful means.

# 2. Multi Query Retriever

In [82]:
#                  ┌───────────────────────────┐
#                  │        User Query         │
#                  └─────────────┬─────────────┘
#                                │
#                                ▼
#                  ┌───────────────────────────┐
#                  │   Query Rewriter (LLM)    │
#                  │ Generates multiple queries│
#                  │   with same intent        │
#                  └─────────────┬─────────────┘
#                                │
#        ┌───────────────────────┼────────────────────────┐
#        ▼                       ▼                        ▼
# ┌──────────────┐       ┌──────────────┐         ┌──────────────┐
# │ Query 1      │       │ Query 2      │   ...   │ Query N      │
# │ (variation)  │       │ (variation)  │         │ (variation)  │
# └───────┬──────┘       └───────┬──────┘         └───────┬──────┘
#         │                      │                          │
#         ▼                      ▼                          ▼
# ┌──────────────┐       ┌──────────────┐         ┌──────────────┐
# │ Retriever    │       │ Retriever    │         │ Retriever    │
# │ (Vector DB)  │       │ (Vector DB)  │         │ (Vector DB)  │
# └───────┬──────┘       └───────┬──────┘         └───────┬──────┘
#         │                      │                          │
#         └──────────────┬───────┴───────────┬──────────────┘
#                        ▼                   ▼
#              ┌─────────────────────────────────────┐
#              │   Aggregate + Deduplicate Results   │
#              └─────────────────┬───────────────────┘
#                                │
#                                ▼
#                  ┌───────────────────────────┐
#                  │   Final Retrieved Docs    │
#                  └───────────────────────────┘


In [36]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

In [33]:
anime_docs_langchain = [
    Document(
        page_content="Naruto dreams of becoming Hokage.\nHe has an unyielding spirit.",
        metadata={"anime": "Naruto", "character": "Naruto Uzumaki"}
    ),
    Document(
        page_content="Luffy seeks the legendary One Piece.\nHe has a rubber body from a Devil Fruit.",
        metadata={"anime": "One Piece", "character": "Monkey D. Luffy"}
    ),
    Document(
        page_content="Light discovers the Death Note.\nHe aims to create a new world of justice.",
        metadata={"anime": "Death Note", "character": "Light Yagami"}
    ),
    Document(
        page_content="Eren vows to eradicate all Titans.\nHe possesses the power of the Attack Titan.",
        metadata={"anime": "Attack on Titan", "character": "Eren Yeager"}
    ),
    Document(
        page_content="Ichigo becomes a Substitute Soul Reaper.\nHe protects both humans and spirits.",
        metadata={"anime": "Bleach", "character": "Ichigo Kurosaki"}
    ),
    Document(
        page_content="Goku is a Saiyan raised on Earth.\nHe strives to become the strongest fighter.",
        metadata={"anime": "Dragon Ball Z", "character": "Goku"}
    )
]

**adding more documents to the existing vector store**

In [41]:
vector_store.add_documents(anime_docs_langchain)

['964349f0-192c-4422-aeeb-36129f473587',
 'd00358fc-f840-442b-a8bc-3d01a608fb18',
 '336b3b4f-4122-474d-a2d1-49e49f7472a4',
 'a8fc720a-9e82-427a-a6aa-b7d007afc9fb',
 '38c837c0-e1ae-46f9-b14f-3051f1922111',
 'fa207319-956e-4755-a3c5-e3a8f9ed1e93']

# 2.1 basic similarity_retriever

In [55]:
similarity_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k":5}
)

# 2.2 multi-query retriever

In [56]:
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(
        search_kwargs={"k":5}
    ),
    llm=ChatOpenAI(api_key=api_key)
)

In [57]:
# ambigous query and not direct linked to the documents in the vector store
query = "which character has leadershp spirit?"

In [58]:
similarity_result = similarity_retriever.invoke(query)
multiquery_result = multiquery_retriever.invoke(query)

In [59]:
print("Similariy result")
for i, doc in enumerate(similarity_result):
    print(f"--- Result {i} ----")
    print(doc)


print("MultiQuery result")
for i, doc in enumerate(multiquery_result):
    print(f"--- Result {i} ----")
    print(doc)

Similariy result
--- Result 0 ----
page_content='Naruto dreams of becoming Hokage.
He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
--- Result 1 ----
page_content='Naruto dreams of becoming Hokage.
He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
--- Result 2 ----
page_content='Ichigo becomes a Substitute Soul Reaper.
He protects both humans and spirits.' metadata={'anime': 'Bleach', 'character': 'Ichigo Kurosaki'}
--- Result 3 ----
page_content='Kakashi is known as the Copy Ninja.
He mentors Team 7 with wisdom.' metadata={'anime': 'Naruto', 'character': 'Kakashi Hatake'}
--- Result 4 ----
page_content='Goku is a Saiyan raised on Earth.
He strives to become the strongest fighter.' metadata={'anime': 'Dragon Ball Z', 'character': 'Goku'}
MultiQuery result
--- Result 0 ----
page_content='Naruto dreams of becoming Hokage.
He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki

In [None]:
# output

# Similariy result
# --- Result 0 ----
# page_content='Naruto dreams of becoming Hokage.
# He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
# --- Result 1 ----
# page_content='Naruto dreams of becoming Hokage.
# He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
# --- Result 2 ----
# page_content='Ichigo becomes a Substitute Soul Reaper.
# He protects both humans and spirits.' metadata={'anime': 'Bleach', 'character': 'Ichigo Kurosaki'}
# --- Result 3 ----
# page_content='Kakashi is known as the Copy Ninja.
# He mentors Team 7 with wisdom.' metadata={'anime': 'Naruto', 'character': 'Kakashi Hatake'}
# --- Result 4 ----
# page_content='Goku is a Saiyan raised on Earth.
# He strives to become the strongest fighter.' metadata={'anime': 'Dragon Ball Z', 'character': 'Goku'}
# MultiQuery result
# --- Result 0 ----
# page_content='Naruto dreams of becoming Hokage.
# He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
# --- Result 1 ----
# page_content='Naruto dreams of becoming Hokage.
# He has an unyielding spirit.' metadata={'anime': 'Naruto', 'character': 'Naruto Uzumaki'}
# --- Result 2 ----
# page_content='Shikamaru is a brilliant strategist.
# He often calls things troublesome.' metadata={'anime': 'Naruto', 'character': 'Shikamaru Nara'}
# --- Result 3 ----
# page_content='Sakura is determined to prove her strength.
# She becomes a powerful medical ninja.' metadata={'anime': 'Naruto', 'character': 'Sakura Haruno'}
# --- Result 4 ----
# page_content='Kushina is strong-willed and caring.
# She is Naruto's mother and a former jinchuriki.' metadata={'anime': 'Naruto', 'character': 'Kushina Uzumaki'}
# --- Result 5 ----
# page_content='Ichigo becomes a Substitute Soul Reaper.
# He protects both humans and spirits.' metadata={'anime': 'Bleach', 'character': 'Ichigo Kurosaki'}
# --- Result 6 ----
# page_content='Ino is confident and competitive.
# She uses mind transfer techniques.' metadata={'anime': 'Naruto', 'character': 'Ino Yamanaka'}
# --- Result 7 ----
# page_content='Kakashi is known as the Copy Ninja.
# He mentors Team 7 with wisdom.' metadata={'anime': 'Naruto', 'character': 'Kakashi Hatake'}

# 3. Contextual Compression Retriever

In [81]:
   # ┌─────────────────┐
   # │  Vector Store   │  (many documents)
   # └───────┬─────────┘
   #         │
   #         ▼
   # ┌─────────────────┐
   # │ Base Retriever  │   → fetches top-N documents (e.g., 20)
   # └───────┬─────────┘
   #         │
   #         ▼
   # ┌─────────────────────────────┐
   # │ Document Compressor         │   → summarizes / filters / ranks docs
   # │ (e.g., LLMChain, Filter,    │
   # │  Embedding Reranker, etc.)  │
   # └───────┬─────────────────────┘
   #         │
   #         ▼
   # ┌──────────────────────────┐
   # │ Compression Retriever    │   → returns compressed set (e.g., top-5)
   # └──────────────────────────┘


In [61]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# 3.1 setup base retriever and llm and compressor

In [84]:
# setup base retriever and llm and compressor to be used after document extract
base_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

llm = ChatOpenAI(api_key=api_key)

compressor =  LLMChainExtractor.from_llm(llm)

# 3.2 initialising compression retriever

In [103]:
# compression_retriever = ContextualCompressionRetriever(
#     base_retriever=base_retriever,
#     base_compressor=compressor
# )

# we can use mmr as base retriever
compression_retriever_2 = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor
)


In [104]:
query = "List all the characters's"

compression_results = compression_retriever_2.invoke(query)

In [99]:
print(compression_results)

[Document(metadata={'anime': 'Dragon Ball Z', 'character': 'Goku'}, page_content='Goku is a Saiyan raised on Earth.\nHe strives to become the strongest fighter.'), Document(metadata={'anime': 'Naruto', 'character': 'Ino Yamanaka'}, page_content='Ino is confident and competitive.\nShe uses mind transfer techniques.')]


In [105]:
for i, doc in enumerate(compression_results):
    print(f"--- Result {i} ----")
    print(doc)

--- Result 0 ----
page_content='Goku is a Saiyan raised on Earth.' metadata={'anime': 'Dragon Ball Z', 'character': 'Goku'}
--- Result 1 ----
page_content='Choji loves food and values friendship.
He uses expansion jutsu in battle.' metadata={'anime': 'Naruto', 'character': 'Choji Akimichi'}
--- Result 2 ----
page_content='Luffy seeks the legendary One Piece.
He has a rubber body from a Devil Fruit.' metadata={'anime': 'One Piece', 'character': 'Monkey D. Luffy'}


In [None]:
# output 

# --- Result 0 ----
# page_content='Kazekage' metadata={'anime': 'Naruto', 'character': 'Gaara'}
# --- Result 1 ----
# page_content='Fifth Hokage' metadata={'anime': 'Naruto', 'character': 'Tsunade'}