#### **Agentic RAG application using LangGraph**




In [None]:
pip install langchain_community langchain langchain_huggingface langchain-core langgraph langchain_google_genai pypdf faiss-cpu



In [None]:
# Resolve the Gemini API key. Inside Google Colab it comes from the notebook's
# secret store; anywhere else `google.colab` is not importable, so fall back to
# the GOOGLE_API_KEY environment variable instead of crashing on import.
try:
    from google.colab import userdata
except ImportError:
    import os

    api_key = os.environ.get("GOOGLE_API_KEY")
else:
    api_key = userdata.get('GOOGLE_API_KEY')

In [25]:
import os
from typing import TypedDict, List, Literal
from langchain_core.messages import HumanMessage, AIMessage
from langgraph.graph import StateGraph, START, END
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document


# Configuration
# Directory scanned for PDFs when a request does not supply its own
# pdf_directory (an empty value falls back to this one).
DEFAULT_PDF_DIRECTORY = "/content/data"
# On-disk location where the FAISS index is saved and later reloaded from.
FAISS_INDEX_PATH = "./faiss_index"


# Enhanced State Definition
class AgentState(TypedDict):
    """State dictionary that flows through the LangGraph workflow.

    ``chat`` supplies every key when it invokes the graph; each node returns
    new values for ``response``, ``messages`` and ``source_documents``.
    """
    # Conversation so far (HumanMessage / AIMessage objects); the last entry
    # is the query being answered.
    messages: list
    # Routing switch read by route_query: "rag" selects the RAG node, any
    # other value selects the direct LLM node.
    mode: str
    # Directory of PDFs used when a new FAISS index has to be built; an empty
    # string falls back to DEFAULT_PDF_DIRECTORY.
    pdf_directory: str
    # Text of the model's answer for the current turn.
    response: str
    # Documents retrieved for the answer; the direct LLM node always sets [].
    source_documents: List[Document]


# Initialize LLM with streaming enabled
# Single chat-model instance shared by both graph nodes.
# NOTE(review): the nodes in this file only call llm.invoke() and read
# `.content` from the result, so nothing visible here consumes a token stream
# - confirm whether streaming=True is still needed.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0.3,
    api_key=api_key,
    #max_output_tokens= 1000,
    streaming=True
)


# Initialize embeddings globally
# The same embedding object is passed both when a FAISS index is created and
# when a saved one is loaded, so queries are embedded by the same model name
# that was used for the documents.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Node 1: Direct LLM
def direct_llm_node(state: AgentState):
    """Answer the latest message straight from the LLM, without retrieval.

    The whole message list in ``state["messages"]`` is sent to the model so it
    can see earlier turns.

    Returns:
        A state update holding the reply text, the message list extended with
        the reply as an ``AIMessage``, and an empty ``source_documents`` list.
    """
    history = state["messages"]
    reply_text = llm.invoke(history).content

    update = {"response": reply_text}
    # Build a new list rather than appending, leaving the incoming one untouched.
    update["messages"] = history + [AIMessage(content=reply_text)]
    update["source_documents"] = []
    return update


# Node 2: RAG Pipeline with conversation history
def _load_or_build_vectorstore(pdf_directory: str):
    """Return the FAISS store at FAISS_INDEX_PATH, building it from PDFs if absent."""
    if os.path.exists(FAISS_INDEX_PATH):
        print("Loading existing FAISS index...")
        # SECURITY: allow_dangerous_deserialization=True makes load_local
        # unpickle data stored next to the index. That is only acceptable for
        # an index this notebook wrote itself; never point FAISS_INDEX_PATH at
        # files from an untrusted source.
        return FAISS.load_local(
            FAISS_INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print(f"Creating new FAISS index from directory: {pdf_directory}")

    loader = PyPDFDirectoryLoader(
        path=pdf_directory,
        glob="**/*.pdf",  # Load all PDFs recursively
        recursive=True,  # Search subdirectories
        silent_errors=False,  # Show errors if any
    )
    documents = loader.load()
    print(f" Loaded {len(documents)} pages from PDF files in {pdf_directory}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
    chunks = splitter.split_documents(documents)
    print(f" Created {len(chunks)} chunks from documents")

    # Building an index from zero chunks fails deep inside FAISS with an error
    # that does not mention the real cause, so stop here with a clear message
    # (and before anything is written to disk).
    if not chunks:
        raise ValueError(
            f"No text chunks could be extracted from PDFs in '{pdf_directory}'; "
            "check that the directory exists and contains readable PDF files."
        )

    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(FAISS_INDEX_PATH)
    print(f" FAISS index saved to {FAISS_INDEX_PATH}")
    return vectorstore


def _format_history(messages: list) -> str:
    """Render earlier human/AI messages as 'User:' / 'Assistant:' lines."""
    lines = []
    for msg in messages:
        if isinstance(msg, HumanMessage):
            lines.append(f"User: {msg.content}")
        elif isinstance(msg, AIMessage):
            lines.append(f"Assistant: {msg.content}")
    return "\n".join(lines) if lines else "No previous conversation."


def rag_pipeline_node(state: AgentState):
    """Answer the latest message using retrieved PDF context plus chat history.

    Steps: load the saved FAISS index (or build and save one from the PDF
    directory), retrieve the 5 chunks closest to the current query, and ask
    the LLM to answer from those chunks and the earlier conversation turns.

    NOTE: an existing index at FAISS_INDEX_PATH is reused as-is, so
    ``pdf_directory`` only matters when no index exists yet. Call
    ``reset_faiss_index()`` first to index a different directory.

    Args:
        state: Graph state; ``messages`` must end with the current query.

    Returns:
        A state update holding the reply text, the message list extended with
        the reply as an ``AIMessage``, and the retrieved source documents.

    Raises:
        ValueError: If a new index must be built but the PDF directory yields
            no text chunks.
    """
    messages = state["messages"]
    current_query = messages[-1].content

    # A missing or empty pdf_directory both fall back to the default.
    pdf_directory = state.get("pdf_directory") or DEFAULT_PDF_DIRECTORY

    vectorstore = _load_or_build_vectorstore(pdf_directory)

    # Retrieval
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.invoke(current_query)
    context = "\n\n".join(doc.page_content for doc in relevant_docs)

    # The current query is shown separately in the prompt, so leave it out of
    # the history section.
    history_text = _format_history(messages[:-1])

    # Enhanced prompt with conversation history
    prompt = f"""Based on the conversation history and context below, answer the current question.

    Conversation History:
    {history_text}

    Context from documents:
    {context}

    Current Question: {current_query}

    Answer (be specific and helpful):"""

    response = llm.invoke(prompt)

    return {
        "response": response.content,
        "messages": messages + [AIMessage(content=response.content)],
        "source_documents": relevant_docs,
    }


# Routing Function
def route_query(state: AgentState) -> Literal["direct_llm", "rag_pipeline"]:
    """Choose the node for this turn: the RAG node only when mode is exactly "rag"."""
    return "rag_pipeline" if state["mode"] == "rag" else "direct_llm"


# Build Graph
# Two-branch workflow: START dispatches to exactly one node, and that node
# then ends the run.
workflow = StateGraph(AgentState)
# Nodes are registered before the edges below refer to them by name.
workflow.add_node("direct_llm", direct_llm_node)
workflow.add_node("rag_pipeline", rag_pipeline_node)

# route_query returns a node name; the mapping translates that return value
# into the node to run next (here each name maps to itself).
workflow.add_conditional_edges(
    START,
    route_query,
    {
        "direct_llm": "direct_llm",
        "rag_pipeline": "rag_pipeline"
    }
)

# Either branch finishes the graph after a single node.
workflow.add_edge("direct_llm", END)
workflow.add_edge("rag_pipeline", END)

# Compile
# `app` is the object chat() calls via app.invoke(...).
app = workflow.compile()


#  CHAT FUNCTION
# Module-level transcript shared by every chat() call.
conversation_history = []


def chat(query: str, mode: str = "rag", pdf_directory: str = ""):
    """
    Chat function that maintains conversation history

    The user message and the model's reply are recorded in the shared
    history only after the graph call succeeds. If the call raises, the
    history is left exactly as it was, so a failed turn cannot leave an
    unanswered user message behind to confuse later turns.

    Args:
        query: User's question
        mode: "rag" or "direct"
        pdf_directory: Path to directory containing PDFs (empty for default)

    Returns:
        dict with response and source_documents
    """
    user_message = HumanMessage(content=query)

    # Hand the graph a fresh list (history + current query) instead of the
    # global list itself, so graph state never aliases the shared transcript.
    result = app.invoke({
        "messages": conversation_history + [user_message],
        "mode": mode,
        "pdf_directory": pdf_directory,
        "response": "",
        "source_documents": [],
    })

    # Commit the completed turn: question first, then the answer.
    conversation_history.extend(
        [user_message, AIMessage(content=result["response"])]
    )

    return result


def reset_conversation():
    """Begin a new chat session by replacing the stored history with an empty list."""
    global conversation_history
    # Rebinds the module-level name to a brand-new list.
    conversation_history = list()
    print(" Conversation history cleared!")


def reset_faiss_index():
    """Remove the saved FAISS index so the next RAG query builds a new one from the PDFs."""
    import shutil

    # Nothing on disk: report it and leave.
    if not os.path.exists(FAISS_INDEX_PATH):
        print(" No FAISS index found to delete")
        return

    shutil.rmtree(FAISS_INDEX_PATH)
    print(f" FAISS index deleted from {FAISS_INDEX_PATH}")


In [34]:
# Example 1 - plain LLM call, no document retrieval

print("Example 1: Direct LLM Mode")

result0 = chat("What is Generative AI", mode="direct")

# Write the answer on the "Response:" line in one call (each element of the
# response is printed back-to-back), followed by a blank line.
print("\n Response: ", *result0["response"], "\n", sep="", flush=True)

Example 1: Direct LLM Mode

 Response: Of course! Let's cover that. We discussed it at the beginning of our conversation, but it's a foundational topic worth revisiting.

### The Simple Analogy

Imagine you're teaching a child about dogs.

*   **Traditional AI:** You show them 1,000 photos and ask, "Is this a dog?" for each one. Eventually, they learn to *recognize* a dog.
*   **Generative AI:** You show them 1,000 photos of dogs. After studying them all, you give them a blank piece of paper and say, "Now, *draw a new dog* that has never existed before."

**Generative AI** is the system that can draw that new dog. It learns the underlying patterns, features, and "essence" of the data it's trained on, and then uses that knowledge to **create new, original content**.

---

### The Formal Definition

**Generative AI** is a branch of artificial intelligence that can generate novel content, including text, images, audio, code, and synthetic data. Unlike traditional AI models that are design

In [35]:
# Example 2 - direct-mode follow-up that relies on the previous turn

print("Example 2: Direct LLM Follow-up")

result00 = chat("How it was different from traditional machine learning?", mode="direct")

# Write the answer on the "Response:" line in one call, followed by a blank line.
print("\n Response: ", *result00["response"], "\n", sep="", flush=True)

Example 2: Direct LLM Follow-up

 Response: Of course. This is a crucial distinction that highlights why Generative AI is such a significant leap forward. We've touched on this before, and it's a great concept to solidify.

The fundamental difference lies in their **primary purpose and output**.

*   **Traditional Machine Learning** is primarily **discriminative**. Its goal is to **analyze existing data to make a prediction or classification**. It learns to *distinguish* between different categories.
*   **Generative AI** is, as its name implies, **generative**. Its goal is to **learn from existing data to create new, original data** that has similar characteristics.

Let's use a simple analogy:

*   **Traditional ML is like a food critic.** A critic tastes a dish (input data) and gives a verdict (output), such as "This is a high-quality pasta" (classification) or "I rate this 8/10" (prediction). The critic can judge, but they can't create a new dish from scratch.
*   **Generative AI i

In [36]:
# Example 3: first question answered through the RAG node
print("Example 3: RAG Mode (Multiple PDFs)")

result1 = chat("What is Low rank adaptation?", mode="rag")

# Answer text goes directly after the label, with no trailing newline here;
# the header below supplies the line breaks.
print("\n Response: ", *result1["response"], sep="", end="", flush=True)

print("\n\n--- Source Documents ---")
# One entry per retrieved chunk: page, file and the first 250 characters.
for idx, doc in enumerate(result1["source_documents"], start=1):
    print(
        f"\n Source {idx}:\n"
        f"   Page: {doc.metadata.get('page', 'N/A')}\n"
        f"   File: {doc.metadata.get('source', 'N/A')}\n"
        f"   Content: {doc.page_content[:250]}..."
    )

Example 3: RAG Mode (Multiple PDFs)
Loading existing FAISS index...

 Response: Based on the provided context, here is a specific and helpful explanation of Low-Rank Adaptation.

**Low-Rank Adaptation (LoRA)** is a technique designed to efficiently fine-tune Large Language Models (LLMs). Its primary goal is to adapt a pre-trained model to a new task while significantly reducing memory usage and computational cost.

According to the document, LoRA is a **parameter-efficient fine-tuning (PEFT)** method. Here’s how it works:

1.  **Freezes the Original Model:** Instead of retraining all the billions of parameters in a Large Language Model (which is incredibly memory-intensive), LoRA freezes the original model.

2.  **Adds Small, Trainable Layers:** It then introduces a small number of new, trainable parameters by adding **"low-rank matrix adaptations"** to the model's existing layers. These new layers are much smaller than the original ones.

3.  **Efficient Training:** Only these small, 

In [37]:
# Example 4: RAG follow-up; earlier turns are sent along as conversation history

print("Example 4: Follow-up Question")

result2 = chat("How it was different from Quantized LORA?", mode="rag")

# Write the answer on the "Response:" line in one call, followed by a blank line.
print("\n Response: ", *result2["response"], "\n", sep="", flush=True)

print("\n--- Source Documents ---")
# One entry per retrieved chunk: page, file and the first 250 characters.
for idx, doc in enumerate(result2["source_documents"], start=1):
    print(
        f"\n Source {idx}:\n"
        f"   Page: {doc.metadata.get('page', 'N/A')}\n"
        f"   File: {doc.metadata.get('source', 'N/A')}\n"
        f"   Content: {doc.page_content[:250]}..."
    )

Example 4: Follow-up Question
Loading existing FAISS index...

 Response: Excellent question. Based on the provided context, the key difference is that **QLoRA is an enhanced version of LoRA that adds quantization to save even more memory.**

Think of it this way:

*   **LoRA** is a clever technique to avoid training the whole model.
*   **QLoRA** takes that clever technique and adds another layer of optimization on top of it.

Here is a more detailed breakdown based on the documents:

### The Core Difference: Adding Quantization

1.  **LoRA (Low-Rank Adaptation):**
    *   **What it does:** It freezes the massive pre-trained model and adds small, trainable "adapter" layers.
    *   **Its Goal:** To fine-tune the model for a new task without the huge memory cost of training all the original parameters. It focuses on making the *training process* more efficient.

2.  **QLoRA (Quantized Low-Rank Adaptation):**
    *   **What it does:** It does everything LoRA does, **PLUS** it compresses

In [38]:
# Example 5: direct-mode question that can only be answered from the stored history

print("Example 5: Follow-up Question")

result3 = chat("What was the last question I asked you?", mode="direct")

# Write the answer on the "Response:" line in one call, followed by a blank line.
print("\n Response: ", *result3["response"], "\n", sep="", flush=True)

print("\n--- Source Documents ---")
# The direct node returns no source documents, so normally only the header prints.
for idx, doc in enumerate(result3["source_documents"], start=1):
    print(
        f"\n Source {idx}:\n"
        f"   Page: {doc.metadata.get('page', 'N/A')}\n"
        f"   File: {doc.metadata.get('source', 'N/A')}\n"
        f"   Content: {doc.page_content[:250]}..."
    )

Example 5: Follow-up Question

 Response: The last question you asked me was:

**"How it was different from Quantized LORA?"**


--- Source Documents ---
