# Gemini RAG Knowledge Engine
### A Full-Stack Retrieval-Augmented Generation (RAG) Application

**Author:** Karthik K
**Tech Stack:** Google Gemini 2.5 Flash, LangChain, ChromaDB, Sentence-Transformers, Streamlit

**Project Description:**
This notebook builds an end-to-end RAG pipeline. It ingests custom PDF/TXT documents, splits them into chunks, embeds the chunks into a vector database, and uses the Gemini 2.5 Flash model to answer user queries based specifically on that data. The final output is a Streamlit web application served through an ngrok tunnel.

## **Environment Setup**
Installing the necessary libraries for the RAG pipeline.
* `langchain`: Orchestration framework.
* `chromadb`: Vector database for storing document embeddings.
* `sentence-transformers`: Library that provides the open-source embedding model (`all-MiniLM-L6-v2`).
* `google-generativeai`: Google SDK for the Gemini API.

In [None]:
# Vector database plus the library that supplies the local embedding model.
!pip install chromadb sentence-transformers

In [None]:
# LangChain's Gemini integration together with Google's Gemini SDK (-U upgrades if already present).
!pip install -U langchain-google-genai google-generativeai

In [None]:
# NOTE(review): google-generativeai was already requested by the previous cell; this repeat looks redundant.
!pip install google-generativeai

In [None]:
# NOTE(review): google-genai is a different package name from google-generativeai;
# none of the cells shown here import it directly — confirm it is needed.
!pip install google-genai

In [None]:
# NOTE(review): same package as an earlier cell, and the only install line without a leading `!`;
# presumably relies on IPython treating `pip` as a magic — confirm.
pip install -U langchain-google-genai

In [None]:
# Web UI framework used by the generated app.py.
!pip install streamlit

In [None]:
# NOTE(review): localtunnel is installed globally via npm, but the launch cell shown in this
# notebook opens its tunnel with pyngrok — confirm this is still required.
!npm install -g localtunnel

In [None]:
# Python wrapper around ngrok, used to expose the Streamlit port (-q keeps pip quiet).
!pip install -q pyngrok

# LangChain Setup

In [None]:
# Core LangChain package (-U upgrades to the newest release).
!pip install -U langchain

In [None]:
# NOTE(review): repeats the langchain upgrade above and a langchain-google-genai install from an earlier cell.
!pip install -U langchain langchain-google-genai

In [None]:
# Community integrations: the document loaders and the Chroma vector store wrapper imported below.
!pip install langchain_community

In [None]:
# Hugging Face embeddings integration (imported by app.py); langchain is requested once more here.
!pip install -qU langchain langchain-huggingface

In [None]:
# PDF parsing backend; PyPDFLoader is used for uploaded .pdf files.
!pip install pypdf

# Necessary Imports

In [None]:
# Chains
from langchain_classic.chains import RetrievalQA
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory.buffer import ConversationBufferMemory

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


import os
from google.colab import userdata


from langchain_google_genai import ChatGoogleGenerativeAI

## **The Main Application Logic**
The cells below contain the core logic for the application. They handle:
1.  **Authentication:** Loading API keys securely.
2.  **Ingestion:** Loading text/PDF documents from the data directory.
3.  **Indexing:** Splitting text into chunks and creating vector embeddings.
4.  **Retrieval Chain:** Connecting the Gemini LLM to the Vector Store.
5.  **Testing:** Running a sample query to verify the pipeline works.

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

# Pull the Gemini key out of Colab's secret store so it never appears in the notebook itself.
GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')

# Chat model shared by the cells that follow.
llm = ChatGoogleGenerativeAI(
    api_key=GOOGLE_API_KEY,
    model="gemini-2.5-flash",
    temperature=0,      # least-random sampling setting
    max_retries=2,
    max_tokens=None,    # no explicit output cap is passed
    timeout=None,       # no explicit request timeout is passed
)

In [None]:
# Smoke test for the model connection: one system turn and one human turn, no retrieval involved.
messages = [
    ("system", "You are a helpful assistant that translates English to French. Translate the user sentence."),
    ("human", "I love programming."),
]

ai_msg = llm.invoke(messages)

# Leaving the bare name as the last expression makes the notebook display the reply.
ai_msg

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Move into the project folder; later cells use paths relative to it (./data, ./chroma_db).
# `%cd` is an IPython magic, so this line only works inside a notebook.
%cd /content/drive/My Drive/RAG-Chatbot-Project/

# Verification: show the working directory we ended up in and what it contains.
print("Current folder:", os.getcwd())
print("Files in here:", os.listdir())

In [None]:
# Folder holding the source documents, relative to the current working directory.
DATA_PATH = './data'

# Collect the files matching *.txt in DATA_PATH, reading each with TextLoader.
# (PDF files in the folder are not matched by this pattern.)
loader = DirectoryLoader(
    path=DATA_PATH,
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

print(f"Loaded {len(documents)} document(s).")

In [None]:
# Cut the loaded documents into pieces of at most 500 characters,
# letting neighbouring pieces share 50 characters of context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
chunks = text_splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks.")

In [None]:
# Sentence-transformers checkpoint used to turn each chunk into a vector.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Directory handed to Chroma for on-disk storage.
persist_directory = './chroma_db'

# Embed every chunk and store the result in a Chroma collection.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory,
)

print("Success: Vector store created.")

In [None]:
import os
from google.colab import files
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory import ConversationBufferMemory

# 1. Upload a file
print("Please upload a PDF or Text file:")
uploaded = files.upload()

# 2. Process the file
if uploaded:
    # The splitter settings do not depend on the file, so build it once for all uploads.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    for filename, file_bytes in uploaded.items():
        print(f"\nProcessing {filename}...")

        # Save file temporarily
        file_path = f"./{filename}"
        with open(file_path, "wb") as f:
            f.write(file_bytes)

        # Select loader. Compare in lower case so "REPORT.PDF" is still parsed as a PDF
        # instead of being read as plain text.
        if filename.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        new_docs = loader.load()
        print(f"Loaded {len(new_docs)} pages/documents.")

        # Split text
        new_chunks = text_splitter.split_documents(new_docs)
        print(f"Split into {len(new_chunks)} chunks.")

        # Nothing to index (for example an empty file): skip instead of
        # handing the vector store an empty batch.
        if not new_chunks:
            print(f"No text found in {filename}; nothing was added.")
            continue

        # 3. Add to Database
        vectorstore.add_documents(new_chunks)
        print(f" Successfully added {filename} to the database!")

    print("Refreshing Chatbot Brain...")

    # Define Memory. output_key='answer' tells the memory which of the chain's
    # outputs to record, since the chain also returns source documents.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )

    # Build the Conversational Chain on top of the (possibly extended) vector store;
    # the retriever fetches the 5 best-matching chunks per question.
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        memory=memory,
        return_source_documents=True,
        verbose=False
    )

    print("Chatbot is updated and ready for questions!")
else:
    print("No file uploaded.")

In [None]:
# Question 1: Initial Context
q1 = "What is the Transformer?"
print(f"👤 User: {q1}")
result1 = qa_chain.invoke({"question": q1})
print(f"🤖 Bot: {result1['answer']}\n")

# Question 2: Follow-up (Using "It")
# The bot must know that "It" refers to the Transformer from Q1;
# this only works because the chain carries conversation memory.
q2 = "Does it use recurrent layers?"
print(f"👤 User: {q2}")
result2 = qa_chain.invoke({"question": q2})
print(f"🤖 Bot: {result2['answer']}")

# --- Cite Sources (The Professional Touch) ---
# Lists the chunks retrieved for the second question.
print("\n--- 📄 Citations ---")
for doc in result2['source_documents']:
    # Get source name and page number if available
    source_name = doc.metadata.get('source', 'Unknown file')
    page_num = doc.metadata.get('page')
    # Sources without page metadata get no page suffix
    # (previously this printed "Page Unknown page").
    page_suffix = f" (Page {page_num})" if page_num is not None else ""
    print(f"- Found in: {source_name}{page_suffix}")

In [None]:
%%writefile app.py
import streamlit as st
import os
import tempfile
from google.colab import userdata

# --- Imports ---
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory import ConversationBufferMemory
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Page Config ---
st.set_page_config(page_title="Gemini RAG Brain", page_icon="üß†", layout="wide")
st.title("üß† Gemini RAG: The Conversational Knowledge Engine")

# --- Initialize Session State ---
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None
if "messages" not in st.session_state:
    st.session_state.messages = []
if "memory" not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )

# --- Sidebar: API Key & Upload ---
with st.sidebar:
    st.header("‚öôÔ∏è Configuration")

    # 1. Try to get key from Environment (Passed from Colab)
    if "GEMINI_API_KEY" in os.environ:
        os.environ["GOOGLE_API_KEY"] = os.environ["GEMINI_API_KEY"]
        st.success("‚úÖ API Key Loaded (from Env)")

    # 2. Try to get key from Secrets (Direct Access)
    elif 'GEMINI_API_KEY' in userdata.keys():
        os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI_API_KEY')
        st.success("‚úÖ API Key Loaded (from Secrets)")

    else:
        st.error("‚ö†Ô∏è API Key missing! Add it to Colab Secrets.")

    st.divider()
    st.header("üìÇ Document Management")
    uploaded_files = st.file_uploader("Upload New Documents", type=["pdf", "txt"], accept_multiple_files=True)
    process_btn = st.button("Save & Process Documents")

# --- Processing Logic ---
if process_btn and uploaded_files:
    with st.spinner("Saving to Drive and Processing..."):
        all_documents = []
        DATA_FOLDER = './data'
        os.makedirs(DATA_FOLDER, exist_ok=True)

        for uploaded_file in uploaded_files:
            file_path = os.path.join(DATA_FOLDER, uploaded_file.name)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getvalue())

            try:
                if uploaded_file.name.endswith(".pdf"):
                    loader = PyPDFLoader(file_path)
                else:
                    loader = TextLoader(file_path)
                all_documents.extend(loader.load())
            except Exception as e:
                st.error(f"Error loading {uploaded_file.name}: {e}")

        if all_documents:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
            chunks = text_splitter.split_documents(all_documents)

            model_name = "sentence-transformers/all-MiniLM-L6-v2"
            embedding_model = HuggingFaceEmbeddings(model_name=model_name)

            persist_directory = './chroma_db_app'

            if st.session_state.vectorstore is None:
                st.session_state.vectorstore = Chroma.from_documents(
                    chunks,
                    embedding_model,
                    persist_directory=persist_directory
                )
            else:
                st.session_state.vectorstore.add_documents(chunks)

            st.success(f"‚úÖ Successfully processed {len(chunks)} new chunks!")
        else:
            st.warning("No valid documents found.")

# --- Chat Logic ---
if st.session_state.vectorstore:
    try:
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=st.session_state.memory,
            return_source_documents=True,
            verbose=False
        )

        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        if prompt := st.chat_input("Ask a question..."):
            st.chat_message("user").markdown(prompt)
            st.session_state.messages.append({"role": "user", "content": prompt})

            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    response = qa_chain.invoke({"question": prompt})
                    answer = response['answer']
                    st.markdown(answer)

                    with st.expander("üìö View Sources"):
                        for doc in response['source_documents']:
                            source = doc.metadata.get('source', 'Unknown')
                            st.caption(f"üìÑ **Source:** {os.path.basename(source)}")
                            st.text(doc.page_content[:200] + "...")

                    st.session_state.messages.append({"role": "assistant", "content": answer})
    except Exception as e:
        st.error(f"Error: {e}. Check your API Key.")

else:
    st.info("üëà Upload a document to start chatting!")

In [None]:
import os
import subprocess

from google.colab import userdata
from pyngrok import ngrok

# Port the Streamlit server listens on and the tunnel forwards to.
STREAMLIT_PORT = 8501

# Authenticate
# The ngrok token is a credential, so it is read from Colab Secrets (stored under
# the name NGROK_AUTH_TOKEN) rather than written into the notebook. A token that
# has ever been saved in plain text in a notebook should be revoked and replaced.
ngrok.set_auth_token(userdata.get('NGROK_AUTH_TOKEN'))

# 3. Run Streamlit in the background
# app.py runs in its own process, so pass the Gemini key to it through the
# environment, where app.py looks for GEMINI_API_KEY.
app_env = dict(os.environ, GEMINI_API_KEY=userdata.get('GEMINI_API_KEY'))
subprocess.Popen(
    [
        "streamlit", "run", "app.py",
        "--server.address=0.0.0.0",
        f"--server.port={STREAMLIT_PORT}",   # pin the port the tunnel targets
        "--server.headless=true",            # no browser / interactive prompt on a server
    ],
    env=app_env,
)

# 4. Open the tunnel to the outside world
public_url = ngrok.connect(STREAMLIT_PORT).public_url
print(f"🚀 Your Stable App Link: {public_url}")

In [None]:
%%writefile requirements.txt
streamlit
langchain
# Provides the langchain_classic.* modules (chains, memory) imported by app.py
langchain-classic
langchain-community
langchain-google-genai
langchain-text-splitters
langchain-huggingface
chromadb
sentence-transformers
google-generativeai
pypdf
pyngrok