<a href="https://colab.research.google.com/github/kanchanraiii/SecureRag/blob/master/Faiss_gemini_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (quietly, upgrading to the latest release) LangChain with its
# "google-genai" extra, which provides the Gemini integration used below.
!pip install -qU "langchain[google-genai]"

In [None]:
# Install the third-party packages imported in this cell: the LangChain
# Gemini integration, scikit-learn (cosine similarity) and spaCy (NER).
!pip install -qU langchain-google-genai scikit-learn spacy
# Download the small English spaCy model that is later loaded with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

# Import necessary libraries
import os
import getpass
import re
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

# Ask for the Gemini API key interactively, but only when the environment
# does not already carry a non-empty GOOGLE_API_KEY.
api_key_present = bool(os.environ.get("GOOGLE_API_KEY"))
if not api_key_present:
    entered_key = getpass.getpass("Enter API key for Google Gemini: ")
    os.environ["GOOGLE_API_KEY"] = entered_key

print("✅ All libraries are installed and the API key is set.")

In [None]:
# Install FAISS (CPU build) and sentence-transformers.
# NOTE(review): neither package is imported in the cells visible here --
# confirm they are still needed before keeping this install step.
!pip install faiss-cpu sentence-transformers


In [None]:
# Open Colab's file-upload dialog so the dataset can be added to the runtime.
# NOTE: google.colab is only importable inside a Google Colab runtime.
from google.colab import files
# `uploaded` is not read by the visible cells; the loading cell opens the file
# by name instead (presumably Colab saves uploads to the working directory --
# TODO confirm).
uploaded = files.upload()

In [None]:
# Text splitter used to cut the raw document into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Name of the uploaded document; `chunks` stays empty if loading fails
file_path = "finance_dataset.jsonl"
chunks = []

# Sizes are measured with len(), i.e. in characters
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=1000,
)

try:
    with open(file_path, mode='r', encoding='utf-8') as source_file:
        document_text = source_file.read()
    chunks = text_splitter.split_text(document_text)
except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found.")
    print("Please make sure you have uploaded the file and the name is correct.")
else:
    print(f"✅ Successfully loaded and chunked '{file_path}'. Found {len(chunks)} chunks.")
    # Show the first chunk as a quick sanity check of the split
    if chunks:
        print("\n--- Sample Chunk 1 ---")
        print(chunks[0])

In [None]:
class RAGPipeline:
    """Retrieval-augmented generation pipeline with PII filters on input and output.

    The pipeline embeds text chunks into an in-memory store, retrieves the
    chunks most similar to a query, asks the chat model to answer using that
    context, and redacts sensitive entities from the answer.

    Attributes:
        embedding_model: Embedding client used for both chunks and queries.
        llm: Chat model that generates the answer.
        vector_store: Maps a chunk index to ``{"text": str, "embedding": ndarray}``
            where the embedding has shape ``(1, dim)``.
        nlp: spaCy pipeline used for named-entity recognition.
        REGEX_PATTERNS: Maps a PII type name to the regex that detects it.
    """

    # spaCy entity labels that both filters treat as sensitive.
    SENSITIVE_ENTITY_LABELS = ("PERSON", "GPE", "LOC", "ORG")

    def __init__(self, embedding_model_name: str, llm_name: str):
        """Create the model clients, the empty vector store and the PII filters.

        Args:
            embedding_model_name: Model name passed to ``GoogleGenerativeAIEmbeddings``.
            llm_name: Model name passed to ``ChatGoogleGenerativeAI``.
        """
        print("--- Initializing Models and Filters ---")
        self.embedding_model = GoogleGenerativeAIEmbeddings(model=embedding_model_name)
        self.llm = ChatGoogleGenerativeAI(model=llm_name)
        self.vector_store = {}

        self.nlp = spacy.load("en_core_web_sm")
        # Regex for PII detection
        self.REGEX_PATTERNS = {
            "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
            "PHONE": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
            "CREDIT_CARD": r"\b(?:\d[ -]*?){13,16}\b",
            "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
        }
        print("--- Models and Filters Initialized ---")

    def _input_filter(self, query: str, threshold: float = 0.5) -> tuple[bool, str]:
        """Decide whether a query is too dense in sensitive information.

        Args:
            query: The user's raw query text.
            threshold: Maximum allowed ratio of PII hits (sensitive entities
                plus regex matches) to whitespace-separated words.

        Returns:
            ``(blocked, message)``; ``blocked`` is True when the ratio exceeds
            ``threshold``.
        """
        word_count = len(query.split())
        if word_count == 0:
            # Nothing to analyse; an empty query cannot exceed the threshold.
            return False, "Query is safe."

        doc = self.nlp(query)
        pii_count = sum(
            1 for ent in doc.ents if ent.label_ in self.SENSITIVE_ENTITY_LABELS
        )
        pii_count += sum(
            len(re.findall(pattern, query)) for pattern in self.REGEX_PATTERNS.values()
        )

        if pii_count / word_count > threshold:
            return True, "Query blocked due to high concentration of sensitive information."
        return False, "Query is safe."

    def _output_filter(self, response: str) -> str:
        """Redact sensitive entities and regex-detected PII from a response.

        Args:
            response: Text produced by the language model.

        Returns:
            The text with each sensitive entity replaced by ``[LABEL]`` and
            each regex match replaced by ``[PII_TYPE]``.
        """
        doc = self.nlp(response)

        # Entity offsets refer to the ORIGINAL text, so the result is assembled
        # from slices of the original instead of editing a buffer in place:
        # a placeholder rarely has the same length as the span it replaces,
        # and in-place edits would shift every later offset.
        pieces = []
        cursor = 0
        for ent in sorted(doc.ents, key=lambda e: e.start_char):
            if ent.label_ not in self.SENSITIVE_ENTITY_LABELS:
                continue
            if ent.start_char < cursor:
                # Overlaps a span that was already redacted.
                continue
            pieces.append(response[cursor:ent.start_char])
            pieces.append(f"[{ent.label_}]")
            cursor = ent.end_char
        pieces.append(response[cursor:])
        redacted_text = "".join(pieces)

        for pii_type, pattern in self.REGEX_PATTERNS.items():
            redacted_text = re.sub(pattern, f"[{pii_type}]", redacted_text)
        return redacted_text

    def build_vector_store(self, text_chunks: list, batch_size: int = 100):
        """Embed the chunks in batches and store them keyed by chunk index.

        Args:
            text_chunks: Chunk texts to embed.
            batch_size: Number of chunks sent to the embedding model per call.
        """
        total_chunks = len(text_chunks)
        print(f"\n--- Building Vector Store for {total_chunks} chunks ---")
        for i in range(0, total_chunks, batch_size):
            batch_chunks = text_chunks[i:i + batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch_chunks)
            for j, (chunk, embedding) in enumerate(zip(batch_chunks, batch_embeddings)):
                # Stored as a (1, dim) row so it can be compared or stacked directly.
                self.vector_store[i + j] = {
                    "text": chunk,
                    "embedding": np.array(embedding).reshape(1, -1),
                }
            print(f"Processed {min(i + batch_size, total_chunks)} / {total_chunks} chunks...")
        print("--- Vector Store Created ---\n")

    def _get_single_embedding(self, text: str) -> np.ndarray:
        """Embed a single query string and return it as a ``(1, dim)`` array."""
        embedding = self.embedding_model.embed_query(text)
        return np.array(embedding).reshape(1, -1)

    def _retrieve(self, query_embedding: np.ndarray, top_k: int) -> list:
        """Return the texts of the ``top_k`` stored chunks most similar to the query."""
        if not self.vector_store or top_k <= 0:
            return []
        entries = list(self.vector_store.values())
        # One (n_chunks, dim) matrix lets cosine_similarity score every chunk
        # in a single call instead of one call per chunk.
        matrix = np.vstack([entry["embedding"] for entry in entries])
        scores = cosine_similarity(query_embedding, matrix)[0]
        ranked = sorted(zip(scores, entries), key=lambda pair: pair[0], reverse=True)
        return [entry["text"] for _, entry in ranked[:top_k]]

    def run_query(self, query: str, top_k: int = 2):
        """Answer a query from the stored chunks and print each pipeline stage.

        Args:
            query: The user's question.
            top_k: Number of most-similar chunks to pass to the model as context.

        Returns:
            The redacted answer, or ``None`` when the input filter blocks the query.
        """
        print(f"Processing query: '{query}'")
        print("="*30)

        # Call the internal input filter
        is_sensitive, message = self._input_filter(query)
        if is_sensitive:
            print(f"Input Filter Action: {message}")
            return None

        query_embedding = self._get_single_embedding(query)
        context = self._retrieve(query_embedding, top_k)

        print("\n--- Retrieved Context ---")
        for c in context:
            print(f"- {c[:150]}...")
        print("-" * 25, "\n")

        context_str = "\n".join(context)
        prompt = f"Context:\n{context_str}\n\nQuery: {query}\n\nAnswer:"
        result = self.llm.invoke(prompt)

        print(f"\nOriginal Generated Response: '{result.content}'")
        # Call the internal output filter
        safe_response = self._output_filter(result.content)
        print(f"Final (Redacted) Response: '{safe_response}'\n")
        return safe_response

print("✅ RAG Pipeline class is defined with integrated filters.")

In [None]:
# The demo only makes sense when the document-loading cell produced chunks
chunks_ready = 'chunks' in locals() and bool(chunks)

if not chunks_ready:
    print("❌ Cannot run the pipeline because no chunks were loaded.")
else:
    # Step 1: create the pipeline with the embedding model and the chat model
    pipeline = RAGPipeline(
        llm_name="gemini-1.5-flash",
        embedding_model_name="models/text-embedding-004",
    )

    # Step 2: embed the loaded chunks into the in-memory vector store
    pipeline.build_vector_store(chunks)

    # Step 3: run a sample query; replace it with any question about your document
    test_query = "Who is Teerth Badal?"
    pipeline.run_query(test_query)